From beaa1ffe551c330d8ea23de158432ecaad6c0410 Mon Sep 17 00:00:00 2001 From: Yunying Sun Date: Wed, 16 Nov 2022 16:22:21 +0800 Subject: [PATCH 01/30] clocksource: Print clocksource name when clocksource is tested unstable Some "TSC fall back to HPET" messages appear on systems having more than 2 NUMA nodes: clocksource: timekeeping watchdog on CPU168: hpet read-back delay of 4296200ns, attempt 4, marking unstable The "hpet" here is misleading the clocksource watchdog is really doing repeated reads of "hpet" in order to check for unrelated delays. Therefore, print the name of the clocksource under test, prefixed by "wd-" and suffixed by "-wd", for example, "wd-tsc-wd". Signed-off-by: Yunying Sun Signed-off-by: Paul E. McKenney --- kernel/time/clocksource.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 9cf32ccda715..4a2c3bb92e2e 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -257,8 +257,8 @@ static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, goto skip_test; } - pr_warn("timekeeping watchdog on CPU%d: %s read-back delay of %lldns, attempt %d, marking unstable\n", - smp_processor_id(), watchdog->name, wd_delay, nretries); + pr_warn("timekeeping watchdog on CPU%d: wd-%s-wd read-back delay of %lldns, attempt %d, marking unstable\n", + smp_processor_id(), cs->name, wd_delay, nretries); return WD_READ_UNSTABLE; skip_test: From c37e85c135cead4256dc8860073c468d8925c3df Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 6 Dec 2022 19:36:10 -0800 Subject: [PATCH 02/30] clocksource: Loosen clocksource watchdog constraints Currently, MAX_SKEW_USEC is set to 100 microseconds, which has worked reasonably well. However, NTP is willing to tolerate 500 microseconds of skew per second, and a clocksource that is good enough for NTP should be good enough for the clocksource watchdog. The watchdog's skew is controlled by MAX_SKEW_USEC and the CLOCKSOURCE_WATCHDOG_MAX_SKEW_US Kconfig option. However, these values are doubled before being associated with a clocksource's ->uncertainty_margin, and the ->uncertainty_margin values of the pair of clocksource's being compared are summed before checking against the skew. Therefore, set both MAX_SKEW_USEC and the default for the CLOCKSOURCE_WATCHDOG_MAX_SKEW_US Kconfig option to 125 microseconds of skew per second, resulting in 500 microseconds of skew per second in the clocksource watchdog's skew comparison. Suggested-by Rik van Riel Signed-off-by: Paul E. McKenney --- kernel/time/Kconfig | 6 +++++- kernel/time/clocksource.c | 15 +++++++++------ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index a41753be1a2b..bae8f11070be 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -200,10 +200,14 @@ config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US int "Clocksource watchdog maximum allowable skew (in μs)" depends on CLOCKSOURCE_WATCHDOG range 50 1000 - default 100 + default 125 help Specify the maximum amount of allowable watchdog skew in microseconds before reporting the clocksource to be unstable. + The default is based on a half-second clocksource watchdog + interval and NTP's maximum frequency drift of 500 parts + per million. If the clocksource is good enough for NTP, + it is good enough for the clocksource watchdog! endmenu endif diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 4a2c3bb92e2e..a3d19f6660ac 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -95,6 +95,11 @@ static char override_name[CS_NAME_LEN]; static int finished_booting; static u64 suspend_start; +/* + * Interval: 0.5sec. + */ +#define WATCHDOG_INTERVAL (HZ >> 1) + /* * Threshold: 0.0312s, when doubled: 0.0625s. * Also a default for cs->uncertainty_margin when registering clocks. @@ -106,11 +111,14 @@ static u64 suspend_start; * clocksource surrounding a read of the clocksource being validated. * This delay could be due to SMIs, NMIs, or to VCPU preemptions. Used as * a lower bound for cs->uncertainty_margin values when registering clocks. + * + * The default of 500 parts per million is based on NTP's limits. + * If a clocksource is good enough for NTP, it is good enough for us! */ #ifdef CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US #define MAX_SKEW_USEC CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US #else -#define MAX_SKEW_USEC 100 +#define MAX_SKEW_USEC (125 * WATCHDOG_INTERVAL / HZ) #endif #define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC) @@ -140,11 +148,6 @@ static inline void clocksource_watchdog_unlock(unsigned long *flags) static int clocksource_watchdog_kthread(void *data); static void __clocksource_change_rating(struct clocksource *cs, int rating); -/* - * Interval: 0.5sec. - */ -#define WATCHDOG_INTERVAL (HZ >> 1) - static void clocksource_watchdog_work(struct work_struct *work) { /* From f092eb34b33043152bfb8a4ca01db9a06728261d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 13 Dec 2022 13:57:28 -0800 Subject: [PATCH 03/30] clocksource: Improve read-back-delay message When cs_watchdog_read() is unable to get a qualifying clocksource read within the limit set by max_cswd_read_retries, it prints a message and marks the clocksource under test as unstable. But that message is unclear to anyone unfamiliar with the code: clocksource: timekeeping watchdog on CPU13: wd-tsc-wd read-back delay 1000614ns, attempt 3, marking unstable Therefore, add some context so that the message appears as follows: clocksource: timekeeping watchdog on CPU13: wd-tsc-wd excessive read-back delay of 1000614ns vs. limit of 125000ns, wd-wd read-back delay only 27ns, attempt 3, marking tsc unstable Signed-off-by: Paul E. McKenney Cc: John Stultz Cc: Thomas Gleixner Cc: Stephen Boyd Cc: Feng Tang --- kernel/time/clocksource.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index a3d19f6660ac..b59914953809 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -260,8 +260,8 @@ static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, goto skip_test; } - pr_warn("timekeeping watchdog on CPU%d: wd-%s-wd read-back delay of %lldns, attempt %d, marking unstable\n", - smp_processor_id(), cs->name, wd_delay, nretries); + pr_warn("timekeeping watchdog on CPU%d: wd-%s-wd excessive read-back delay of %lldns vs. limit of %ldns, wd-wd read-back delay only %lldns, attempt %d, marking %s unstable\n", + smp_processor_id(), cs->name, wd_delay, WATCHDOG_MAX_SKEW, wd_seq_delay, nretries, cs->name); return WD_READ_UNSTABLE; skip_test: From dd029269947a32047b8ce1f8513b0b3b13f0df32 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 13 Dec 2022 16:42:15 -0800 Subject: [PATCH 04/30] clocksource: Improve "skew is too large" messages When clocksource_watchdog() detects excessive clocksource skew compared to the watchdog clocksource, it marks the clocksource under test as unstable and prints several lines worth of message. But that message is unclear to anyone unfamiliar with the code: clocksource: timekeeping watchdog on CPU2: Marking clocksource 'wdtest-ktime' as unstable because the skew is too large: clocksource: 'kvm-clock' wd_nsec: 400744390 wd_now: 612625c2c wd_last: 5fa7f7c66 mask: ffffffffffffffff clocksource: 'wdtest-ktime' cs_nsec: 600744034 cs_now: 173081397a292d4f cs_last: 17308139565a8ced mask: ffffffffffffffff clocksource: 'kvm-clock' (not 'wdtest-ktime') is current clocksource. Therefore, add the following line near the end of that message: Clocksource 'wdtest-ktime' skewed 199999644 ns (199 ms) over watchdog 'kvm-clock' interval of 400744390 ns (400 ms) This new line clearly indicates the amount of skew between the two clocksources, along with the duration of the time interval over which the skew occurred, both in nanoseconds and milliseconds. Signed-off-by: Paul E. McKenney Cc: John Stultz Cc: Thomas Gleixner Cc: Stephen Boyd Cc: Feng Tang --- kernel/time/clocksource.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index b59914953809..fc486cd97263 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -446,12 +446,20 @@ static void clocksource_watchdog(struct timer_list *unused) /* Check the deviation from the watchdog clocksource. */ md = cs->uncertainty_margin + watchdog->uncertainty_margin; if (abs(cs_nsec - wd_nsec) > md) { + u64 cs_wd_msec; + u64 wd_msec; + u32 wd_rem; + pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n", smp_processor_id(), cs->name); pr_warn(" '%s' wd_nsec: %lld wd_now: %llx wd_last: %llx mask: %llx\n", watchdog->name, wd_nsec, wdnow, wdlast, watchdog->mask); pr_warn(" '%s' cs_nsec: %lld cs_now: %llx cs_last: %llx mask: %llx\n", cs->name, cs_nsec, csnow, cslast, cs->mask); + cs_wd_msec = div_u64_rem(cs_nsec - wd_nsec, 1000U * 1000U, &wd_rem); + wd_msec = div_u64_rem(wd_nsec, 1000U * 1000U, &wd_rem); + pr_warn(" Clocksource '%s' skewed %lld ns (%lld ms) over watchdog '%s' interval of %lld ns (%lld ms)\n", + cs->name, cs_nsec - wd_nsec, cs_wd_msec, watchdog->name, wd_nsec, wd_msec); if (curr_clocksource == cs) pr_warn(" '%s' is current clocksource.\n", cs->name); else if (curr_clocksource) From 9f76d59173d9d146e96c66886b671c1915a5c5e5 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 5 Jan 2023 14:44:03 +0100 Subject: [PATCH 05/30] timers: Prevent union confusion from unexpected restart_syscall() The nanosleep syscalls use the restart_block mechanism, with a quirk: The `type` and `rmtp`/`compat_rmtp` fields are set up unconditionally on syscall entry, while the rest of the restart_block is only set up in the unlikely case that the syscall is actually interrupted by a signal (or pseudo-signal) that doesn't have a signal handler. If the restart_block was set up by a previous syscall (futex(..., FUTEX_WAIT, ...) or poll()) and hasn't been invalidated somehow since then, this will clobber some of the union fields used by futex_wait_restart() and do_restart_poll(). If userspace afterwards wrongly calls the restart_syscall syscall, futex_wait_restart()/do_restart_poll() will read struct fields that have been clobbered. This doesn't actually lead to anything particularly interesting because none of the union fields contain trusted kernel data, and futex(..., FUTEX_WAIT, ...) and poll() aren't syscalls where it makes much sense to apply seccomp filters to their arguments. So the current consequences are just of the "if userspace does bad stuff, it can damage itself, and that's not a problem" flavor. But still, it seems like a hazard for future developers, so invalidate the restart_block when partly setting it up in the nanosleep syscalls. Signed-off-by: Jann Horn Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20230105134403.754986-1-jannh@google.com --- kernel/time/hrtimer.c | 2 ++ kernel/time/posix-stubs.c | 2 ++ kernel/time/posix-timers.c | 2 ++ 3 files changed, 6 insertions(+) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 3ae661ab6260..e4f0e3b0c4f4 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -2126,6 +2126,7 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, if (!timespec64_valid(&tu)) return -EINVAL; + current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, @@ -2147,6 +2148,7 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, if (!timespec64_valid(&tu)) return -EINVAL; + current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 90ea5f373e50..828aeecbd1e8 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -147,6 +147,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, return -EINVAL; if (flags & TIMER_ABSTIME) rmtp = NULL; + current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; texp = timespec64_to_ktime(t); @@ -240,6 +241,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, return -EINVAL; if (flags & TIMER_ABSTIME) rmtp = NULL; + current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; texp = timespec64_to_ktime(t); diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 5dead89308b7..0c8a87a11b39 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1270,6 +1270,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, return -EINVAL; if (flags & TIMER_ABSTIME) rmtp = NULL; + current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; @@ -1297,6 +1298,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, return -EINVAL; if (flags & TIMER_ABSTIME) rmtp = NULL; + current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; From b7082cdfc464bf9231300605d03eebf943dda307 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Tue, 20 Dec 2022 16:25:12 +0800 Subject: [PATCH 06/30] clocksource: Suspend the watchdog temporarily when high read latency detected Bugs have been reported on 8 sockets x86 machines in which the TSC was wrongly disabled when the system is under heavy workload. [ 818.380354] clocksource: timekeeping watchdog on CPU336: hpet wd-wd read-back delay of 1203520ns [ 818.436160] clocksource: wd-tsc-wd read-back delay of 181880ns, clock-skew test skipped! [ 819.402962] clocksource: timekeeping watchdog on CPU338: hpet wd-wd read-back delay of 324000ns [ 819.448036] clocksource: wd-tsc-wd read-back delay of 337240ns, clock-skew test skipped! [ 819.880863] clocksource: timekeeping watchdog on CPU339: hpet read-back delay of 150280ns, attempt 3, marking unstable [ 819.936243] tsc: Marking TSC unstable due to clocksource watchdog [ 820.068173] TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'. [ 820.092382] sched_clock: Marking unstable (818769414384, 1195404998) [ 820.643627] clocksource: Checking clocksource tsc synchronization from CPU 267 to CPUs 0,4,25,70,126,430,557,564. [ 821.067990] clocksource: Switched to clocksource hpet This can be reproduced by running memory intensive 'stream' tests, or some of the stress-ng subcases such as 'ioport'. The reason for these issues is the when system is under heavy load, the read latency of the clocksources can be very high. Even lightweight TSC reads can show high latencies, and latencies are much worse for external clocksources such as HPET or the APIC PM timer. These latencies can result in false-positive clocksource-unstable determinations. These issues were initially reported by a customer running on a production system, and this problem was reproduced on several generations of Xeon servers, especially when running the stress-ng test. These Xeon servers were not production systems, but they did have the latest steppings and firmware. Given that the clocksource watchdog is a continual diagnostic check with frequency of twice a second, there is no need to rush it when the system is under heavy load. Therefore, when high clocksource read latencies are detected, suspend the watchdog timer for 5 minutes. Signed-off-by: Feng Tang Acked-by: Waiman Long Cc: John Stultz Cc: Thomas Gleixner Cc: Stephen Boyd Cc: Feng Tang Signed-off-by: Paul E. McKenney --- kernel/time/clocksource.c | 45 ++++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index fc486cd97263..91836b727cef 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -387,6 +387,15 @@ void clocksource_verify_percpu(struct clocksource *cs) } EXPORT_SYMBOL_GPL(clocksource_verify_percpu); +static inline void clocksource_reset_watchdog(void) +{ + struct clocksource *cs; + + list_for_each_entry(cs, &watchdog_list, wd_list) + cs->flags &= ~CLOCK_SOURCE_WATCHDOG; +} + + static void clocksource_watchdog(struct timer_list *unused) { u64 csnow, wdnow, cslast, wdlast, delta; @@ -394,6 +403,7 @@ static void clocksource_watchdog(struct timer_list *unused) int64_t wd_nsec, cs_nsec; struct clocksource *cs; enum wd_read_status read_ret; + unsigned long extra_wait = 0; u32 md; spin_lock(&watchdog_lock); @@ -413,13 +423,30 @@ static void clocksource_watchdog(struct timer_list *unused) read_ret = cs_watchdog_read(cs, &csnow, &wdnow); - if (read_ret != WD_READ_SUCCESS) { - if (read_ret == WD_READ_UNSTABLE) - /* Clock readout unreliable, so give it up. */ - __clocksource_unstable(cs); + if (read_ret == WD_READ_UNSTABLE) { + /* Clock readout unreliable, so give it up. */ + __clocksource_unstable(cs); continue; } + /* + * When WD_READ_SKIP is returned, it means the system is likely + * under very heavy load, where the latency of reading + * watchdog/clocksource is very big, and affect the accuracy of + * watchdog check. So give system some space and suspend the + * watchdog check for 5 minutes. + */ + if (read_ret == WD_READ_SKIP) { + /* + * As the watchdog timer will be suspended, and + * cs->last could keep unchanged for 5 minutes, reset + * the counters. + */ + clocksource_reset_watchdog(); + extra_wait = HZ * 300; + break; + } + /* Clocksource initialized ? */ if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) || atomic_read(&watchdog_reset_pending)) { @@ -523,7 +550,7 @@ static void clocksource_watchdog(struct timer_list *unused) * pair clocksource_stop_watchdog() clocksource_start_watchdog(). */ if (!timer_pending(&watchdog_timer)) { - watchdog_timer.expires += WATCHDOG_INTERVAL; + watchdog_timer.expires += WATCHDOG_INTERVAL + extra_wait; add_timer_on(&watchdog_timer, next_cpu); } out: @@ -548,14 +575,6 @@ static inline void clocksource_stop_watchdog(void) watchdog_running = 0; } -static inline void clocksource_reset_watchdog(void) -{ - struct clocksource *cs; - - list_for_each_entry(cs, &watchdog_list, wd_list) - cs->flags &= ~CLOCK_SOURCE_WATCHDOG; -} - static void clocksource_resume_watchdog(void) { atomic_inc(&watchdog_reset_pending); From c14fd3dcacaa480394d3ac0b4a91a7d17a4b5516 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 23 Jan 2023 09:32:05 -0800 Subject: [PATCH 07/30] hrtimer: Rely on rt_task() for DL tasks too Checking dl_task() is redundant as rt_task() returns true for deadline tasks too. Signed-off-by: Davidlohr Bueso Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20230123173206.6764-2-dave@stgolabs.net --- kernel/time/hrtimer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index e4f0e3b0c4f4..667b713bab42 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -2089,7 +2089,7 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, u64 slack; slack = current->timer_slack_ns; - if (dl_task(current) || rt_task(current)) + if (rt_task(current)) slack = 0; hrtimer_init_sleeper_on_stack(&t, clockid, mode); From 0c52310f260014d95c1310364379772cb74cf82d Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 23 Jan 2023 09:32:06 -0800 Subject: [PATCH 08/30] hrtimer: Ignore slack time for RT tasks in schedule_hrtimeout_range() While in theory the timer can be triggered before expires + delta, for the cases of RT tasks they really have no business giving any lenience for extra slack time, so override any passed value by the user and always use zero for schedule_hrtimeout_range() calls. Furthermore, this is similar to what the nanosleep(2) family already does with current->timer_slack_ns. Signed-off-by: Davidlohr Bueso Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20230123173206.6764-3-dave@stgolabs.net --- kernel/time/hrtimer.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 667b713bab42..e8c08292defc 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -2272,7 +2272,7 @@ void __init hrtimers_init(void) /** * schedule_hrtimeout_range_clock - sleep until timeout * @expires: timeout value (ktime_t) - * @delta: slack in expires timeout (ktime_t) + * @delta: slack in expires timeout (ktime_t) for SCHED_OTHER tasks * @mode: timer mode * @clock_id: timer clock to be used */ @@ -2299,6 +2299,13 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, return -EINTR; } + /* + * Override any slack passed by the user if under + * rt contraints. + */ + if (rt_task(current)) + delta = 0; + hrtimer_init_sleeper_on_stack(&t, clock_id, mode); hrtimer_set_expires_range_ns(&t.timer, *expires, delta); hrtimer_sleeper_start_expires(&t, mode); @@ -2318,7 +2325,7 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock); /** * schedule_hrtimeout_range - sleep until timeout * @expires: timeout value (ktime_t) - * @delta: slack in expires timeout (ktime_t) + * @delta: slack in expires timeout (ktime_t) for SCHED_OTHER tasks * @mode: timer mode * * Make the current task sleep until the given expiry time has @@ -2326,7 +2333,8 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock); * the current task state has been set (see set_current_state()). * * The @delta argument gives the kernel the freedom to schedule the - * actual wakeup to a time that is both power and performance friendly. + * actual wakeup to a time that is both power and performance friendly + * for regular (non RT/DL) tasks. * The kernel give the normal best effort behavior for "@expires+@delta", * but may decide to fire the timer earlier, but no earlier than @expires. * From cbdb1f163af2bb90d01be1f0263df1d8d5c9d9d3 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 28 Nov 2022 16:10:03 +0200 Subject: [PATCH 09/30] vdso/bits.h: Add BIT_ULL() for the sake of consistency The minimization done in 3945ff37d2f4 ("linux/bits.h: Extract common header for vDSO") was required to isolate the VDSO build from the larger kernel header impact. The split added some inconsistency since BIT() and BIT_ULL() are now defined in the different files which confuses unprepared reader. Move BIT_ULL() to vdso/bits.h. No functional change. Signed-off-by: Andy Shevchenko Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20221128141003.77929-1-andriy.shevchenko@linux.intel.com --- include/linux/bits.h | 1 - include/vdso/bits.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/bits.h b/include/linux/bits.h index 87d112650dfb..7c0cf5031abe 100644 --- a/include/linux/bits.h +++ b/include/linux/bits.h @@ -6,7 +6,6 @@ #include #include -#define BIT_ULL(nr) (ULL(1) << (nr)) #define BIT_MASK(nr) (UL(1) << ((nr) % BITS_PER_LONG)) #define BIT_WORD(nr) ((nr) / BITS_PER_LONG) #define BIT_ULL_MASK(nr) (ULL(1) << ((nr) % BITS_PER_LONG_LONG)) diff --git a/include/vdso/bits.h b/include/vdso/bits.h index 6d005a1f5d94..388b212088ea 100644 --- a/include/vdso/bits.h +++ b/include/vdso/bits.h @@ -5,5 +5,6 @@ #include #define BIT(nr) (UL(1) << (nr)) +#define BIT_ULL(nr) (ULL(1) << (nr)) #endif /* __VDSO_BITS_H */ From a7ec817d55421ac214cac9d3e5ebb65d848198dc Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Wed, 4 Jan 2023 16:19:38 +0800 Subject: [PATCH 10/30] x86/tsc: Add option to force frequency recalibration with HW timer The kernel assumes that the TSC frequency which is provided by the hardware / firmware via MSRs or CPUID(0x15) is correct after applying a few basic consistency checks. This disables the TSC recalibration against HPET or PM timer. As a result there is no mechanism to validate that frequency in cases where a firmware or hardware defect is suspected. And there was case that some user used atomic clock to measure the TSC frequency and reported an inaccuracy issue, which was later fixed in firmware. Add an option 'recalibrate' for 'tsc' kernel parameter to force the tsc freq recalibration with HPET or PM timer, and warn if the deviation from previous value is more than about 500 PPM, which provides a way to verify the data from hardware / firmware. There is no functional change to existing work flow. Recently there was a real-world case: "The 40ms/s divergence between TSC and HPET was observed on hardware that is quite recent" [1], on that platform the TSC frequence 1896 MHz was got from CPUID(0x15), and the force-reclibration with HPET/PMTIMER both calibrated out value of 1975 MHz, which also matched with check from software 'chronyd', indicating it's a problem of BIOS or firmware. [Thanks tglx for helping improving the commit log] [ paulmck: Wordsmith Kconfig help text. ] [1]. https://lore.kernel.org/lkml/20221117230910.GI4001@paulmck-ThinkPad-P17-Gen-1/ Signed-off-by: Feng Tang Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Jonathan Corbet Cc: Cc: Signed-off-by: Paul E. McKenney --- .../admin-guide/kernel-parameters.txt | 4 +++ arch/x86/kernel/tsc.c | 34 ++++++++++++++++--- 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 6cfa6e3996cf..95f0d104c232 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6369,6 +6369,10 @@ in situations with strict latency requirements (where interruptions from clocksource watchdog are not acceptable). + [x86] recalibrate: force recalibration against a HW timer + (HPET or PM timer) on systems whose TSC frequency was + obtained from HW or FW using either an MSR or CPUID(0x15). + Warn if the difference is more than 500 ppm. tsc_early_khz= [X86] Skip early TSC calibration and use the given value instead. Useful when the early TSC frequency discovery diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index a78e73da4a74..92bbc4a6b3fc 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -48,6 +48,8 @@ static DEFINE_STATIC_KEY_FALSE(__use_tsc); int tsc_clocksource_reliable; +static int __read_mostly tsc_force_recalibrate; + static u32 art_to_tsc_numerator; static u32 art_to_tsc_denominator; static u64 art_to_tsc_offset; @@ -303,6 +305,8 @@ static int __init tsc_setup(char *str) mark_tsc_unstable("boot parameter"); if (!strcmp(str, "nowatchdog")) no_tsc_watchdog = 1; + if (!strcmp(str, "recalibrate")) + tsc_force_recalibrate = 1; return 1; } @@ -1374,6 +1378,25 @@ restart: else freq = calc_pmtimer_ref(delta, ref_start, ref_stop); + /* Will hit this only if tsc_force_recalibrate has been set */ + if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) { + + /* Warn if the deviation exceeds 500 ppm */ + if (abs(tsc_khz - freq) > (tsc_khz >> 11)) { + pr_warn("Warning: TSC freq calibrated by CPUID/MSR differs from what is calibrated by HW timer, please check with vendor!!\n"); + pr_info("Previous calibrated TSC freq:\t %lu.%03lu MHz\n", + (unsigned long)tsc_khz / 1000, + (unsigned long)tsc_khz % 1000); + } + + pr_info("TSC freq recalibrated by [%s]:\t %lu.%03lu MHz\n", + hpet ? "HPET" : "PM_TIMER", + (unsigned long)freq / 1000, + (unsigned long)freq % 1000); + + return; + } + /* Make sure we're within 1% */ if (abs(tsc_khz - freq) > tsc_khz/100) goto out; @@ -1407,8 +1430,10 @@ static int __init init_tsc_clocksource(void) if (!boot_cpu_has(X86_FEATURE_TSC) || !tsc_khz) return 0; - if (tsc_unstable) - goto unreg; + if (tsc_unstable) { + clocksource_unregister(&clocksource_tsc_early); + return 0; + } if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3)) clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP; @@ -1421,9 +1446,10 @@ static int __init init_tsc_clocksource(void) if (boot_cpu_has(X86_FEATURE_ART)) art_related_clocksource = &clocksource_tsc; clocksource_register_khz(&clocksource_tsc, tsc_khz); -unreg: clocksource_unregister(&clocksource_tsc_early); - return 0; + + if (!tsc_force_recalibrate) + return 0; } schedule_delayed_work(&tsc_irqwork, 0); From efc8b329c7fdc30921a7dfc109237523e1e5b1cc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 21 Dec 2022 16:20:25 -0800 Subject: [PATCH 11/30] clocksource: Verify HPET and PMTMR when TSC unverified On systems with two or fewer sockets, when the boot CPU has CONSTANT_TSC, NONSTOP_TSC, and TSC_ADJUST, clocksource watchdog verification of the TSC is disabled. This works well much of the time, but there is the occasional production-level system that meets all of these criteria, but which still has a TSC that skews significantly from atomic-clock time. This is usually attributed to a firmware or hardware fault. Yes, the various NTP daemons do express their opinions of userspace-to-atomic-clock time skew, but they put them in various places, depending on the daemon and distro in question. It would therefore be good for the kernel to have some clue that there is a problem. The old behavior of marking the TSC unstable is a non-starter because a great many workloads simply cannot tolerate the overheads and latencies of the various non-TSC clocksources. In addition, NTP-corrected systems sometimes can tolerate significant kernel-space time skew as long as the userspace time sources are within epsilon of atomic-clock time. Therefore, when watchdog verification of TSC is disabled, enable it for HPET and PMTMR (AKA ACPI PM timer). This provides the needed in-kernel time-skew diagnostic without degrading the system's performance. Signed-off-by: Paul E. McKenney Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Daniel Lezcano Cc: Waiman Long Cc: Tested-by: Feng Tang --- arch/x86/include/asm/time.h | 1 + arch/x86/kernel/hpet.c | 2 ++ arch/x86/kernel/tsc.c | 5 +++++ drivers/clocksource/acpi_pm.c | 6 ++++-- 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/time.h b/arch/x86/include/asm/time.h index 8ac563abb567..a53961c64a56 100644 --- a/arch/x86/include/asm/time.h +++ b/arch/x86/include/asm/time.h @@ -8,6 +8,7 @@ extern void hpet_time_init(void); extern void time_init(void); extern bool pit_timer_init(void); +extern bool tsc_clocksource_watchdog_disabled(void); extern struct clock_event_device *global_clock_event; diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 71f336425e58..c8eb1ac5125a 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -1091,6 +1091,8 @@ int __init hpet_enable(void) if (!hpet_counting()) goto out_nohpet; + if (tsc_clocksource_watchdog_disabled()) + clocksource_hpet.flags |= CLOCK_SOURCE_MUST_VERIFY; clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq); if (id & HPET_ID_LEGSUP) { diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 92bbc4a6b3fc..a5371c6d4b64 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1190,6 +1190,11 @@ static void __init tsc_disable_clocksource_watchdog(void) clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; } +bool tsc_clocksource_watchdog_disabled(void) +{ + return !(clocksource_tsc.flags & CLOCK_SOURCE_MUST_VERIFY); +} + static void __init check_system_tsc_reliable(void) { #if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC) diff --git a/drivers/clocksource/acpi_pm.c b/drivers/clocksource/acpi_pm.c index 279ddff81ab4..82338773602c 100644 --- a/drivers/clocksource/acpi_pm.c +++ b/drivers/clocksource/acpi_pm.c @@ -23,6 +23,7 @@ #include #include #include +#include /* * The I/O port the PMTMR resides at. @@ -210,8 +211,9 @@ static int __init init_acpi_pm_clocksource(void) return -ENODEV; } - return clocksource_register_hz(&clocksource_acpi_pm, - PMTMR_TICKS_PER_SEC); + if (tsc_clocksource_watchdog_disabled()) + clocksource_acpi_pm.flags |= CLOCK_SOURCE_MUST_VERIFY; + return clocksource_register_hz(&clocksource_acpi_pm, PMTMR_TICKS_PER_SEC); } /* We use fs_initcall because we want the PCI fixups to have run From 915d4ad3830aa1a2dafda9b737749fb410cb9790 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 16 Jan 2023 17:53:37 +0100 Subject: [PATCH 12/30] posix-timers: Use atomic64_try_cmpxchg() in __update_gt_cputime() Use atomic64_try_cmpxchg() instead of atomic64_cmpxchg() in __update_gt_cputime(). The x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg() (and related move instruction in front of cmpxchg()). Also, atomic64_try_cmpxchg() implicitly assigns old *ptr value to "old" when cmpxchg() fails. There is no need to re-read the value in the loop. No functional change intended. Signed-off-by: Uros Bizjak Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20230116165337.5810-1-ubizjak@gmail.com --- kernel/time/posix-cpu-timers.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index cb925e8ef9a8..2f5e9b34022c 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -243,13 +243,12 @@ static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, */ static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime) { - u64 curr_cputime; -retry: - curr_cputime = atomic64_read(cputime); - if (sum_cputime > curr_cputime) { - if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime) - goto retry; - } + u64 curr_cputime = atomic64_read(cputime); + + do { + if (sum_cputime <= curr_cputime) + return; + } while (!atomic64_try_cmpxchg(cputime, &curr_cputime, sum_cputime)); } static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, From 0051293c533017e2a860e0a0a33517bc40240fff Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 1 Feb 2023 13:53:07 -0800 Subject: [PATCH 13/30] clocksource: Enable TSC watchdog checking of HPET and PMTMR only when requested Unconditionally enabling TSC watchdog checking of the HPET and PMTMR clocksources can degrade latency and performance. Therefore, provide a new "watchdog" option to the tsc= boot parameter that opts into such checking. Note that tsc=watchdog is overridden by a tsc=nowatchdog regardless of their relative positions in the list of boot parameters. Reported-by: Thomas Gleixner Reported-by: Waiman Long Signed-off-by: Paul E. McKenney Acked-by: Waiman Long --- .../admin-guide/kernel-parameters.txt | 6 ++++++ arch/x86/kernel/tsc.c | 18 ++++++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 95f0d104c232..7b4df6d89d3c 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6373,6 +6373,12 @@ (HPET or PM timer) on systems whose TSC frequency was obtained from HW or FW using either an MSR or CPUID(0x15). Warn if the difference is more than 500 ppm. + [x86] watchdog: Use TSC as the watchdog clocksource with + which to check other HW timers (HPET or PM timer), but + only on systems where TSC has been deemed trustworthy. + This will be suppressed by an earlier tsc=nowatchdog and + can be overridden by a later tsc=nowatchdog. A console + message will flag any such suppression or overriding. tsc_early_khz= [X86] Skip early TSC calibration and use the given value instead. Useful when the early TSC frequency discovery diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index a5371c6d4b64..306c233c98d8 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -294,6 +294,7 @@ __setup("notsc", notsc_setup); static int no_sched_irq_time; static int no_tsc_watchdog; +static int tsc_as_watchdog; static int __init tsc_setup(char *str) { @@ -303,10 +304,22 @@ static int __init tsc_setup(char *str) no_sched_irq_time = 1; if (!strcmp(str, "unstable")) mark_tsc_unstable("boot parameter"); - if (!strcmp(str, "nowatchdog")) + if (!strcmp(str, "nowatchdog")) { no_tsc_watchdog = 1; + if (tsc_as_watchdog) + pr_alert("%s: Overriding earlier tsc=watchdog with tsc=nowatchdog\n", + __func__); + tsc_as_watchdog = 0; + } if (!strcmp(str, "recalibrate")) tsc_force_recalibrate = 1; + if (!strcmp(str, "watchdog")) { + if (no_tsc_watchdog) + pr_alert("%s: tsc=watchdog overridden by earlier tsc=nowatchdog\n", + __func__); + else + tsc_as_watchdog = 1; + } return 1; } @@ -1192,7 +1205,8 @@ static void __init tsc_disable_clocksource_watchdog(void) bool tsc_clocksource_watchdog_disabled(void) { - return !(clocksource_tsc.flags & CLOCK_SOURCE_MUST_VERIFY); + return !(clocksource_tsc.flags & CLOCK_SOURCE_MUST_VERIFY) && + tsc_as_watchdog && !no_tsc_watchdog; } static void __init check_system_tsc_reliable(void) From 5b268d8abaec6cbd4bd70d062e769098d96670aa Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 2 Feb 2023 16:12:14 +0100 Subject: [PATCH 14/30] time/debug: Fix memory leak with using debugfs_lookup() When calling debugfs_lookup() the result must have dput() called on it, otherwise the memory will leak over time. To make things simpler, just call debugfs_lookup_and_remove() instead which handles all of the logic at once. Signed-off-by: Greg Kroah-Hartman Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20230202151214.2306822-1-gregkh@linuxfoundation.org --- kernel/time/test_udelay.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c index 13b11eb62685..20d5df631570 100644 --- a/kernel/time/test_udelay.c +++ b/kernel/time/test_udelay.c @@ -149,7 +149,7 @@ module_init(udelay_test_init); static void __exit udelay_test_exit(void) { mutex_lock(&udelay_test_lock); - debugfs_remove(debugfs_lookup(DEBUGFS_FILENAME, NULL)); + debugfs_lookup_and_remove(DEBUGFS_FILENAME, NULL); mutex_unlock(&udelay_test_lock); } From b3cbfb7927922e4586f890a70a22f13130247b4e Mon Sep 17 00:00:00 2001 From: Jagan Teki Date: Thu, 24 Nov 2022 00:01:18 +0530 Subject: [PATCH 15/30] dt-bindings: timer: rk-timer: Add rktimer for rv1126 Add rockchip timer compatible string for rockchip rv1126. Signed-off-by: Jagan Teki Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20221123183124.6911-3-jagan@edgeble.ai Signed-off-by: Daniel Lezcano --- Documentation/devicetree/bindings/timer/rockchip,rk-timer.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/timer/rockchip,rk-timer.yaml b/Documentation/devicetree/bindings/timer/rockchip,rk-timer.yaml index b61ed1a431bb..65e59836a660 100644 --- a/Documentation/devicetree/bindings/timer/rockchip,rk-timer.yaml +++ b/Documentation/devicetree/bindings/timer/rockchip,rk-timer.yaml @@ -17,6 +17,7 @@ properties: - items: - enum: - rockchip,rv1108-timer + - rockchip,rv1126-timer - rockchip,rk3036-timer - rockchip,rk3128-timer - rockchip,rk3188-timer From 8b3b8fbb4896984b5564789a42240e4b3caddb61 Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Tue, 3 Jan 2023 19:41:00 +0530 Subject: [PATCH 16/30] RISC-V: time: initialize hrtimer based broadcast clock event device Similarly to commit 022eb8ae8b5e ("ARM: 8938/1: kernel: initialize broadcast hrtimer based clock event device"), RISC-V needs to initiate hrtimer based broadcast clock event device before C3STOP can be used. Otherwise, the introduction of C3STOP for the RISC-V arch timer in commit 232ccac1bd9b ("clocksource/drivers/riscv: Events are stopped during CPU suspend") leaves us without any broadcast timer registered. This prevents the kernel from entering oneshot mode, which breaks timer behaviour, for example clock_nanosleep(). A test app that sleeps each cpu for 6, 5, 4, 3 ms respectively, HZ=250 & C3STOP enabled, the sleep times are rounded up to the next jiffy: == CPU: 1 == == CPU: 2 == == CPU: 3 == == CPU: 4 == Mean: 7.974992 Mean: 7.976534 Mean: 7.962591 Mean: 3.952179 Std Dev: 0.154374 Std Dev: 0.156082 Std Dev: 0.171018 Std Dev: 0.076193 Hi: 9.472000 Hi: 10.495000 Hi: 8.864000 Hi: 4.736000 Lo: 6.087000 Lo: 6.380000 Lo: 4.872000 Lo: 3.403000 Samples: 521 Samples: 521 Samples: 521 Samples: 521 Link: https://lore.kernel.org/linux-riscv/YzYTNQRxLr7Q9JR0@spud/ Fixes: 232ccac1bd9b ("clocksource/drivers/riscv: Events are stopped during CPU suspend") Suggested-by: Samuel Holland Signed-off-by: Conor Dooley Signed-off-by: Anup Patel Reviewed-by: Samuel Holland Acked-by: Palmer Dabbelt Link: https://lore.kernel.org/r/20230103141102.772228-2-apatel@ventanamicro.com Signed-off-by: Daniel Lezcano --- arch/riscv/kernel/time.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/riscv/kernel/time.c b/arch/riscv/kernel/time.c index 8217b0f67c6c..1cf21db4fcc7 100644 --- a/arch/riscv/kernel/time.c +++ b/arch/riscv/kernel/time.c @@ -5,6 +5,7 @@ */ #include +#include #include #include #include @@ -29,6 +30,8 @@ void __init time_init(void) of_clk_init(NULL); timer_probe(); + + tick_setup_hrtimer_broadcast(); } void clocksource_arch_init(struct clocksource *cs) From e2bcf2d876fd7ca6ecca09794ac58d7e3a544794 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Tue, 3 Jan 2023 19:41:01 +0530 Subject: [PATCH 17/30] dt-bindings: timer: Add bindings for the RISC-V timer device We add DT bindings for a separate RISC-V timer DT node which can be used to describe implementation specific behaviour (such as timer interrupt not triggered during non-retentive suspend). Signed-off-by: Anup Patel Reviewed-by: Conor Dooley Reviewed-by: Rob Herring Acked-by: Palmer Dabbelt Link: https://lore.kernel.org/r/20230103141102.772228-3-apatel@ventanamicro.com Signed-off-by: Daniel Lezcano --- .../bindings/timer/riscv,timer.yaml | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 Documentation/devicetree/bindings/timer/riscv,timer.yaml diff --git a/Documentation/devicetree/bindings/timer/riscv,timer.yaml b/Documentation/devicetree/bindings/timer/riscv,timer.yaml new file mode 100644 index 000000000000..38d67e1a5a79 --- /dev/null +++ b/Documentation/devicetree/bindings/timer/riscv,timer.yaml @@ -0,0 +1,52 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/timer/riscv,timer.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: RISC-V timer + +maintainers: + - Anup Patel + +description: |+ + RISC-V platforms always have a RISC-V timer device for the supervisor-mode + based on the time CSR defined by the RISC-V privileged specification. The + timer interrupts of this device are configured using the RISC-V SBI Time + extension or the RISC-V Sstc extension. + + The clock frequency of RISC-V timer device is specified via the + "timebase-frequency" DT property of "/cpus" DT node which is described + in Documentation/devicetree/bindings/riscv/cpus.yaml + +properties: + compatible: + enum: + - riscv,timer + + interrupts-extended: + minItems: 1 + maxItems: 4096 # Should be enough? + + riscv,timer-cannot-wake-cpu: + type: boolean + description: + If present, the timer interrupt cannot wake up the CPU from one or + more suspend/idle states. + +additionalProperties: false + +required: + - compatible + - interrupts-extended + +examples: + - | + timer { + compatible = "riscv,timer"; + interrupts-extended = <&cpu1intc 5>, + <&cpu2intc 5>, + <&cpu3intc 5>, + <&cpu4intc 5>; + }; +... From 8932a9533a9cdd1fa2924a061dc87277991507ca Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Tue, 3 Jan 2023 19:41:02 +0530 Subject: [PATCH 18/30] clocksource/drivers/timer-riscv: Set CLOCK_EVT_FEAT_C3STOP based on DT We should set CLOCK_EVT_FEAT_C3STOP for a clock_event_device only when riscv,timer-cannot-wake-cpu DT property is present in the RISC-V timer DT node. This way CLOCK_EVT_FEAT_C3STOP feature is set for clock_event_device based on RISC-V platform capabilities rather than having it set for all RISC-V platforms. Signed-off-by: Anup Patel Reviewed-by: Conor Dooley Acked-by: Palmer Dabbelt Link: https://lore.kernel.org/r/20230103141102.772228-4-apatel@ventanamicro.com Signed-off-by: Daniel Lezcano --- drivers/clocksource/timer-riscv.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/clocksource/timer-riscv.c b/drivers/clocksource/timer-riscv.c index a0d66fabf073..1b4b36df5484 100644 --- a/drivers/clocksource/timer-riscv.c +++ b/drivers/clocksource/timer-riscv.c @@ -28,6 +28,7 @@ #include static DEFINE_STATIC_KEY_FALSE(riscv_sstc_available); +static bool riscv_timer_cannot_wake_cpu; static int riscv_clock_next_event(unsigned long delta, struct clock_event_device *ce) @@ -85,6 +86,8 @@ static int riscv_timer_starting_cpu(unsigned int cpu) ce->cpumask = cpumask_of(cpu); ce->irq = riscv_clock_event_irq; + if (riscv_timer_cannot_wake_cpu) + ce->features |= CLOCK_EVT_FEAT_C3STOP; clockevents_config_and_register(ce, riscv_timebase, 100, 0x7fffffff); enable_percpu_irq(riscv_clock_event_irq, @@ -139,6 +142,13 @@ static int __init riscv_timer_init_dt(struct device_node *n) if (cpuid != smp_processor_id()) return 0; + child = of_find_compatible_node(NULL, NULL, "riscv,timer"); + if (child) { + riscv_timer_cannot_wake_cpu = of_property_read_bool(child, + "riscv,timer-cannot-wake-cpu"); + of_node_put(child); + } + domain = NULL; child = of_get_compatible_child(n, "riscv,cpu-intc"); if (!child) { From 674402b0098b66b8ba91fe93c0d27af703256098 Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Tue, 27 Dec 2022 18:44:44 -0600 Subject: [PATCH 19/30] clocksource/drivers/riscv: Increase the clock source rating RISC-V provides an architectural clock source via the time CSR. This clock source exposes a 64-bit counter synchronized across all CPUs. Because it is accessed using a CSR, it is much more efficient to read than MMIO clock sources. For example, on the Allwinner D1, reading the sun4i timer in a loop takes 131 cycles/iteration, while reading the RISC-V time CSR takes only 5 cycles/iteration. Adjust the RISC-V clock source rating so it is preferred over the various platform-specific MMIO clock sources. Signed-off-by: Samuel Holland Acked-by: Palmer Dabbelt Reviewed-by: Palmer Dabbelt Reviewed-by: Anup Patel Reviewed-by: Lad Prabhakar Link: https://lore.kernel.org/r/20221228004444.61568-1-samuel@sholland.org Signed-off-by: Daniel Lezcano --- drivers/clocksource/timer-riscv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clocksource/timer-riscv.c b/drivers/clocksource/timer-riscv.c index 1b4b36df5484..adf7f98aab17 100644 --- a/drivers/clocksource/timer-riscv.c +++ b/drivers/clocksource/timer-riscv.c @@ -74,7 +74,7 @@ static u64 notrace riscv_sched_clock(void) static struct clocksource riscv_clocksource = { .name = "riscv_clocksource", - .rating = 300, + .rating = 400, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS, .read = riscv_clocksource_rdtime, From 8d17aca90bcf36833271de6531edb58a91c40e9f Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Sat, 21 Jan 2023 18:29:11 +0100 Subject: [PATCH 20/30] clocksource/drivers/timer-microchip-pit64b: Drop obsolete dependency on COMPILE_TEST Since commit 0166dc11be91 ("of: make CONFIG_OF user selectable"), it is possible to test-build any driver which depends on OF on any architecture by explicitly selecting OF. Therefore depending on COMPILE_TEST as an alternative is no longer needed. Signed-off-by: Jean Delvare Cc: Claudiu Beznea Cc: Daniel Lezcano Cc: Thomas Gleixner Reviewed-by: Claudiu Beznea Link: https://lore.kernel.org/r/20230121182911.4e47a5ff@endymion.delvare Signed-off-by: Daniel Lezcano --- drivers/clocksource/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig index 4469e7f555e9..45085abbe197 100644 --- a/drivers/clocksource/Kconfig +++ b/drivers/clocksource/Kconfig @@ -706,7 +706,7 @@ config INGENIC_OST config MICROCHIP_PIT64B bool "Microchip PIT64B support" - depends on OF || COMPILE_TEST + depends on OF select TIMER_OF help This option enables Microchip PIT64B timer for Atmel From c3daa4754f3c57231bf47dcf4bdf897bc5c5f1f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 23 Jan 2023 23:02:21 +0100 Subject: [PATCH 21/30] clocksource/drivers/sh_cmt: Mark driver as non-removable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The comment in the remove callback suggests that the driver is not supposed to be unbound. However returning an error code in the remove callback doesn't accomplish that. Instead set the suppress_bind_attrs property (which makes it impossible to unbind the driver via sysfs). The only remaining way to unbind a sh_cmt device would be module unloading, but that doesn't apply here, as the driver cannot be built as a module. Also drop the useless remove callback. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20230123220221.48164-1-u.kleine-koenig@pengutronix.de Signed-off-by: Daniel Lezcano --- drivers/clocksource/sh_cmt.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c index 7b952aa52c0b..8b2e079d9df2 100644 --- a/drivers/clocksource/sh_cmt.c +++ b/drivers/clocksource/sh_cmt.c @@ -1145,17 +1145,12 @@ static int sh_cmt_probe(struct platform_device *pdev) return 0; } -static int sh_cmt_remove(struct platform_device *pdev) -{ - return -EBUSY; /* cannot unregister clockevent and clocksource */ -} - static struct platform_driver sh_cmt_device_driver = { .probe = sh_cmt_probe, - .remove = sh_cmt_remove, .driver = { .name = "sh_cmt", .of_match_table = of_match_ptr(sh_cmt_of_table), + .suppress_bind_attrs = true, }, .id_table = sh_cmt_id_table, }; From 3aff0403f814df6ce2377a6ecf61dd7750a3925f Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Thu, 29 Dec 2022 22:46:01 +0000 Subject: [PATCH 22/30] clocksource/drivers/riscv: Get rid of clocksource_arch_init() callback Having a clocksource_arch_init() callback always sets vdso_clock_mode to VDSO_CLOCKMODE_ARCHTIMER if GENERIC_GETTIMEOFDAY is enabled, this is required for the riscv-timer. This works for platforms where just riscv-timer clocksource is present. On platforms where other clock sources are available we want them to register with vdso_clock_mode set to VDSO_CLOCKMODE_NONE. On the Renesas RZ/Five SoC OSTM block can be used as clocksource [0], to avoid multiple clock sources being registered as VDSO_CLOCKMODE_ARCHTIMER move setting of vdso_clock_mode in the riscv-timer driver instead of doing this in clocksource_arch_init() callback as done similarly for ARM/64 architecture. [0] drivers/clocksource/renesas-ostm.c Signed-off-by: Lad Prabhakar Tested-by: Samuel Holland Reviewed-by: Conor Dooley Reviewed-by: Samuel Holland Link: https://lore.kernel.org/r/20221229224601.103851-1-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Daniel Lezcano --- arch/riscv/Kconfig | 1 - arch/riscv/kernel/time.c | 9 --------- drivers/clocksource/timer-riscv.c | 5 +++++ 3 files changed, 5 insertions(+), 10 deletions(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index e2b656043abf..9c687da7756d 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -12,7 +12,6 @@ config 32BIT config RISCV def_bool y - select ARCH_CLOCKSOURCE_INIT select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2 select ARCH_HAS_BINFMT_FLAT diff --git a/arch/riscv/kernel/time.c b/arch/riscv/kernel/time.c index 1cf21db4fcc7..babaf3b48ba8 100644 --- a/arch/riscv/kernel/time.c +++ b/arch/riscv/kernel/time.c @@ -33,12 +33,3 @@ void __init time_init(void) tick_setup_hrtimer_broadcast(); } - -void clocksource_arch_init(struct clocksource *cs) -{ -#ifdef CONFIG_GENERIC_GETTIMEOFDAY - cs->vdso_clock_mode = VDSO_CLOCKMODE_ARCHTIMER; -#else - cs->vdso_clock_mode = VDSO_CLOCKMODE_NONE; -#endif -} diff --git a/drivers/clocksource/timer-riscv.c b/drivers/clocksource/timer-riscv.c index adf7f98aab17..d8cb629d3aab 100644 --- a/drivers/clocksource/timer-riscv.c +++ b/drivers/clocksource/timer-riscv.c @@ -78,6 +78,11 @@ static struct clocksource riscv_clocksource = { .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS, .read = riscv_clocksource_rdtime, +#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY) + .vdso_clock_mode = VDSO_CLOCKMODE_ARCHTIMER, +#else + .vdso_clock_mode = VDSO_CLOCKMODE_NONE, +#endif }; static int riscv_timer_starting_cpu(unsigned int cpu) From 27788e01a6507c882946b9aa87318c3185cd83e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bernhard=20Rosenkr=C3=A4nzer?= Date: Wed, 25 Jan 2023 15:35:01 +0100 Subject: [PATCH 23/30] dt-bindings: timer: mediatek,mtk-timer: add MT8365 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add binding description for mediatek,mt8365-systimer Signed-off-by: Bernhard Rosenkränzer Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20230125143503.1015424-8-bero@baylibre.com Signed-off-by: Daniel Lezcano --- Documentation/devicetree/bindings/timer/mediatek,mtk-timer.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/timer/mediatek,mtk-timer.txt b/Documentation/devicetree/bindings/timer/mediatek,mtk-timer.txt index 8bbb6e94508b..b3e797e8aa31 100644 --- a/Documentation/devicetree/bindings/timer/mediatek,mtk-timer.txt +++ b/Documentation/devicetree/bindings/timer/mediatek,mtk-timer.txt @@ -33,6 +33,7 @@ Required properties: For those SoCs that use CPUX * "mediatek,mt6795-systimer" for MT6795 compatible timers (CPUX) + * "mediatek,mt8365-systimer" for MT8365 compatible timers (CPUX) - reg: Should contain location and length for timer register. - clocks: Should contain system clock. From abd873afc889c0b4348ec4b567d83f97df8edaf6 Mon Sep 17 00:00:00 2001 From: Icenowy Zheng Date: Thu, 2 Feb 2023 15:28:14 +0800 Subject: [PATCH 24/30] dt-bindings: timer: sifive,clint: add comaptibles for T-Head's C9xx T-Head C906/C910 CLINT is not compliant to SiFive ones (and even not compliant to the newcoming ACLINT spec) because of lack of mtime register. Add a compatible string formatted like the C9xx-specific PLIC compatible, and do not allow a SiFive one as fallback because they're not really compliant. Signed-off-by: Icenowy Zheng Acked-by: Krzysztof Kozlowski Reviewed-by: Samuel Holland Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20230202072814.319903-1-uwu@icenowy.me Signed-off-by: Daniel Lezcano --- Documentation/devicetree/bindings/timer/sifive,clint.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Documentation/devicetree/bindings/timer/sifive,clint.yaml b/Documentation/devicetree/bindings/timer/sifive,clint.yaml index bbad24165837..aada6957216c 100644 --- a/Documentation/devicetree/bindings/timer/sifive,clint.yaml +++ b/Documentation/devicetree/bindings/timer/sifive,clint.yaml @@ -20,6 +20,10 @@ description: property of "/cpus" DT node. The "timebase-frequency" DT property is described in Documentation/devicetree/bindings/riscv/cpus.yaml + T-Head C906/C910 CPU cores include an implementation of CLINT too, however + their implementation lacks a memory-mapped MTIME register, thus not + compatible with SiFive ones. + properties: compatible: oneOf: @@ -29,6 +33,10 @@ properties: - starfive,jh7100-clint - canaan,k210-clint - const: sifive,clint0 + - items: + - enum: + - allwinner,sun20i-d1-clint + - const: thead,c900-clint - items: - const: sifive,clint0 - const: riscv,clint0 From d19c8b2ed176c6aaf3dfefd149c740f73ce4875e Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Fri, 3 Feb 2023 15:05:36 +0200 Subject: [PATCH 25/30] clocksource/drivers/timer-microchip-pit64b: Select driver only on ARM Microchip PIT64B is currently available on ARM based devices. Thus select it only for ARM. This allows implementing delay timer. Signed-off-by: Claudiu Beznea Link: https://lore.kernel.org/r/20230203130537.1921608-2-claudiu.beznea@microchip.com Signed-off-by: Daniel Lezcano --- drivers/clocksource/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig index 45085abbe197..ac22c1af056b 100644 --- a/drivers/clocksource/Kconfig +++ b/drivers/clocksource/Kconfig @@ -706,7 +706,7 @@ config INGENIC_OST config MICROCHIP_PIT64B bool "Microchip PIT64B support" - depends on OF + depends on OF && ARM select TIMER_OF help This option enables Microchip PIT64B timer for Atmel From f3af3dc7cc351bd22742eac992c72c76749ce8c3 Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Fri, 3 Feb 2023 15:05:37 +0200 Subject: [PATCH 26/30] clocksource/drivers/timer-microchip-pit64b: Add delay timer Add delay timer. Signed-off-by: Claudiu Beznea Link: https://lore.kernel.org/r/20230203130537.1921608-3-claudiu.beznea@microchip.com Signed-off-by: Daniel Lezcano --- drivers/clocksource/timer-microchip-pit64b.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/clocksource/timer-microchip-pit64b.c b/drivers/clocksource/timer-microchip-pit64b.c index d5f1436f33d9..57209bb38c70 100644 --- a/drivers/clocksource/timer-microchip-pit64b.c +++ b/drivers/clocksource/timer-microchip-pit64b.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -92,6 +93,8 @@ struct mchp_pit64b_clksrc { static void __iomem *mchp_pit64b_cs_base; /* Default cycles for clockevent timer. */ static u64 mchp_pit64b_ce_cycles; +/* Delay timer. */ +static struct delay_timer mchp_pit64b_dt; static inline u64 mchp_pit64b_cnt_read(void __iomem *base) { @@ -169,6 +172,11 @@ static u64 notrace mchp_pit64b_sched_read_clk(void) return mchp_pit64b_cnt_read(mchp_pit64b_cs_base); } +static unsigned long notrace mchp_pit64b_dt_read(void) +{ + return mchp_pit64b_cnt_read(mchp_pit64b_cs_base); +} + static int mchp_pit64b_clkevt_shutdown(struct clock_event_device *cedev) { struct mchp_pit64b_timer *timer = clkevt_to_mchp_pit64b_timer(cedev); @@ -376,6 +384,10 @@ static int __init mchp_pit64b_init_clksrc(struct mchp_pit64b_timer *timer, sched_clock_register(mchp_pit64b_sched_read_clk, 64, clk_rate); + mchp_pit64b_dt.read_current_timer = mchp_pit64b_dt_read; + mchp_pit64b_dt.freq = clk_rate; + register_current_timer_delay(&mchp_pit64b_dt); + return 0; } From 225b9596cb0227c1c1b1e4a836dad43595c3e61a Mon Sep 17 00:00:00 2001 From: Matt Evans Date: Wed, 1 Feb 2023 19:49:42 +0000 Subject: [PATCH 27/30] clocksource/drivers/riscv: Patch riscv_clock_next_event() jump before first use A static key is used to select between SBI and Sstc timer usage in riscv_clock_next_event(), but currently the direction is resolved after cpuhp_setup_state() is called (which sets the next event). The first event will therefore fall through the sbi_set_timer() path; this breaks Sstc-only systems. So, apply the jump patching before first use. Fixes: 9f7a8ff6391f ("RISC-V: Prefer sstc extension if available") Signed-off-by: Matt Evans Reviewed-by: Palmer Dabbelt Acked-by: Palmer Dabbelt Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/CDDAB2D0-264E-42F3-8E31-BA210BEB8EC1@rivosinc.com Signed-off-by: Daniel Lezcano --- drivers/clocksource/timer-riscv.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/clocksource/timer-riscv.c b/drivers/clocksource/timer-riscv.c index d8cb629d3aab..5f0f10c7e222 100644 --- a/drivers/clocksource/timer-riscv.c +++ b/drivers/clocksource/timer-riscv.c @@ -192,6 +192,11 @@ static int __init riscv_timer_init_dt(struct device_node *n) return error; } + if (riscv_isa_extension_available(NULL, SSTC)) { + pr_info("Timer interrupt in S-mode is available via sstc extension\n"); + static_branch_enable(&riscv_sstc_available); + } + error = cpuhp_setup_state(CPUHP_AP_RISCV_TIMER_STARTING, "clockevents/riscv/timer:starting", riscv_timer_starting_cpu, riscv_timer_dying_cpu); @@ -199,11 +204,6 @@ static int __init riscv_timer_init_dt(struct device_node *n) pr_err("cpu hp setup state failed for RISCV timer [%d]\n", error); - if (riscv_isa_extension_available(NULL, SSTC)) { - pr_info("Timer interrupt in S-mode is available via sstc extension\n"); - static_branch_enable(&riscv_sstc_available); - } - return error; } From d8c695d310a8607fd1d6a2750368467c4e0cef6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 7 Feb 2023 20:36:14 +0100 Subject: [PATCH 28/30] clocksource/drivers/sh_tmu: Mark driver as non-removable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The comment in the remove callback suggests that the driver is not supposed to be unbound. However returning an error code in the remove callback doesn't accomplish that. Instead set the suppress_bind_attrs property (which makes it impossible to unbind the driver via sysfs). The only remaining way to unbind a sh_tmu device would be module unloading, but that doesn't apply here, as the driver cannot be built as a module. Also drop the useless remove callback. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20230207193614.472060-1-u.kleine-koenig@pengutronix.de Signed-off-by: Daniel Lezcano --- drivers/clocksource/sh_tmu.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/clocksource/sh_tmu.c b/drivers/clocksource/sh_tmu.c index b00dec0655cb..932f31a7c5be 100644 --- a/drivers/clocksource/sh_tmu.c +++ b/drivers/clocksource/sh_tmu.c @@ -632,11 +632,6 @@ static int sh_tmu_probe(struct platform_device *pdev) return 0; } -static int sh_tmu_remove(struct platform_device *pdev) -{ - return -EBUSY; /* cannot unregister clockevent and clocksource */ -} - static const struct platform_device_id sh_tmu_id_table[] = { { "sh-tmu", SH_TMU }, { "sh-tmu-sh3", SH_TMU_SH3 }, @@ -652,10 +647,10 @@ MODULE_DEVICE_TABLE(of, sh_tmu_of_table); static struct platform_driver sh_tmu_device_driver = { .probe = sh_tmu_probe, - .remove = sh_tmu_remove, .driver = { .name = "sh_tmu", .of_match_table = of_match_ptr(sh_tmu_of_table), + .suppress_bind_attrs = true, }, .id_table = sh_tmu_id_table, }; From cf16f631b0bbd98df4b2ccc13346f564412eae68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 7 Feb 2023 20:30:10 +0100 Subject: [PATCH 29/30] clocksource/drivers/em_sti: Mark driver as non-removable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The comment in the remove callback suggests that the driver is not supposed to be unbound. However returning an error code in the remove callback doesn't accomplish that. Instead set the suppress_bind_attrs property (which makes it impossible to unbind the driver via sysfs). The only remaining way to unbind a em_sti device would be module unloading, but that doesn't apply here, as the driver cannot be built as a module. Also drop the useless remove callback. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20230207193010.469495-1-u.kleine-koenig@pengutronix.de Signed-off-by: Daniel Lezcano --- drivers/clocksource/em_sti.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/clocksource/em_sti.c b/drivers/clocksource/em_sti.c index ab190dffb1ed..c04b47bd4868 100644 --- a/drivers/clocksource/em_sti.c +++ b/drivers/clocksource/em_sti.c @@ -333,11 +333,6 @@ static int em_sti_probe(struct platform_device *pdev) return 0; } -static int em_sti_remove(struct platform_device *pdev) -{ - return -EBUSY; /* cannot unregister clockevent and clocksource */ -} - static const struct of_device_id em_sti_dt_ids[] = { { .compatible = "renesas,em-sti", }, {}, @@ -346,10 +341,10 @@ MODULE_DEVICE_TABLE(of, em_sti_dt_ids); static struct platform_driver em_sti_device_driver = { .probe = em_sti_probe, - .remove = em_sti_remove, .driver = { .name = "em_sti", .of_match_table = em_sti_dt_ids, + .suppress_bind_attrs = true, } }; From 5ccb51b06c8bfcfca99d3f68e671eead7318301d Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 9 Feb 2023 12:02:39 +0800 Subject: [PATCH 30/30] clocksource/drivers/timer-sun4i: Add CLOCK_EVT_FEAT_DYNIRQ Add CLOCK_EVT_FEAT_DYNIRQ to allow the IRQ could be runtime set affinity to the cores that needs wake up, otherwise saying core0 has to send IPI to wakeup core1. With CLOCK_EVT_FEAT_DYNIRQ set, when broadcast timer could wake up the cores, IPI is not needed. After enabling this feature, especially the scene where cpuidle is enabled can benefit. Signed-off-by: Yangtao Li Link: https://lore.kernel.org/r/20230209040239.24710-1-frank.li@vivo.com Signed-off-by: Daniel Lezcano --- drivers/clocksource/timer-sun4i.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/clocksource/timer-sun4i.c b/drivers/clocksource/timer-sun4i.c index e5a70aa1deb4..7bdcc60ad43c 100644 --- a/drivers/clocksource/timer-sun4i.c +++ b/drivers/clocksource/timer-sun4i.c @@ -144,7 +144,8 @@ static struct timer_of to = { .clkevt = { .name = "sun4i_tick", .rating = 350, - .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT | + CLOCK_EVT_FEAT_DYNIRQ, .set_state_shutdown = sun4i_clkevt_shutdown, .set_state_periodic = sun4i_clkevt_set_periodic, .set_state_oneshot = sun4i_clkevt_set_oneshot,