diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 5db055821ef3..69551020bb97 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -163,6 +163,7 @@ enum hrtimer_base_type { * @cpu: cpu number * @active_bases: Bitfield to mark bases with active timers * @clock_was_set_seq: Sequence counter of clock was set events + * @migration_enabled: The migration of hrtimers to other cpus is enabled * @expires_next: absolute time of the next event which was scheduled * via clock_set_next_event() * @next_timer: Pointer to the first expiring timer @@ -186,6 +187,7 @@ struct hrtimer_cpu_base { unsigned int cpu; unsigned int active_bases; unsigned int clock_was_set_seq; + bool migration_enabled; #ifdef CONFIG_HIGH_RES_TIMERS unsigned int in_hrtirq : 1, hres_active : 1, diff --git a/include/linux/sched.h b/include/linux/sched.h index 26a2e6122734..d7151460b0cf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -335,14 +335,10 @@ extern int runqueue_is_locked(int cpu); #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) extern void nohz_balance_enter_idle(int cpu); extern void set_cpu_sd_state_idle(void); -extern int get_nohz_timer_target(int pinned); +extern int get_nohz_timer_target(void); #else static inline void nohz_balance_enter_idle(int cpu) { } static inline void set_cpu_sd_state_idle(void) { } -static inline int get_nohz_timer_target(int pinned) -{ - return smp_processor_id(); -} #endif /* diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 596a0e007c62..c9e4731cf10b 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -57,24 +57,12 @@ extern unsigned int sysctl_numa_balancing_scan_size; extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_nr_migrate; extern unsigned int sysctl_sched_time_avg; -extern unsigned int sysctl_timer_migration; extern unsigned int sysctl_sched_shares_window; int sched_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos); #endif -#ifdef CONFIG_SCHED_DEBUG -static inline unsigned int get_sysctl_timer_migration(void) -{ - return sysctl_timer_migration; -} -#else -static inline unsigned int get_sysctl_timer_migration(void) -{ - return 1; -} -#endif /* * control realtime throttling: diff --git a/include/linux/timer.h b/include/linux/timer.h index ff0689b6e297..61aa61dc410c 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -238,6 +238,15 @@ extern void run_local_timers(void); struct hrtimer; extern enum hrtimer_restart it_real_fn(struct hrtimer *); +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +#include + +extern unsigned int sysctl_timer_migration; +int timer_migration_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); +#endif + unsigned long __round_jiffies(unsigned long j, int cpu); unsigned long __round_jiffies_relative(unsigned long j, int cpu); unsigned long round_jiffies(unsigned long j); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0ef80a0bbabb..d72fa24f2312 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1432,8 +1432,6 @@ module_param(rcu_idle_gp_delay, int, 0644); static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; module_param(rcu_idle_lazy_gp_delay, int, 0644); -extern int tick_nohz_active; - /* * Try to advance callbacks for all flavors of RCU on the current CPU, but * only if it has been awhile since the last time we did so. Afterwards, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ecb7c4216350..e9f25ce70c77 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -572,13 +572,12 @@ void resched_cpu(int cpu) * selecting an idle cpu will add more delays to the timers than intended * (as that cpu's timer base may not be uptodate wrt jiffies etc). */ -int get_nohz_timer_target(int pinned) +int get_nohz_timer_target(void) { - int cpu = smp_processor_id(); - int i; + int i, cpu = smp_processor_id(); struct sched_domain *sd; - if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu)) + if (!idle_cpu(cpu)) return cpu; rcu_read_lock(); @@ -7050,8 +7049,6 @@ void __init sched_init_smp(void) } #endif /* CONFIG_SMP */ -const_debug unsigned int sysctl_timer_migration = 1; - int in_sched_functions(unsigned long addr) { return in_lock_functions(addr) || diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2082b1a88fb9..b13e9d2de302 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -349,15 +349,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "timer_migration", - .data = &sysctl_timer_migration, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, - }, #endif /* CONFIG_SMP */ #ifdef CONFIG_NUMA_BALANCING { @@ -1132,6 +1123,15 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) + { + .procname = "timer_migration", + .data = &sysctl_timer_migration, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = timer_migration_handler, + }, +#endif { } }; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index f026413de4d6..6115f4df119b 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -177,6 +177,24 @@ hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) #endif } +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +static inline +struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, + int pinned) +{ + if (pinned || !base->migration_enabled) + return this_cpu_ptr(&hrtimer_bases); + return &per_cpu(hrtimer_bases, get_nohz_timer_target()); +} +#else +static inline +struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, + int pinned) +{ + return this_cpu_ptr(&hrtimer_bases); +} +#endif + /* * Switch the timer base to the current CPU when possible. */ @@ -184,14 +202,13 @@ static inline struct hrtimer_clock_base * switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, int pinned) { + struct hrtimer_cpu_base *new_cpu_base, *this_base; struct hrtimer_clock_base *new_base; - struct hrtimer_cpu_base *new_cpu_base; - int this_cpu = smp_processor_id(); - int cpu = get_nohz_timer_target(pinned); int basenum = base->index; + this_base = this_cpu_ptr(&hrtimer_bases); + new_cpu_base = get_target_base(this_base, pinned); again: - new_cpu_base = &per_cpu(hrtimer_bases, cpu); new_base = &new_cpu_base->clock_base[basenum]; if (base != new_base) { @@ -212,17 +229,19 @@ again: raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); - if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { - cpu = this_cpu; + if (new_cpu_base != this_base && + hrtimer_check_target(timer, new_base)) { raw_spin_unlock(&new_base->cpu_base->lock); raw_spin_lock(&base->cpu_base->lock); + new_cpu_base = this_base; timer->base = base; goto again; } timer->base = new_base; } else { - if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { - cpu = this_cpu; + if (new_cpu_base != this_base && + hrtimer_check_target(timer, new_base)) { + new_cpu_base = this_base; goto again; } } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index ec2208aabdd1..2edde84744df 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -149,4 +149,18 @@ extern void tick_nohz_init(void); static inline void tick_nohz_init(void) { } #endif +#ifdef CONFIG_NO_HZ_COMMON +extern unsigned long tick_nohz_active; +#else +#define tick_nohz_active (0) +#endif + +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +extern void timers_update_migration(void); +#else +static inline void timers_update_migration(void) { } +#endif + +DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); + extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 812f7a3b9898..b1cb01699355 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -399,7 +399,7 @@ void __init tick_nohz_init(void) * NO HZ enabled ? */ static int tick_nohz_enabled __read_mostly = 1; -int tick_nohz_active __read_mostly; +unsigned long tick_nohz_active __read_mostly; /* * Enable / Disable tickless mode */ @@ -956,6 +956,16 @@ static void tick_nohz_handler(struct clock_event_device *dev) tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); } +static inline void tick_nohz_activate(struct tick_sched *ts, int mode) +{ + if (!tick_nohz_enabled) + return; + ts->nohz_mode = mode; + /* One update is enough */ + if (!test_and_set_bit(0, &tick_nohz_active)) + timers_update_migration(); +} + /** * tick_nohz_switch_to_nohz - switch to nohz mode */ @@ -970,9 +980,6 @@ static void tick_nohz_switch_to_nohz(void) if (tick_switch_to_oneshot(tick_nohz_handler)) return; - tick_nohz_active = 1; - ts->nohz_mode = NOHZ_MODE_LOWRES; - /* * Recycle the hrtimer in ts, so we can share the * hrtimer_forward with the highres code. @@ -984,6 +991,7 @@ static void tick_nohz_switch_to_nohz(void) hrtimer_forward_now(&ts->sched_timer, tick_period); hrtimer_set_expires(&ts->sched_timer, next); tick_program_event(next, 1); + tick_nohz_activate(ts, NOHZ_MODE_LOWRES); } /* @@ -1035,6 +1043,7 @@ static inline void tick_nohz_irq_enter(void) static inline void tick_nohz_switch_to_nohz(void) { } static inline void tick_nohz_irq_enter(void) { } +static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { } #endif /* CONFIG_NO_HZ_COMMON */ @@ -1117,13 +1126,7 @@ void tick_setup_sched_timer(void) hrtimer_forward(&ts->sched_timer, now, tick_period); hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); - -#ifdef CONFIG_NO_HZ_COMMON - if (tick_nohz_enabled) { - ts->nohz_mode = NOHZ_MODE_HIGHRES; - tick_nohz_active = 1; - } -#endif + tick_nohz_activate(ts, NOHZ_MODE_HIGHRES); } #endif /* HIGH_RES_TIMERS */ diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 3398d93c74a7..343142ed996a 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -85,6 +85,7 @@ struct tvec_base { unsigned long active_timers; unsigned long all_timers; int cpu; + bool migration_enabled; struct tvec_root tv1; struct tvec tv2; struct tvec tv3; @@ -95,6 +96,54 @@ struct tvec_base { static DEFINE_PER_CPU(struct tvec_base, tvec_bases); +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +unsigned int sysctl_timer_migration = 1; + +void timers_update_migration(void) +{ + bool on = sysctl_timer_migration && tick_nohz_active; + unsigned int cpu; + + /* Avoid the loop, if nothing to update */ + if (this_cpu_read(tvec_bases.migration_enabled) == on) + return; + + for_each_possible_cpu(cpu) { + per_cpu(tvec_bases.migration_enabled, cpu) = on; + per_cpu(hrtimer_bases.migration_enabled, cpu) = on; + } +} + +int timer_migration_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + static DEFINE_MUTEX(mutex); + int ret; + + mutex_lock(&mutex); + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (!ret && write) + timers_update_migration(); + mutex_unlock(&mutex); + return ret; +} + +static inline struct tvec_base *get_target_base(struct tvec_base *base, + int pinned) +{ + if (pinned || !base->migration_enabled) + return this_cpu_ptr(&tvec_bases); + return per_cpu_ptr(&tvec_bases, get_nohz_timer_target()); +} +#else +static inline struct tvec_base *get_target_base(struct tvec_base *base, + int pinned) +{ + return this_cpu_ptr(&tvec_bases); +} +#endif + static unsigned long round_jiffies_common(unsigned long j, int cpu, bool force_up) { @@ -716,11 +765,11 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer, static inline int __mod_timer(struct timer_list *timer, unsigned long expires, - bool pending_only, int pinned) + bool pending_only, int pinned) { struct tvec_base *base, *new_base; unsigned long flags; - int ret = 0 , cpu; + int ret = 0; timer_stats_timer_set_start_info(timer); BUG_ON(!timer->function); @@ -733,8 +782,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, debug_activate(timer, expires); - cpu = get_nohz_timer_target(pinned); - new_base = per_cpu_ptr(&tvec_bases, cpu); + new_base = get_target_base(base, pinned); if (base != new_base) { /* @@ -751,7 +799,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, spin_unlock(&base->lock); base = new_base; spin_lock(&base->lock); - timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; + timer->flags &= ~TIMER_BASEMASK; + timer->flags |= base->cpu; } } diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 1327004429be..a4536e1e3e2a 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -29,8 +29,6 @@ struct timer_list_iter { typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes); -DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); - /* * This allows printing both to /proc/timer_list and * to the console (on SysRq-Q):