From cedce3e730833d26a37826a96e1905b6ef387df9 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 4 Jul 2013 22:48:20 +0400 Subject: [PATCH 01/16] sched/__wake_up_sync_key(): Fix nr_exclusive tasks which lead to WF_SYNC clearing Only one task can replace the waker. Signed-off-by: Kirill Tkhai CC: Steven Rostedt Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/512421372963700@web25f.yandex.ru Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0d8eb4525e76..f73787159188 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2660,7 +2660,7 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, if (unlikely(!q)) return; - if (unlikely(!nr_exclusive)) + if (unlikely(nr_exclusive != 1)) wake_flags = 0; spin_lock_irqsave(&q->lock, flags); From 87e3c8ae1c8676b9dd56b56456dafa14a4bacf97 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Sun, 21 Jul 2013 04:32:07 +0400 Subject: [PATCH 02/16] sched/fair: Cleanup: remove duplicate variable declaration cfs_rq is declared twice, fix it. Also use 'se' instead of '&p->se'. Signed-off-by: Kirill Tkhai Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/169201374366727@web6d.yandex.ru Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bb456f44b7b1..ab599781129d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5889,11 +5889,9 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) * and ensure we don't carry in an old decay_count if we * switch back. */ - if (p->se.avg.decay_count) { - struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); - __synchronize_entity_decay(&p->se); - subtract_blocked_load_contrib(cfs_rq, - p->se.avg.load_avg_contrib); + if (se->avg.decay_count) { + __synchronize_entity_decay(se); + subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); } #endif } From 323f54ed0f3ce20e9946c961fc928ccdb80d9345 Mon Sep 17 00:00:00 2001 From: Tom Rini Date: Thu, 25 Jul 2013 14:26:10 -0400 Subject: [PATCH 03/16] numa: Mark __node_set() as __always_inline It is posible for some compilers to decide that __node_set() does not need to be made turned into an inline function. When the compiler does this on an __init function calling it on __initdata we get a section mismatch warning now. Use __always_inline to ensure that we will be inlined. Reported-by: Paul Bolle Cc: Jianpeng Ma Cc: Rusty Russell Cc: Lai Jiangshan Cc: Yasuaki Ishimatsu Cc: Wen Congyang Cc: Jiang Liu Cc: KOSAKI Motohiro Cc: Minchan Kim Cc: Mel Gorman Cc: David Rientjes Cc: Yinghai Lu Cc: Greg KH Signed-off-by: Tom Rini Link: http://lkml.kernel.org/r/1374776770-32361-1-git-send-email-trini@ti.com Signed-off-by: Ingo Molnar --- include/linux/nodemask.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 4e2cbfa640b7..58b9a02c38d2 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -98,8 +98,17 @@ typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t; extern nodemask_t _unused_nodemask_arg_; +/* + * The inline keyword gives the compiler room to decide to inline, or + * not inline a function as it sees best. However, as these functions + * are called in both __init and non-__init functions, if they are not + * inlined we will end up with a section mis-match error (of the type of + * freeable items not being freed). So we must use __always_inline here + * to fix the problem. If other functions in the future also end up in + * this situation they will also need to be annotated as __always_inline + */ #define node_set(node, dst) __node_set((node), &(dst)) -static inline void __node_set(int node, volatile nodemask_t *dstp) +static __always_inline void __node_set(int node, volatile nodemask_t *dstp) { set_bit(node, dstp->bits); } From 46591962cb5bfd2bfb0baf42497119c816503598 Mon Sep 17 00:00:00 2001 From: Xie XiuQi Date: Tue, 30 Jul 2013 11:06:09 +0800 Subject: [PATCH 04/16] generic-ipi: Kill unnecessary variable - csd_flags After commit 8969a5ede0f9e17da4b943712429aef2c9bcd82b ("generic-ipi: remove kmalloc()"), wait = 0 can be guaranteed, and all callsites of generic_exec_single() do an unconditional csd_lock() now. So csd_flags is unnecessary now. Remove it. Signed-off-by: Xie XiuQi Signed-off-by: Peter Zijlstra Cc: Oleg Nesterov Cc: Linus Torvalds Cc: Nick Piggin Cc: Jens Axboe Cc: "Paul E. McKenney" Cc: Rusty Russell Link: http://lkml.kernel.org/r/51F72DA1.7010401@huawei.com Signed-off-by: Ingo Molnar --- kernel/smp.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/kernel/smp.c b/kernel/smp.c index fe9f773d7114..7332697cd184 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -186,25 +186,13 @@ void generic_smp_call_function_single_interrupt(void) while (!list_empty(&list)) { struct call_single_data *csd; - unsigned int csd_flags; csd = list_entry(list.next, struct call_single_data, list); list_del(&csd->list); - /* - * 'csd' can be invalid after this call if flags == 0 - * (when called through generic_exec_single()), - * so save them away before making the call: - */ - csd_flags = csd->flags; - csd->func(csd->info); - /* - * Unlocked CSDs are valid through generic_exec_single(): - */ - if (csd_flags & CSD_FLAG_LOCK) - csd_unlock(csd); + csd_unlock(csd); } } From 8f898fbbe5ee5e20a77c4074472a1fd088dc47d1 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 31 Jul 2013 22:14:21 -0400 Subject: [PATCH 05/16] sched/x86: Optimize switch_mm() for multi-threaded workloads Dick Fowles, Don Zickus and Joe Mario have been working on improvements to perf, and noticed heavy cache line contention on the mm_cpumask, running linpack on a 60 core / 120 thread system. The cause turned out to be unnecessary atomic accesses to the mm_cpumask. When in lazy TLB mode, the CPU is only removed from the mm_cpumask if there is a TLB flush event. Most of the time, no such TLB flush happens, and the kernel skips the TLB reload. It can also skip the atomic memory set & test. Here is a summary of Joe's test results: * The __schedule function dropped from 24% of all program cycles down to 5.5%. * The cacheline contention/hotness for accesses to that bitmask went from being the 1st/2nd hottest - down to the 84th hottest (0.3% of all shared misses which is now quite cold) * The average load latency for the bit-test-n-set instruction in __schedule dropped from 10k-15k cycles down to an average of 600 cycles. * The linpack program results improved from 133 GFlops to 144 GFlops. Peak GFlops rose from 133 to 153. Reported-by: Don Zickus Reported-by: Joe Mario Tested-by: Joe Mario Signed-off-by: Rik van Riel Reviewed-by: Paul Turner Acked-by: Linus Torvalds Link: http://lkml.kernel.org/r/20130731221421.616d3d20@annuminas.surriel.com [ Made the comments consistent around the modified code. ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mmu_context.h | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index cdbf36776106..be12c534fd59 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -45,22 +45,28 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, /* Re-load page tables */ load_cr3(next->pgd); - /* stop flush ipis for the previous mm */ + /* Stop flush ipis for the previous mm */ cpumask_clear_cpu(cpu, mm_cpumask(prev)); - /* - * load the LDT, if the LDT is different: - */ + /* Load the LDT, if the LDT is different: */ if (unlikely(prev->context.ldt != next->context.ldt)) load_LDT_nolock(&next->context); } #ifdef CONFIG_SMP - else { + else { this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next); - if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) { - /* We were in lazy tlb mode and leave_mm disabled + if (!cpumask_test_cpu(cpu, mm_cpumask(next))) { + /* + * On established mms, the mm_cpumask is only changed + * from irq context, from ptep_clear_flush() while in + * lazy tlb mode, and here. Irqs are blocked during + * schedule, protecting us from simultaneous changes. + */ + cpumask_set_cpu(cpu, mm_cpumask(next)); + /* + * We were in lazy tlb mode and leave_mm disabled * tlb flush IPI delivery. We must reload CR3 * to make sure to use no freed page tables. */ From c8d2d47a9cbb4222ae4e993aa0e3703430c8193c Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Tue, 6 Aug 2013 20:06:42 +0800 Subject: [PATCH 06/16] cpumask: Fix cpumask leak in partition_sched_domains() If doms_new is NULL, partition_sched_domains() will reset ndoms_cur to 0, and free old sched domains with free_sched_domains(doms_cur, ndoms_cur). As ndoms_cur is 0, the cpumask will not be freed. Signed-off-by: Xiaotian Feng Cc: Rusty Russell Cc: linux-kernel@vger.kernel.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1375790802-11857-1-git-send-email-xtfeng@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b7415cfdd7de..cf8f100433e0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6184,8 +6184,9 @@ match1: ; } + n = ndoms_cur; if (doms_new == NULL) { - ndoms_cur = 0; + n = 0; doms_new = &fallback_doms; cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); WARN_ON_ONCE(dattr_new); @@ -6193,7 +6194,7 @@ match1: /* Build new domains */ for (i = 0; i < ndoms_new; i++) { - for (j = 0; j < ndoms_cur && !new_topology; j++) { + for (j = 0; j < n && !new_topology; j++) { if (cpumask_equal(doms_new[i], doms_cur[j]) && dattrs_equal(dattr_new, i, dattr_cur, j)) goto match2; From a4f61cc03e443647211a5ae0ab8f8cda2e9e1043 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 7 Aug 2013 15:38:24 +0000 Subject: [PATCH 07/16] sched/cputime: Use this_cpu_add() in task_group_account_field() Use of a this_cpu() operation reduces the number of instructions used for accounting (account_user_time()) and frees up some registers. This is in the scheduler tick hotpath. Signed-off-by: Christoph Lameter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/00000140596dd165-338ff7f5-893b-4fec-b251-aaac5557239e-000000@email.amazonses.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index a7959e05a9d5..e89ccefef278 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -121,7 +121,7 @@ static inline void task_group_account_field(struct task_struct *p, int index, * is the only cgroup, then nothing else should be necessary. * */ - __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; + __this_cpu_add(kernel_cpustat.cpustat[index], tmp); cpuacct_account_field(p, index, tmp); } From 95a79b805b935f4a7b685aa8a117d916c638323e Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 6 Aug 2013 17:36:41 +0900 Subject: [PATCH 08/16] sched: Remove one division operation in find_busiest_queue() Remove one division operation in find_busiest_queue() by using crosswise multiplication: wl_i / power_i > wl_j / power_j := wl_i * power_j > wl_j * power_i Signed-off-by: Joonsoo Kim [ Expanded the changelog. ] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1375778203-31343-2-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f918635efe09..8aa217f62a9e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4968,7 +4968,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, struct sched_group *group) { struct rq *busiest = NULL, *rq; - unsigned long max_load = 0; + unsigned long busiest_load = 0, busiest_power = 1; int i; for_each_cpu(i, sched_group_cpus(group)) { @@ -4998,11 +4998,15 @@ static struct rq *find_busiest_queue(struct lb_env *env, * the weighted_cpuload() scaled with the cpu power, so that * the load can be moved away from the cpu that is potentially * running at a lower capacity. + * + * Thus we're looking for max(wl_i / power_i), crosswise + * multiplication to rid ourselves of the division works out + * to: wl_i * power_j > wl_j * power_i; where j is our + * previous maximum. */ - wl = (wl * SCHED_POWER_SCALE) / power; - - if (wl > max_load) { - max_load = wl; + if (wl * busiest_power > busiest_load * power) { + busiest_load = wl; + busiest_power = power; busiest = rq; } } From 23f0d2093c789e612185180c468fa09063834e87 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 6 Aug 2013 17:36:42 +0900 Subject: [PATCH 09/16] sched: Factor out code to should_we_balance() Now checking whether this cpu is appropriate to balance or not is embedded into update_sg_lb_stats() and this checking has no direct relationship to this function. There is not enough reason to place this checking at update_sg_lb_stats(), except saving one iteration for sched_group_cpus. In this patch, I factor out this checking to should_we_balance() function. And before doing actual work for load_balancing, check whether this cpu is appropriate to balance via should_we_balance(). If this cpu is not a candidate for balancing, it quit the work immediately. With this change, we can save two memset cost and can expect better compiler optimization. Below is result of this patch. * Vanilla * text data bss dec hex filename 34499 1136 116 35751 8ba7 kernel/sched/fair.o * Patched * text data bss dec hex filename 34243 1136 116 35495 8aa7 kernel/sched/fair.o In addition, rename @balance to @continue_balancing in order to represent its purpose more clearly. Signed-off-by: Joonsoo Kim [ s/should_balance/continue_balancing/g ] Reviewed-by: Paul Turner [ Made style changes and a fix in should_we_balance(). ] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1375778203-31343-3-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 107 ++++++++++++++++++++++---------------------- 1 file changed, 53 insertions(+), 54 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8aa217f62a9e..9a6daf86a76a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4463,22 +4463,17 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) * @group: sched_group whose statistics are to be updated. * @load_idx: Load index of sched_domain of this_cpu for load calc. * @local_group: Does group contain this_cpu. - * @balance: Should we balance. * @sgs: variable to hold the statistics for this group. */ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, - int local_group, int *balance, struct sg_lb_stats *sgs) + int local_group, struct sg_lb_stats *sgs) { unsigned long nr_running, max_nr_running, min_nr_running; unsigned long load, max_cpu_load, min_cpu_load; - unsigned int balance_cpu = -1, first_idle_cpu = 0; unsigned long avg_load_per_task = 0; int i; - if (local_group) - balance_cpu = group_balance_cpu(group); - /* Tally up the load of all CPUs in the group */ max_cpu_load = 0; min_cpu_load = ~0UL; @@ -4492,12 +4487,6 @@ static inline void update_sg_lb_stats(struct lb_env *env, /* Bias balancing toward cpus of our domain */ if (local_group) { - if (idle_cpu(i) && !first_idle_cpu && - cpumask_test_cpu(i, sched_group_mask(group))) { - first_idle_cpu = 1; - balance_cpu = i; - } - load = target_load(i, load_idx); } else { load = source_load(i, load_idx); @@ -4519,22 +4508,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->idle_cpus++; } - /* - * First idle cpu or the first cpu(busiest) in this sched group - * is eligible for doing load balancing at this and above - * domains. In the newly idle case, we will allow all the cpu's - * to do the newly idle load balance. - */ - if (local_group) { - if (env->idle != CPU_NEWLY_IDLE) { - if (balance_cpu != env->dst_cpu) { - *balance = 0; - return; - } - update_group_power(env->sd, env->dst_cpu); - } else if (time_after_eq(jiffies, group->sgp->next_update)) - update_group_power(env->sd, env->dst_cpu); - } + if (local_group && (env->idle != CPU_NEWLY_IDLE || + time_after_eq(jiffies, group->sgp->next_update))) + update_group_power(env->sd, env->dst_cpu); /* Adjust by relative CPU power of the group */ sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power; @@ -4613,7 +4589,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, * @sds: variable to hold the statistics for this sched_domain. */ static inline void update_sd_lb_stats(struct lb_env *env, - int *balance, struct sd_lb_stats *sds) + struct sd_lb_stats *sds) { struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; @@ -4630,10 +4606,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); memset(&sgs, 0, sizeof(sgs)); - update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs); - - if (local_group && !(*balance)) - return; + update_sg_lb_stats(env, sg, load_idx, local_group, &sgs); sds->total_load += sgs.group_load; sds->total_pwr += sg->sgp->power; @@ -4866,8 +4839,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * to restore balance. * * @env: The load balancing environment. - * @balance: Pointer to a variable indicating if this_cpu - * is the appropriate cpu to perform load balancing at this_level. * * Returns: - the busiest group if imbalance exists. * - If no imbalance and user has opted for power-savings balance, @@ -4875,7 +4846,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * put to idle by rebalancing its tasks onto our group. */ static struct sched_group * -find_busiest_group(struct lb_env *env, int *balance) +find_busiest_group(struct lb_env *env) { struct sd_lb_stats sds; @@ -4885,14 +4856,7 @@ find_busiest_group(struct lb_env *env, int *balance) * Compute the various statistics relavent for load balancing at * this level. */ - update_sd_lb_stats(env, balance, &sds); - - /* - * this_cpu is not the appropriate cpu to perform load balancing at - * this level. - */ - if (!(*balance)) - goto ret; + update_sd_lb_stats(env, &sds); if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && check_asym_packing(env, &sds)) @@ -4956,7 +4920,6 @@ force_balance: return sds.busiest; out_balanced: -ret: env->imbalance = 0; return NULL; } @@ -5043,13 +5006,47 @@ static int need_active_balance(struct lb_env *env) static int active_load_balance_cpu_stop(void *data); +static int should_we_balance(struct lb_env *env) +{ + struct sched_group *sg = env->sd->groups; + struct cpumask *sg_cpus, *sg_mask; + int cpu, balance_cpu = -1; + + /* + * In the newly idle case, we will allow all the cpu's + * to do the newly idle load balance. + */ + if (env->idle == CPU_NEWLY_IDLE) + return 1; + + sg_cpus = sched_group_cpus(sg); + sg_mask = sched_group_mask(sg); + /* Try to find first idle cpu */ + for_each_cpu_and(cpu, sg_cpus, env->cpus) { + if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu)) + continue; + + balance_cpu = cpu; + break; + } + + if (balance_cpu == -1) + balance_cpu = group_balance_cpu(sg); + + /* + * First idle cpu or the first cpu(busiest) in this sched group + * is eligible for doing load balancing at this and above domains. + */ + return balance_cpu != env->dst_cpu; +} + /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. */ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, - int *balance) + int *continue_balancing) { int ld_moved, cur_ld_moved, active_balance = 0; struct sched_group *group; @@ -5079,11 +5076,12 @@ static int load_balance(int this_cpu, struct rq *this_rq, schedstat_inc(sd, lb_count[idle]); redo: - group = find_busiest_group(&env, balance); - - if (*balance == 0) + if (!should_we_balance(&env)) { + *continue_balancing = 0; goto out_balanced; + } + group = find_busiest_group(&env); if (!group) { schedstat_inc(sd, lb_nobusyg[idle]); goto out_balanced; @@ -5296,7 +5294,7 @@ void idle_balance(int this_cpu, struct rq *this_rq) rcu_read_lock(); for_each_domain(this_cpu, sd) { unsigned long interval; - int balance = 1; + int continue_balancing = 1; if (!(sd->flags & SD_LOAD_BALANCE)) continue; @@ -5304,7 +5302,8 @@ void idle_balance(int this_cpu, struct rq *this_rq) if (sd->flags & SD_BALANCE_NEWIDLE) { /* If we've pulled tasks over stop searching: */ pulled_task = load_balance(this_cpu, this_rq, - sd, CPU_NEWLY_IDLE, &balance); + sd, CPU_NEWLY_IDLE, + &continue_balancing); } interval = msecs_to_jiffies(sd->balance_interval); @@ -5542,7 +5541,7 @@ void update_max_interval(void) */ static void rebalance_domains(int cpu, enum cpu_idle_type idle) { - int balance = 1; + int continue_balancing = 1; struct rq *rq = cpu_rq(cpu); unsigned long interval; struct sched_domain *sd; @@ -5574,7 +5573,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) } if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(cpu, rq, sd, idle, &balance)) { + if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { /* * The LBF_SOME_PINNED logic could have changed * env->dst_cpu, so we can't know our idle @@ -5597,7 +5596,7 @@ out: * CPU in our sched group which is doing load balancing more * actively. */ - if (!balance) + if (!continue_balancing) break; } rcu_read_unlock(); From 56cf515b4b1567c4e8fa9926175b40c66b9ec472 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 6 Aug 2013 17:36:43 +0900 Subject: [PATCH 10/16] sched: Clean-up struct sd_lb_stat There is no reason to maintain separate variables for this_group and busiest_group in sd_lb_stat, except saving some space. But this structure is always allocated in stack, so this saving isn't really benificial [peterz: reducing stack space is good; in this case readability increases enough that I think its still beneficial] This patch unify these variables, so IMO, readability may be improved. Signed-off-by: Joonsoo Kim [ Rename this to local -- avoids confusion between this_cpu and the C++ this pointer. ] Reviewed-by: Paul Turner [ Lots of style edits, a few fixes and a rename. ] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1375778203-31343-4-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 229 ++++++++++++++++++++++---------------------- 1 file changed, 114 insertions(+), 115 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9a6daf86a76a..2da80a55827b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4231,36 +4231,6 @@ static unsigned long task_h_load(struct task_struct *p) #endif /********** Helpers for find_busiest_group ************************/ -/* - * sd_lb_stats - Structure to store the statistics of a sched_domain - * during load balancing. - */ -struct sd_lb_stats { - struct sched_group *busiest; /* Busiest group in this sd */ - struct sched_group *this; /* Local group in this sd */ - unsigned long total_load; /* Total load of all groups in sd */ - unsigned long total_pwr; /* Total power of all groups in sd */ - unsigned long avg_load; /* Average load across all groups in sd */ - - /** Statistics of this group */ - unsigned long this_load; - unsigned long this_load_per_task; - unsigned long this_nr_running; - unsigned long this_has_capacity; - unsigned int this_idle_cpus; - - /* Statistics of the busiest group */ - unsigned int busiest_idle_cpus; - unsigned long max_load; - unsigned long busiest_load_per_task; - unsigned long busiest_nr_running; - unsigned long busiest_group_capacity; - unsigned long busiest_has_capacity; - unsigned int busiest_group_weight; - - int group_imb; /* Is there imbalance in this sd */ -}; - /* * sg_lb_stats - stats of a sched_group required for load_balancing */ @@ -4269,6 +4239,7 @@ struct sg_lb_stats { unsigned long group_load; /* Total load over the CPUs of the group */ unsigned long sum_nr_running; /* Nr tasks running in the group */ unsigned long sum_weighted_load; /* Weighted load of group's tasks */ + unsigned long load_per_task; unsigned long group_capacity; unsigned long idle_cpus; unsigned long group_weight; @@ -4276,6 +4247,21 @@ struct sg_lb_stats { int group_has_capacity; /* Is there extra capacity in the group? */ }; +/* + * sd_lb_stats - Structure to store the statistics of a sched_domain + * during load balancing. + */ +struct sd_lb_stats { + struct sched_group *busiest; /* Busiest group in this sd */ + struct sched_group *local; /* Local group in this sd */ + unsigned long total_load; /* Total load of all groups in sd */ + unsigned long total_pwr; /* Total power of all groups in sd */ + unsigned long avg_load; /* Average load across all groups in sd */ + + struct sg_lb_stats local_stat; /* Statistics of the local group */ + struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ +}; + /** * get_sd_load_idx - Obtain the load index for a given sched domain. * @sd: The sched_domain whose load_idx is to be obtained. @@ -4490,6 +4476,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, load = target_load(i, load_idx); } else { load = source_load(i, load_idx); + if (load > max_cpu_load) max_cpu_load = load; if (min_cpu_load > load) @@ -4531,10 +4518,12 @@ static inline void update_sg_lb_stats(struct lb_env *env, (max_nr_running - min_nr_running) > 1) sgs->group_imb = 1; - sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, - SCHED_POWER_SCALE); + sgs->group_capacity = + DIV_ROUND_CLOSEST(group->sgp->power, SCHED_POWER_SCALE); + if (!sgs->group_capacity) sgs->group_capacity = fix_small_capacity(env->sd, group); + sgs->group_weight = group->group_weight; if (sgs->group_capacity > sgs->sum_nr_running) @@ -4556,7 +4545,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, struct sched_group *sg, struct sg_lb_stats *sgs) { - if (sgs->avg_load <= sds->max_load) + if (sgs->avg_load <= sds->busiest_stat.avg_load) return false; if (sgs->sum_nr_running > sgs->group_capacity) @@ -4593,7 +4582,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, { struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; - struct sg_lb_stats sgs; + struct sg_lb_stats tmp_sgs; int load_idx, prefer_sibling = 0; if (child && child->flags & SD_PREFER_SIBLING) @@ -4602,14 +4591,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, load_idx = get_sd_load_idx(env->sd, env->idle); do { + struct sg_lb_stats *sgs = &tmp_sgs; int local_group; local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); - memset(&sgs, 0, sizeof(sgs)); - update_sg_lb_stats(env, sg, load_idx, local_group, &sgs); + if (local_group) { + sds->local = sg; + sgs = &sds->local_stat; + } - sds->total_load += sgs.group_load; - sds->total_pwr += sg->sgp->power; + memset(sgs, 0, sizeof(*sgs)); + update_sg_lb_stats(env, sg, load_idx, local_group, sgs); /* * In case the child domain prefers tasks go to siblings @@ -4621,26 +4613,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, * heaviest group when it is already under-utilized (possible * with a large weight task outweighs the tasks on the system). */ - if (prefer_sibling && !local_group && sds->this_has_capacity) - sgs.group_capacity = min(sgs.group_capacity, 1UL); + if (prefer_sibling && !local_group && + sds->local && sds->local_stat.group_has_capacity) + sgs->group_capacity = min(sgs->group_capacity, 1UL); - if (local_group) { - sds->this_load = sgs.avg_load; - sds->this = sg; - sds->this_nr_running = sgs.sum_nr_running; - sds->this_load_per_task = sgs.sum_weighted_load; - sds->this_has_capacity = sgs.group_has_capacity; - sds->this_idle_cpus = sgs.idle_cpus; - } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) { - sds->max_load = sgs.avg_load; + /* Now, start updating sd_lb_stats */ + sds->total_load += sgs->group_load; + sds->total_pwr += sg->sgp->power; + + if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; - sds->busiest_nr_running = sgs.sum_nr_running; - sds->busiest_idle_cpus = sgs.idle_cpus; - sds->busiest_group_capacity = sgs.group_capacity; - sds->busiest_load_per_task = sgs.sum_weighted_load; - sds->busiest_has_capacity = sgs.group_has_capacity; - sds->busiest_group_weight = sgs.group_weight; - sds->group_imb = sgs.group_imb; + sds->busiest_stat = *sgs; } sg = sg->next; @@ -4684,8 +4667,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) if (env->dst_cpu > busiest_cpu) return 0; - env->imbalance = DIV_ROUND_CLOSEST( - sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE); + env->imbalance = DIV_ROUND_CLOSEST(sds->busiest_stat.avg_load * + sds->busiest->sgp->power, SCHED_POWER_SCALE); return 1; } @@ -4703,24 +4686,23 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) unsigned long tmp, pwr_now = 0, pwr_move = 0; unsigned int imbn = 2; unsigned long scaled_busy_load_per_task; + struct sg_lb_stats *local, *busiest; - if (sds->this_nr_running) { - sds->this_load_per_task /= sds->this_nr_running; - if (sds->busiest_load_per_task > - sds->this_load_per_task) - imbn = 1; - } else { - sds->this_load_per_task = - cpu_avg_load_per_task(env->dst_cpu); - } + local = &sds->local_stat; + busiest = &sds->busiest_stat; - scaled_busy_load_per_task = sds->busiest_load_per_task - * SCHED_POWER_SCALE; - scaled_busy_load_per_task /= sds->busiest->sgp->power; + if (!local->sum_nr_running) + local->load_per_task = cpu_avg_load_per_task(env->dst_cpu); + else if (busiest->load_per_task > local->load_per_task) + imbn = 1; - if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= - (scaled_busy_load_per_task * imbn)) { - env->imbalance = sds->busiest_load_per_task; + scaled_busy_load_per_task = + (busiest->load_per_task * SCHED_POWER_SCALE) / + sds->busiest->sgp->power; + + if (busiest->avg_load - local->avg_load + scaled_busy_load_per_task >= + (scaled_busy_load_per_task * imbn)) { + env->imbalance = busiest->load_per_task; return; } @@ -4731,33 +4713,36 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) */ pwr_now += sds->busiest->sgp->power * - min(sds->busiest_load_per_task, sds->max_load); - pwr_now += sds->this->sgp->power * - min(sds->this_load_per_task, sds->this_load); + min(busiest->load_per_task, busiest->avg_load); + pwr_now += sds->local->sgp->power * + min(local->load_per_task, local->avg_load); pwr_now /= SCHED_POWER_SCALE; /* Amount of load we'd subtract */ - tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / + tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / sds->busiest->sgp->power; - if (sds->max_load > tmp) + if (busiest->avg_load > tmp) { pwr_move += sds->busiest->sgp->power * - min(sds->busiest_load_per_task, sds->max_load - tmp); + min(busiest->load_per_task, + busiest->avg_load - tmp); + } /* Amount of load we'd add */ - if (sds->max_load * sds->busiest->sgp->power < - sds->busiest_load_per_task * SCHED_POWER_SCALE) - tmp = (sds->max_load * sds->busiest->sgp->power) / - sds->this->sgp->power; - else - tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / - sds->this->sgp->power; - pwr_move += sds->this->sgp->power * - min(sds->this_load_per_task, sds->this_load + tmp); + if (busiest->avg_load * sds->busiest->sgp->power < + busiest->load_per_task * SCHED_POWER_SCALE) { + tmp = (busiest->avg_load * sds->busiest->sgp->power) / + sds->local->sgp->power; + } else { + tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / + sds->local->sgp->power; + } + pwr_move += sds->local->sgp->power * + min(local->load_per_task, local->avg_load + tmp); pwr_move /= SCHED_POWER_SCALE; /* Move if we gain throughput */ if (pwr_move > pwr_now) - env->imbalance = sds->busiest_load_per_task; + env->imbalance = busiest->load_per_task; } /** @@ -4769,11 +4754,22 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) { unsigned long max_pull, load_above_capacity = ~0UL; + struct sg_lb_stats *local, *busiest; - sds->busiest_load_per_task /= sds->busiest_nr_running; - if (sds->group_imb) { - sds->busiest_load_per_task = - min(sds->busiest_load_per_task, sds->avg_load); + local = &sds->local_stat; + if (local->sum_nr_running) { + local->load_per_task = + local->sum_weighted_load / local->sum_nr_running; + } + + busiest = &sds->busiest_stat; + /* busiest must have some tasks */ + busiest->load_per_task = + busiest->sum_weighted_load / busiest->sum_nr_running; + + if (busiest->group_imb) { + busiest->load_per_task = + min(busiest->load_per_task, sds->avg_load); } /* @@ -4781,20 +4777,19 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * max load less than avg load(as we skip the groups at or below * its cpu_power, while calculating max_load..) */ - if (sds->max_load < sds->avg_load) { + if (busiest->avg_load < sds->avg_load) { env->imbalance = 0; return fix_small_imbalance(env, sds); } - if (!sds->group_imb) { + if (!busiest->group_imb) { /* * Don't want to pull so many tasks that a group would go idle. */ - load_above_capacity = (sds->busiest_nr_running - - sds->busiest_group_capacity); + load_above_capacity = + (busiest->sum_nr_running - busiest->group_capacity); load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); - load_above_capacity /= sds->busiest->sgp->power; } @@ -4808,12 +4803,14 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * Be careful of negative numbers as they'll appear as very large values * with unsigned longs. */ - max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); + max_pull = min(busiest->avg_load - sds->avg_load, + load_above_capacity); /* How much load to actually move to equalise the imbalance */ - env->imbalance = min(max_pull * sds->busiest->sgp->power, - (sds->avg_load - sds->this_load) * sds->this->sgp->power) - / SCHED_POWER_SCALE; + env->imbalance = min( + max_pull * sds->busiest->sgp->power, + (sds->avg_load - local->avg_load) * sds->local->sgp->power + ) / SCHED_POWER_SCALE; /* * if *imbalance is less than the average load per runnable task @@ -4821,9 +4818,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * a think about bumping its value to force at least one task to be * moved */ - if (env->imbalance < sds->busiest_load_per_task) + if (env->imbalance < busiest->load_per_task) return fix_small_imbalance(env, sds); - } /******* find_busiest_group() helpers end here *********************/ @@ -4845,9 +4841,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * return the least loaded group whose CPUs can be * put to idle by rebalancing its tasks onto our group. */ -static struct sched_group * -find_busiest_group(struct lb_env *env) +static struct sched_group *find_busiest_group(struct lb_env *env) { + struct sg_lb_stats *local, *busiest; struct sd_lb_stats sds; memset(&sds, 0, sizeof(sds)); @@ -4857,13 +4853,15 @@ find_busiest_group(struct lb_env *env) * this level. */ update_sd_lb_stats(env, &sds); + local = &sds.local_stat; + busiest = &sds.busiest_stat; if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && check_asym_packing(env, &sds)) return sds.busiest; /* There is no busy sibling group to pull tasks from */ - if (!sds.busiest || sds.busiest_nr_running == 0) + if (!sds.busiest || busiest->sum_nr_running == 0) goto out_balanced; sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; @@ -4873,26 +4871,26 @@ find_busiest_group(struct lb_env *env) * work because they assumes all things are equal, which typically * isn't true due to cpus_allowed constraints and the like. */ - if (sds.group_imb) + if (busiest->group_imb) goto force_balance; /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ - if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity && - !sds.busiest_has_capacity) + if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity && + !busiest->group_has_capacity) goto force_balance; /* * If the local group is more busy than the selected busiest group * don't try and pull any tasks. */ - if (sds.this_load >= sds.max_load) + if (local->avg_load >= busiest->avg_load) goto out_balanced; /* * Don't pull any tasks if this group is already above the domain * average load. */ - if (sds.this_load >= sds.avg_load) + if (local->avg_load >= sds.avg_load) goto out_balanced; if (env->idle == CPU_IDLE) { @@ -4902,15 +4900,16 @@ find_busiest_group(struct lb_env *env) * there is no imbalance between this and busiest group * wrt to idle cpu's, it is balanced. */ - if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && - sds.busiest_nr_running <= sds.busiest_group_weight) + if ((local->idle_cpus < busiest->idle_cpus) && + busiest->sum_nr_running <= busiest->group_weight) goto out_balanced; } else { /* * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use * imbalance_pct to be conservative. */ - if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load) + if (100 * busiest->avg_load <= + env->sd->imbalance_pct * local->avg_load) goto out_balanced; } From 147c5fc2bad780d8093b547f2baa204e78107faf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 19 Aug 2013 15:22:57 +0200 Subject: [PATCH 11/16] sched/fair: Shrink sg_lb_stats and play memset games We can shrink sg_lb_stats because rq::nr_running is an unsigned int and cpu numbers are 'int' Before: sgs: /* size: 72, cachelines: 2, members: 10 */ sds: /* size: 184, cachelines: 3, members: 7 */ After: sgs: /* size: 56, cachelines: 1, members: 10 */ sds: /* size: 152, cachelines: 3, members: 7 */ Further we can avoid clearing all of sds since we do a total clear/assignment of sg_stats in update_sg_lb_stats() with exception of busiest_stat.avg_load which is referenced in update_sd_pick_busiest(). Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-0klzmz9okll8wc0nsudguc9p@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2da80a55827b..4c6a8a5a789a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4237,12 +4237,12 @@ static unsigned long task_h_load(struct task_struct *p) struct sg_lb_stats { unsigned long avg_load; /*Avg load across the CPUs of the group */ unsigned long group_load; /* Total load over the CPUs of the group */ - unsigned long sum_nr_running; /* Nr tasks running in the group */ unsigned long sum_weighted_load; /* Weighted load of group's tasks */ unsigned long load_per_task; - unsigned long group_capacity; - unsigned long idle_cpus; - unsigned long group_weight; + unsigned int sum_nr_running; /* Nr tasks running in the group */ + unsigned int group_capacity; + unsigned int idle_cpus; + unsigned int group_weight; int group_imb; /* Is there an imbalance in the group ? */ int group_has_capacity; /* Is there extra capacity in the group? */ }; @@ -4258,10 +4258,29 @@ struct sd_lb_stats { unsigned long total_pwr; /* Total power of all groups in sd */ unsigned long avg_load; /* Average load across all groups in sd */ - struct sg_lb_stats local_stat; /* Statistics of the local group */ struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ + struct sg_lb_stats local_stat; /* Statistics of the local group */ }; +static inline void init_sd_lb_stats(struct sd_lb_stats *sds) +{ + /* + * Skimp on the clearing to avoid duplicate work. We can avoid clearing + * local_stat because update_sg_lb_stats() does a full clear/assignment. + * We must however clear busiest_stat::avg_load because + * update_sd_pick_busiest() reads this before assignment. + */ + *sds = (struct sd_lb_stats){ + .busiest = NULL, + .local = NULL, + .total_load = 0UL, + .total_pwr = 0UL, + .busiest_stat = { + .avg_load = 0UL, + }, + }; +} + /** * get_sd_load_idx - Obtain the load index for a given sched domain. * @sd: The sched_domain whose load_idx is to be obtained. @@ -4615,7 +4634,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, */ if (prefer_sibling && !local_group && sds->local && sds->local_stat.group_has_capacity) - sgs->group_capacity = min(sgs->group_capacity, 1UL); + sgs->group_capacity = min(sgs->group_capacity, 1U); /* Now, start updating sd_lb_stats */ sds->total_load += sgs->group_load; @@ -4846,7 +4865,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) struct sg_lb_stats *local, *busiest; struct sd_lb_stats sds; - memset(&sds, 0, sizeof(sds)); + init_sd_lb_stats(&sds); /* * Compute the various statistics relavent for load balancing at From 38d0f7708543bcfa03d5ee55e8346f801b4a59c9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Aug 2013 19:47:56 +0200 Subject: [PATCH 12/16] sched/fair: Remove duplicate load_per_task computations Since we already compute (but don't store) the sgs load_per_task value in update_sg_lb_stats() we might as well store it and not re-compute it later on. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-ym1vmljiwbzgdnnrwp9azftq@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4c6a8a5a789a..57952198b01e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4476,7 +4476,6 @@ static inline void update_sg_lb_stats(struct lb_env *env, { unsigned long nr_running, max_nr_running, min_nr_running; unsigned long load, max_cpu_load, min_cpu_load; - unsigned long avg_load_per_task = 0; int i; /* Tally up the load of all CPUs in the group */ @@ -4531,9 +4530,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, * the hierarchy? */ if (sgs->sum_nr_running) - avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; + sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; - if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && + if ((max_cpu_load - min_cpu_load) >= sgs->load_per_task && (max_nr_running - min_nr_running) > 1) sgs->group_imb = 1; @@ -4776,15 +4775,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s struct sg_lb_stats *local, *busiest; local = &sds->local_stat; - if (local->sum_nr_running) { - local->load_per_task = - local->sum_weighted_load / local->sum_nr_running; - } - busiest = &sds->busiest_stat; - /* busiest must have some tasks */ - busiest->load_per_task = - busiest->sum_weighted_load / busiest->sum_nr_running; if (busiest->group_imb) { busiest->load_per_task = From 3ae11c90fd055ba1b1b03a014f851b395bdd26ff Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Aug 2013 20:37:48 +0200 Subject: [PATCH 13/16] sched/fair: Make group power more consistent For easier access, less dereferences and more consistent value, store the group power in update_sg_lb_stats() and use it thereafter. The actual value in sched_group::sched_group_power::power can change throughout the load-balance pass if we're unlucky. Reviewed-by: Preeti U Murthy Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-739xxqkyvftrhnh9ncudutc7@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 57952198b01e..ccf20e76b6b2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4239,6 +4239,7 @@ struct sg_lb_stats { unsigned long group_load; /* Total load over the CPUs of the group */ unsigned long sum_weighted_load; /* Weighted load of group's tasks */ unsigned long load_per_task; + unsigned long group_power; unsigned int sum_nr_running; /* Nr tasks running in the group */ unsigned int group_capacity; unsigned int idle_cpus; @@ -4518,7 +4519,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, update_group_power(env->sd, env->dst_cpu); /* Adjust by relative CPU power of the group */ - sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power; + sgs->group_power = group->sgp->power; + sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; /* * Consider the group unbalanced when the imbalance is larger @@ -4537,7 +4539,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_imb = 1; sgs->group_capacity = - DIV_ROUND_CLOSEST(group->sgp->power, SCHED_POWER_SCALE); + DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE); if (!sgs->group_capacity) sgs->group_capacity = fix_small_capacity(env->sd, group); @@ -4637,7 +4639,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, /* Now, start updating sd_lb_stats */ sds->total_load += sgs->group_load; - sds->total_pwr += sg->sgp->power; + sds->total_pwr += sgs->group_power; if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; @@ -4685,8 +4687,9 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) if (env->dst_cpu > busiest_cpu) return 0; - env->imbalance = DIV_ROUND_CLOSEST(sds->busiest_stat.avg_load * - sds->busiest->sgp->power, SCHED_POWER_SCALE); + env->imbalance = DIV_ROUND_CLOSEST( + sds->busiest_stat.avg_load * sds->busiest_stat.group_power, + SCHED_POWER_SCALE); return 1; } @@ -4716,7 +4719,7 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) scaled_busy_load_per_task = (busiest->load_per_task * SCHED_POWER_SCALE) / - sds->busiest->sgp->power; + busiest->group_power; if (busiest->avg_load - local->avg_load + scaled_busy_load_per_task >= (scaled_busy_load_per_task * imbn)) { @@ -4730,32 +4733,32 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) * moving them. */ - pwr_now += sds->busiest->sgp->power * + pwr_now += busiest->group_power * min(busiest->load_per_task, busiest->avg_load); - pwr_now += sds->local->sgp->power * + pwr_now += local->group_power * min(local->load_per_task, local->avg_load); pwr_now /= SCHED_POWER_SCALE; /* Amount of load we'd subtract */ tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / - sds->busiest->sgp->power; + busiest->group_power; if (busiest->avg_load > tmp) { - pwr_move += sds->busiest->sgp->power * + pwr_move += busiest->group_power * min(busiest->load_per_task, busiest->avg_load - tmp); } /* Amount of load we'd add */ - if (busiest->avg_load * sds->busiest->sgp->power < + if (busiest->avg_load * busiest->group_power < busiest->load_per_task * SCHED_POWER_SCALE) { - tmp = (busiest->avg_load * sds->busiest->sgp->power) / - sds->local->sgp->power; + tmp = (busiest->avg_load * busiest->group_power) / + local->group_power; } else { tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / - sds->local->sgp->power; + local->group_power; } - pwr_move += sds->local->sgp->power * - min(local->load_per_task, local->avg_load + tmp); + pwr_move += local->group_power * + min(local->load_per_task, local->avg_load + tmp); pwr_move /= SCHED_POWER_SCALE; /* Move if we gain throughput */ @@ -4800,7 +4803,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s (busiest->sum_nr_running - busiest->group_capacity); load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); - load_above_capacity /= sds->busiest->sgp->power; + load_above_capacity /= busiest->group_power; } /* @@ -4818,8 +4821,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /* How much load to actually move to equalise the imbalance */ env->imbalance = min( - max_pull * sds->busiest->sgp->power, - (sds->avg_load - local->avg_load) * sds->local->sgp->power + max_pull * busiest->group_power, + (sds->avg_load - local->avg_load) * local->group_power ) / SCHED_POWER_SCALE; /* From 6906a40839198f33dbb56d20e644c01e00663952 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 19 Aug 2013 15:20:21 +0200 Subject: [PATCH 14/16] sched/fair: Optimize find_busiest_queue() Use for_each_cpu_and() and thereby avoid computing the capacity for CPUs we know we're not interested in. Reviewed-by: Paul Turner Reviewed-by: Preeti U Murthy Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-lppceyv6kb3a19g8spmrn20b@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ccf20e76b6b2..bedd30b168a5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4946,7 +4946,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, unsigned long busiest_load = 0, busiest_power = 1; int i; - for_each_cpu(i, sched_group_cpus(group)) { + for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { unsigned long power = power_of(i); unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); @@ -4955,9 +4955,6 @@ static struct rq *find_busiest_queue(struct lb_env *env, if (!capacity) capacity = fix_small_capacity(env->sd, group); - if (!cpumask_test_cpu(i, env->cpus)) - continue; - rq = cpu_rq(i); wl = weighted_cpuload(i); From 30ce5dabc92b5a349a7d9e9cf499494d230e0691 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Aug 2013 20:29:29 +0200 Subject: [PATCH 15/16] sched/fair: Rework and comment the group_imb code Rik reported some weirdness due to the group_imb code. As a start to looking at it, clean it up a little and add a few explanatory comments. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-caeeqttnla4wrrmhp5uf89gp@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 123 ++++++++++++++++++++++++++++++++------------ 1 file changed, 89 insertions(+), 34 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bedd30b168a5..dffb27070ddb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4463,6 +4463,81 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) return 0; } +/* + * Group imbalance indicates (and tries to solve) the problem where balancing + * groups is inadequate due to tsk_cpus_allowed() constraints. + * + * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a + * cpumask covering 1 cpu of the first group and 3 cpus of the second group. + * Something like: + * + * { 0 1 2 3 } { 4 5 6 7 } + * * * * * + * + * If we were to balance group-wise we'd place two tasks in the first group and + * two tasks in the second group. Clearly this is undesired as it will overload + * cpu 3 and leave one of the cpus in the second group unused. + * + * The current solution to this issue is detecting the skew in the first group + * by noticing it has a cpu that is overloaded while the remaining cpus are + * idle -- or rather, there's a distinct imbalance in the cpus; see + * sg_imbalanced(). + * + * When this is so detected; this group becomes a candidate for busiest; see + * update_sd_pick_busiest(). And calculcate_imbalance() and + * find_busiest_group() avoid some of the usual balance conditional to allow it + * to create an effective group imbalance. + * + * This is a somewhat tricky proposition since the next run might not find the + * group imbalance and decide the groups need to be balanced again. A most + * subtle and fragile situation. + */ + +struct sg_imb_stats { + unsigned long max_nr_running, min_nr_running; + unsigned long max_cpu_load, min_cpu_load; +}; + +static inline void init_sg_imb_stats(struct sg_imb_stats *sgi) +{ + sgi->max_cpu_load = sgi->max_nr_running = 0UL; + sgi->min_cpu_load = sgi->min_nr_running = ~0UL; +} + +static inline void +update_sg_imb_stats(struct sg_imb_stats *sgi, + unsigned long load, unsigned long nr_running) +{ + if (load > sgi->max_cpu_load) + sgi->max_cpu_load = load; + if (sgi->min_cpu_load > load) + sgi->min_cpu_load = load; + + if (nr_running > sgi->max_nr_running) + sgi->max_nr_running = nr_running; + if (sgi->min_nr_running > nr_running) + sgi->min_nr_running = nr_running; +} + +static inline int +sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi) +{ + /* + * Consider the group unbalanced when the imbalance is larger + * than the average weight of a task. + * + * APZ: with cgroup the avg task weight can vary wildly and + * might not be a suitable number - should we keep a + * normalized nr_running number somewhere that negates + * the hierarchy? + */ + if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task && + (sgi->max_nr_running - sgi->min_nr_running) > 1) + return 1; + + return 0; +} + /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. @@ -4475,15 +4550,12 @@ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, int local_group, struct sg_lb_stats *sgs) { - unsigned long nr_running, max_nr_running, min_nr_running; - unsigned long load, max_cpu_load, min_cpu_load; + struct sg_imb_stats sgi; + unsigned long nr_running; + unsigned long load; int i; - /* Tally up the load of all CPUs in the group */ - max_cpu_load = 0; - min_cpu_load = ~0UL; - max_nr_running = 0; - min_nr_running = ~0UL; + init_sg_imb_stats(&sgi); for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { struct rq *rq = cpu_rq(i); @@ -4495,16 +4567,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, load = target_load(i, load_idx); } else { load = source_load(i, load_idx); - - if (load > max_cpu_load) - max_cpu_load = load; - if (min_cpu_load > load) - min_cpu_load = load; - - if (nr_running > max_nr_running) - max_nr_running = nr_running; - if (min_nr_running > nr_running) - min_nr_running = nr_running; + update_sg_imb_stats(&sgi, load, nr_running); } sgs->group_load += load; @@ -4522,21 +4585,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_power = group->sgp->power; sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; - /* - * Consider the group unbalanced when the imbalance is larger - * than the average weight of a task. - * - * APZ: with cgroup the avg task weight can vary wildly and - * might not be a suitable number - should we keep a - * normalized nr_running number somewhere that negates - * the hierarchy? - */ if (sgs->sum_nr_running) sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; - if ((max_cpu_load - min_cpu_load) >= sgs->load_per_task && - (max_nr_running - min_nr_running) > 1) - sgs->group_imb = 1; + sgs->group_imb = sg_imbalanced(sgs, &sgi); sgs->group_capacity = DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE); @@ -4781,6 +4833,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s busiest = &sds->busiest_stat; if (busiest->group_imb) { + /* + * In the group_imb case we cannot rely on group-wide averages + * to ensure cpu-load equilibrium, look at wider averages. XXX + */ busiest->load_per_task = min(busiest->load_per_task, sds->avg_load); } @@ -4798,6 +4854,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s if (!busiest->group_imb) { /* * Don't want to pull so many tasks that a group would go idle. + * Except of course for the group_imb case, since then we might + * have to drop below capacity to reach cpu-load equilibrium. */ load_above_capacity = (busiest->sum_nr_running - busiest->group_capacity); @@ -4813,11 +4871,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * we also don't want to reduce the group load below the group capacity * (so that we can implement power-savings policies etc). Thus we look * for the minimum possible imbalance. - * Be careful of negative numbers as they'll appear as very large values - * with unsigned longs. */ - max_pull = min(busiest->avg_load - sds->avg_load, - load_above_capacity); + max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity); /* How much load to actually move to equalise the imbalance */ env->imbalance = min( @@ -4881,7 +4936,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) /* * If the busiest group is imbalanced the below checks don't - * work because they assumes all things are equal, which typically + * work because they assume all things are equal, which typically * isn't true due to cpus_allowed constraints and the like. */ if (busiest->group_imb) From 10866e62e8a6907d9072f10f9a0561db0c0cf50b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 19 Aug 2013 16:57:04 +0200 Subject: [PATCH 16/16] sched/fair: Fix the sd_parent_degenerate() code I found that on my WSM box I had a redundant domain: [ 0.949769] CPU0 attaching sched-domain: [ 0.953765] domain 0: span 0,12 level SIBLING [ 0.958335] groups: 0 (cpu_power = 587) 12 (cpu_power = 588) [ 0.964548] domain 1: span 0-5,12-17 level MC [ 0.969206] groups: 0,12 (cpu_power = 1175) 1,13 (cpu_power = 1176) 2,14 (cpu_power = 1176) 3,15 (cpu_power = 1176) 4,16 (cpu_power = 1176) 5,17 (cpu_power = 1176) [ 0.984993] domain 2: span 0-5,12-17 level CPU [ 0.989822] groups: 0-5,12-17 (cpu_power = 7055) [ 0.995049] domain 3: span 0-23 level NUMA [ 0.999620] groups: 0-5,12-17 (cpu_power = 7055) 6-11,18-23 (cpu_power = 7056) Note how domain 2 has only a single group and spans the same CPUs as domain 1. We should not keep such domains and do in fact have code to prune these. It turns out that the 'new' SD_PREFER_SIBLING flag causes this, it makes sd_parent_degenerate() fail on the CPU domain. We can easily fix this by 'ignoring' the SD_PREFER_SIBLING bit and transfering it to whatever domain ends up covering the span. With this patch the domains now look like this: [ 0.950419] CPU0 attaching sched-domain: [ 0.954454] domain 0: span 0,12 level SIBLING [ 0.959039] groups: 0 (cpu_power = 587) 12 (cpu_power = 588) [ 0.965271] domain 1: span 0-5,12-17 level MC [ 0.969936] groups: 0,12 (cpu_power = 1175) 1,13 (cpu_power = 1176) 2,14 (cpu_power = 1176) 3,15 (cpu_power = 1176) 4,16 (cpu_power = 1176) 5,17 (cpu_power = 1176) [ 0.985737] domain 2: span 0-23 level NUMA [ 0.990231] groups: 0-5,12-17 (cpu_power = 7055) 6-11,18-23 (cpu_power = 7056) Reviewed-by: Paul Turner Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-ys201g4jwukj0h8xcamakxq1@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cf8f100433e0..4da0f4bb2ca4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4914,7 +4914,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_BALANCE_FORK | SD_BALANCE_EXEC | SD_SHARE_CPUPOWER | - SD_SHARE_PKG_RESOURCES); + SD_SHARE_PKG_RESOURCES | + SD_PREFER_SIBLING); if (nr_node_ids == 1) pflags &= ~SD_SERIALIZE; } @@ -5118,6 +5119,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) tmp->parent = parent->parent; if (parent->parent) parent->parent->child = tmp; + /* + * Transfer SD_PREFER_SIBLING down in case of a + * degenerate parent; the spans match for this + * so the property transfers. + */ + if (parent->flags & SD_PREFER_SIBLING) + tmp->flags |= SD_PREFER_SIBLING; destroy_sched_domain(parent, cpu); } else tmp = tmp->parent;