From 8cb68b343a66cf19834472012590490d34d31703 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 2 Feb 2017 16:55:06 +0100 Subject: [PATCH 1/3] sched/core: Fix update_rq_clock() splat on hotplug (and suspend/resume) The hotplug code still triggers the warning about using a stale rq->clock value. Fix things up to actually run update_rq_clock() in a place where we record the 'UPDATED' flag, and then modify the annotation to retain this flag over the rq->lock fiddling that happens as a result of actually migrating all the tasks elsewhere. Reported-by: Linus Torvalds Tested-by: Mike Galbraith Tested-by: Sachin Sant Tested-by: Borislav Petkov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Matt Fleming Cc: Michael Ellerman Cc: Peter Zijlstra Cc: Ross Zwisler Cc: Thomas Gleixner Fixes: 4d25b35ea372 ("sched/fair: Restore previous rq_flags when migrating tasks in hotplug") Link: http://lkml.kernel.org/r/20170202155506.GX6515@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 34e2291a9a6c..2d6e828a0471 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5557,7 +5557,7 @@ static void migrate_tasks(struct rq *dead_rq) { struct rq *rq = dead_rq; struct task_struct *next, *stop = rq->stop; - struct rq_flags rf, old_rf; + struct rq_flags rf; int dest_cpu; /* @@ -5576,7 +5576,9 @@ static void migrate_tasks(struct rq *dead_rq) * class method both need to have an up-to-date * value of rq->clock[_task] */ + rq_pin_lock(rq, &rf); update_rq_clock(rq); + rq_unpin_lock(rq, &rf); for (;;) { /* @@ -5589,7 +5591,7 @@ static void migrate_tasks(struct rq *dead_rq) /* * pick_next_task() assumes pinned rq->lock: */ - rq_pin_lock(rq, &rf); + rq_repin_lock(rq, &rf); next = pick_next_task(rq, &fake_task, &rf); BUG_ON(!next); next->sched_class->put_prev_task(rq, next); @@ -5618,13 +5620,6 @@ static void migrate_tasks(struct rq *dead_rq) continue; } - /* - * __migrate_task() may return with a different - * rq->lock held and a new cookie in 'rf', but we need - * to preserve rf::clock_update_flags for 'dead_rq'. - */ - old_rf = rf; - /* Find suitable destination for @next, with force if needed. */ dest_cpu = select_fallback_rq(dead_rq->cpu, next); @@ -5633,7 +5628,6 @@ static void migrate_tasks(struct rq *dead_rq) raw_spin_unlock(&rq->lock); rq = dead_rq; raw_spin_lock(&rq->lock); - rf = old_rf; } raw_spin_unlock(&next->pi_lock); } From a499c3ead88ccf147fc50689e85a530ad923ce36 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 21 Feb 2017 23:52:55 -0800 Subject: [PATCH 2/3] sched/fair: Update rq clock before changing a task's CPU affinity This is triggered during boot when CONFIG_SCHED_DEBUG is enabled: ------------[ cut here ]------------ WARNING: CPU: 6 PID: 81 at kernel/sched/sched.h:812 set_next_entity+0x11d/0x380 rq->clock_update_flags < RQCF_ACT_SKIP CPU: 6 PID: 81 Comm: torture_shuffle Not tainted 4.10.0+ #1 Hardware name: LENOVO ThinkCentre M8500t-N000/SHARKBAY, BIOS FBKTC1AUS 02/16/2016 Call Trace: dump_stack+0x85/0xc2 __warn+0xcb/0xf0 warn_slowpath_fmt+0x5f/0x80 set_next_entity+0x11d/0x380 set_curr_task_fair+0x2b/0x60 do_set_cpus_allowed+0x139/0x180 __set_cpus_allowed_ptr+0x113/0x260 set_cpus_allowed_ptr+0x10/0x20 torture_shuffle+0xfd/0x180 kthread+0x10f/0x150 ? torture_shutdown_init+0x60/0x60 ? kthread_create_on_node+0x60/0x60 ret_from_fork+0x31/0x40 ---[ end trace dd94d92344cea9c6 ]--- The task is running && !queued, so there is no rq clock update before calling set_curr_task(). This patch fixes it by updating rq clock after holding rq->lock/pi_lock just as what other dequeue + put_prev + enqueue + set_curr story does. Signed-off-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Matt Fleming Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1487749975-5994-1-git-send-email-wanpeng.li@hotmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2d6e828a0471..cc1e3e0d29b0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1087,6 +1087,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, int ret = 0; rq = task_rq_lock(p, &rf); + update_rq_clock(rq); if (p->flags & PF_KTHREAD) { /* From 96b777452d8881480fd5be50112f791c17db4b6b Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 8 Feb 2017 14:27:27 +0300 Subject: [PATCH 3/3] sched/cgroup: Move sched_online_group() back into css_online() to fix crash Commit: 2f5177f0fd7e ("sched/cgroup: Fix/cleanup cgroup teardown/init") .. moved sched_online_group() from css_online() to css_alloc(). It exposes half-baked task group into global lists before initializing generic cgroup stuff. LTP testcase (third in cgroup_regression_test) written for testing similar race in kernels 2.6.26-2.6.28 easily triggers this oops: BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 IP: kernfs_path_from_node_locked+0x260/0x320 CPU: 1 PID: 30346 Comm: cat Not tainted 4.10.0-rc5-test #4 Call Trace: ? kernfs_path_from_node+0x4f/0x60 kernfs_path_from_node+0x3e/0x60 print_rt_rq+0x44/0x2b0 print_rt_stats+0x7a/0xd0 print_cpu+0x2fc/0xe80 ? __might_sleep+0x4a/0x80 sched_debug_show+0x17/0x30 seq_read+0xf2/0x3b0 proc_reg_read+0x42/0x70 __vfs_read+0x28/0x130 ? security_file_permission+0x9b/0xc0 ? rw_verify_area+0x4e/0xb0 vfs_read+0xa5/0x170 SyS_read+0x46/0xa0 entry_SYSCALL_64_fastpath+0x1e/0xad Here the task group is already linked into the global RCU-protected 'task_groups' list, but the css->cgroup pointer is still NULL. This patch reverts this chunk and moves online back to css_online(). Signed-off-by: Konstantin Khlebnikov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Tejun Heo Cc: Thomas Gleixner Fixes: 2f5177f0fd7e ("sched/cgroup: Fix/cleanup cgroup teardown/init") Link: http://lkml.kernel.org/r/148655324740.424917.5302984537258726349.stgit@buzz Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cc1e3e0d29b0..e01bd807f0af 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6811,11 +6811,20 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); - sched_online_group(tg, parent); - return &tg->css; } +/* Expose task group only after completing cgroup initialization */ +static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) +{ + struct task_group *tg = css_tg(css); + struct task_group *parent = css_tg(css->parent); + + if (parent) + sched_online_group(tg, parent); + return 0; +} + static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); @@ -7221,6 +7230,7 @@ static struct cftype cpu_files[] = { struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, + .css_online = cpu_cgroup_css_online, .css_released = cpu_cgroup_css_released, .css_free = cpu_cgroup_css_free, .fork = cpu_cgroup_fork,