From cf736ea6f902c26e03895dc7f5ccbc55cdc68e6e Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Tue, 4 Aug 2015 09:33:40 -0700 Subject: [PATCH 1/8] thermal: power_allocator: do not use devm* interfaces The code in question is called outside of standard driver probe()/remove() callbacks and thus will not benefit from use of devm* infrastructure. Signed-off-by: Dmitry Torokhov Signed-off-by: Eduardo Valentin --- drivers/thermal/power_allocator.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/thermal/power_allocator.c b/drivers/thermal/power_allocator.c index 63a448f9d93b..7006860f2f36 100644 --- a/drivers/thermal/power_allocator.c +++ b/drivers/thermal/power_allocator.c @@ -334,7 +334,7 @@ static int allocate_power(struct thermal_zone_device *tz, max_allocatable_power, current_temp, (s32)control_temp - (s32)current_temp); - devm_kfree(&tz->device, req_power); + kfree(req_power); unlock: mutex_unlock(&tz->lock); @@ -426,7 +426,7 @@ static int power_allocator_bind(struct thermal_zone_device *tz) return -EINVAL; } - params = devm_kzalloc(&tz->device, sizeof(*params), GFP_KERNEL); + params = kzalloc(sizeof(*params), GFP_KERNEL); if (!params) return -ENOMEM; @@ -468,14 +468,14 @@ static int power_allocator_bind(struct thermal_zone_device *tz) return 0; free: - devm_kfree(&tz->device, params); + kfree(params); return ret; } static void power_allocator_unbind(struct thermal_zone_device *tz) { dev_dbg(&tz->device, "Unbinding from thermal zone %d\n", tz->id); - devm_kfree(&tz->device, tz->governor_data); + kfree(tz->governor_data); tz->governor_data = NULL; } From 02373d7c69b4270bbab930f8a81b0721be794347 Mon Sep 17 00:00:00 2001 From: Russell King Date: Wed, 12 Aug 2015 15:22:16 +0530 Subject: [PATCH 2/8] thermal: cpu_cooling: fix lockdep problems in cpu_cooling A recent change to the cpu_cooling code introduced an AB-BA deadlock scenario between the cpufreq_policy_notifier_list rwsem and the cooling_cpufreq_lock.
This is caused by cooling_cpufreq_lock being held before the registration/removal of the notifier block (an operation which takes the rwsem), and the notifier code itself which takes the locks in the reverse order: ====================================================== [ INFO: possible circular locking dependency detected ] 3.18.0+ #1453 Not tainted ------------------------------------------------------- rc.local/770 is trying to acquire lock: (cooling_cpufreq_lock){+.+.+.}, at: [] cpufreq_thermal_notifier+0x34/0xfc but task is already holding lock: ((cpufreq_policy_notifier_list).rwsem){++++.+}, at: [] __blocking_notifier_call_chain+0x34/0x68 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 ((cpufreq_policy_notifier_list).rwsem){++++.+}: [] down_write+0x44/0x9c [] blocking_notifier_chain_register+0x28/0xd8 [] cpufreq_register_notifier+0x68/0x90 [] __cpufreq_cooling_register.part.1+0x120/0x180 [] __cpufreq_cooling_register+0x98/0xa4 [] cpufreq_cooling_register+0x18/0x1c [] imx_thermal_probe+0x1c0/0x470 [imx_thermal] [] platform_drv_probe+0x50/0xac [] driver_probe_device+0x114/0x234 [] __driver_attach+0x9c/0xa0 [] bus_for_each_dev+0x5c/0x90 [] driver_attach+0x24/0x28 [] bus_add_driver+0xe0/0x1d8 [] driver_register+0x80/0xfc [] __platform_driver_register+0x50/0x64 [] 0xbf007018 [] do_one_initcall+0x88/0x1d8 [] load_module+0x1768/0x1ef8 [] SyS_init_module+0xe0/0xf4 [] ret_fast_syscall+0x0/0x48 -> #0 (cooling_cpufreq_lock){+.+.+.}: [] lock_acquire+0xb0/0x124 [] mutex_lock_nested+0x5c/0x3d8 [] cpufreq_thermal_notifier+0x34/0xfc [] notifier_call_chain+0x4c/0x8c [] __blocking_notifier_call_chain+0x50/0x68 [] blocking_notifier_call_chain+0x20/0x28 [] cpufreq_set_policy+0x7c/0x1d0 [] store_scaling_governor+0x74/0x9c [] store+0x90/0xc0 [] sysfs_kf_write+0x54/0x58 [] kernfs_fop_write+0xdc/0x190 [] vfs_write+0xac/0x1b4 [] SyS_write+0x44/0x90 [] ret_fast_syscall+0x0/0x48 other info that might help us debug this: Possible 
unsafe locking scenario: CPU0 CPU1 ---- ---- lock((cpufreq_policy_notifier_list).rwsem); lock(cooling_cpufreq_lock); lock((cpufreq_policy_notifier_list).rwsem); lock(cooling_cpufreq_lock); *** DEADLOCK *** 7 locks held by rc.local/770: #0: (sb_writers#6){.+.+.+}, at: [] vfs_write+0x18c/0x1b4 #1: (&of->mutex){+.+.+.}, at: [] kernfs_fop_write+0xa0/0x190 #2: (s_active#52){.+.+.+}, at: [] kernfs_fop_write+0xa8/0x190 #3: (cpu_hotplug.lock){++++++}, at: [] get_online_cpus+0x34/0x90 #4: (cpufreq_rwsem){.+.+.+}, at: [] store+0x58/0xc0 #5: (&policy->rwsem){+.+.+.}, at: [] store+0x70/0xc0 #6: ((cpufreq_policy_notifier_list).rwsem){++++.+}, at: [] __blocking_notifier_call_chain+0x34/0x68 stack backtrace: CPU: 0 PID: 770 Comm: rc.local Not tainted 3.18.0+ #1453 Hardware name: Freescale i.MX6 Quad/DualLite (Device Tree) Backtrace: [] (dump_backtrace) from [] (show_stack+0x18/0x1c) r6:c0b85a80 r5:c0b75630 r4:00000000 r3:00000000 [] (show_stack) from [] (dump_stack+0x7c/0x98) [] (dump_stack) from [] (print_circular_bug+0x28c/0x2d8) r4:c0b85a80 r3:d0071d40 [] (print_circular_bug) from [] (__lock_acquire+0x1acc/0x1bb0) r10:c0b50660 r8:c09e6d80 r7:d0071d40 r6:c11d0f0c r5:00000007 r4:d0072240 [] (__lock_acquire) from [] (lock_acquire+0xb0/0x124) r10:00000000 r9:c04abfc4 r8:00000000 r7:00000000 r6:00000000 r5:c0a06f0c r4:00000000 [] (lock_acquire) from [] (mutex_lock_nested+0x5c/0x3d8) r10:ec853800 r9:c0a06ed4 r8:d0071d40 r7:c0a06ed4 r6:c11d0f0c r5:00000000 r4:c04abfc4 [] (mutex_lock_nested) from [] (cpufreq_thermal_notifier+0x34/0xfc) r10:ec853800 r9:ec85380c r8:d00d7d3c r7:c0a06ed4 r6:d00d7d3c r5:00000000 r4:fffffffe [] (cpufreq_thermal_notifier) from [] (notifier_call_chain+0x4c/0x8c) r7:00000000 r6:00000000 r5:00000000 r4:fffffffe [] (notifier_call_chain) from [] (__blocking_notifier_call_chain+0x50/0x68) r8:c0a072a4 r7:00000000 r6:d00d7d3c r5:ffffffff r4:c0a06fc8 r3:ffffffff [] (__blocking_notifier_call_chain) from [] (blocking_notifier_call_chain+0x20/0x28) r7:ec98b540 
r6:c13ebc80 r5:ed76e600 r4:d00d7d3c [] (blocking_notifier_call_chain) from [] (cpufreq_set_policy+0x7c/0x1d0) [] (cpufreq_set_policy) from [] (store_scaling_governor+0x74/0x9c) r7:ec98b540 r6:0000000c r5:ec98b540 r4:ed76e600 [] (store_scaling_governor) from [] (store+0x90/0xc0) r6:0000000c r5:ed76e6d4 r4:ed76e600 [] (store) from [] (sysfs_kf_write+0x54/0x58) r8:0000000c r7:d00d7f78 r6:ec98b540 r5:0000000c r4:ec853800 r3:0000000c [] (sysfs_kf_write) from [] (kernfs_fop_write+0xdc/0x190) r6:ec98b540 r5:00000000 r4:00000000 r3:c0175330 [] (kernfs_fop_write) from [] (vfs_write+0xac/0x1b4) r10:0162aa70 r9:d00d6000 r8:0000000c r7:d00d7f78 r6:0162aa70 r5:0000000c r4:eccde500 [] (vfs_write) from [] (SyS_write+0x44/0x90) r10:0162aa70 r8:0000000c r7:eccde500 r6:eccde500 r5:00000000 r4:00000000 [] (SyS_write) from [] (ret_fast_syscall+0x0/0x48) r10:00000000 r8:c000edc4 r7:00000004 r6:000216cc r5:0000000c r4:0162aa70 Solve this by moving to finer grained locking - use one mutex to protect the cpufreq_dev_list as a whole, and a separate lock to ensure correct ordering of cpufreq notifier registration and removal. cooling_list_lock is taken within cooling_cpufreq_lock on (un)registration to preserve the behavior of the code, i.e. to atomically add/remove to the list and (un)register the notifier. 
Fixes: 2dcd851fe4b4 ("thermal: cpu_cooling: Update always cpufreq policy with thermal constraints") Reviewed-by: Viresh Kumar Signed-off-by: Russell King Signed-off-by: Viresh Kumar Signed-off-by: Eduardo Valentin --- drivers/thermal/cpu_cooling.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/drivers/thermal/cpu_cooling.c b/drivers/thermal/cpu_cooling.c index 6509c61b9648..5ae0524bed19 100644 --- a/drivers/thermal/cpu_cooling.c +++ b/drivers/thermal/cpu_cooling.c @@ -107,6 +107,9 @@ struct cpufreq_cooling_device { static DEFINE_IDR(cpufreq_idr); static DEFINE_MUTEX(cooling_cpufreq_lock); +static unsigned int cpufreq_dev_count; + +static DEFINE_MUTEX(cooling_list_lock); static LIST_HEAD(cpufreq_dev_list); /** @@ -185,14 +188,14 @@ unsigned long cpufreq_cooling_get_level(unsigned int cpu, unsigned int freq) { struct cpufreq_cooling_device *cpufreq_dev; - mutex_lock(&cooling_cpufreq_lock); + mutex_lock(&cooling_list_lock); list_for_each_entry(cpufreq_dev, &cpufreq_dev_list, node) { if (cpumask_test_cpu(cpu, &cpufreq_dev->allowed_cpus)) { - mutex_unlock(&cooling_cpufreq_lock); + mutex_unlock(&cooling_list_lock); return get_level(cpufreq_dev, freq); } } - mutex_unlock(&cooling_cpufreq_lock); + mutex_unlock(&cooling_list_lock); pr_err("%s: cpu:%d not part of any cooling device\n", __func__, cpu); return THERMAL_CSTATE_INVALID; @@ -221,7 +224,7 @@ static int cpufreq_thermal_notifier(struct notifier_block *nb, switch (event) { case CPUFREQ_ADJUST: - mutex_lock(&cooling_cpufreq_lock); + mutex_lock(&cooling_list_lock); list_for_each_entry(cpufreq_dev, &cpufreq_dev_list, node) { if (!cpumask_test_cpu(policy->cpu, &cpufreq_dev->allowed_cpus)) @@ -233,7 +236,7 @@ static int cpufreq_thermal_notifier(struct notifier_block *nb, cpufreq_verify_within_limits(policy, 0, max_freq); } - mutex_unlock(&cooling_cpufreq_lock); + mutex_unlock(&cooling_list_lock); break; default: return NOTIFY_DONE; @@ -866,12 +869,14 @@ __cpufreq_cooling_register(struct
device_node *np, mutex_lock(&cooling_cpufreq_lock); + mutex_lock(&cooling_list_lock); + list_add(&cpufreq_dev->node, &cpufreq_dev_list); + mutex_unlock(&cooling_list_lock); + /* Register the notifier for first cpufreq cooling device */ - if (list_empty(&cpufreq_dev_list)) + if (!cpufreq_dev_count++) cpufreq_register_notifier(&thermal_cpufreq_notifier_block, CPUFREQ_POLICY_NOTIFIER); - list_add(&cpufreq_dev->node, &cpufreq_dev_list); - mutex_unlock(&cooling_cpufreq_lock); return cool_dev; @@ -1013,13 +1018,17 @@ void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev) return; cpufreq_dev = cdev->devdata; - mutex_lock(&cooling_cpufreq_lock); - list_del(&cpufreq_dev->node); /* Unregister the notifier for the last cpufreq cooling device */ - if (list_empty(&cpufreq_dev_list)) + mutex_lock(&cooling_cpufreq_lock); + if (!--cpufreq_dev_count) cpufreq_unregister_notifier(&thermal_cpufreq_notifier_block, CPUFREQ_POLICY_NOTIFIER); + + mutex_lock(&cooling_list_lock); + list_del(&cpufreq_dev->node); + mutex_unlock(&cooling_list_lock); + mutex_unlock(&cooling_cpufreq_lock); thermal_cooling_device_unregister(cpufreq_dev->cool_dev); From 76fd38ce21de506a3867768fac42729eb6d7dedf Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 30 Jul 2015 12:40:30 +0530 Subject: [PATCH 3/8] thermal/cpu_cooling: No need to initialize max_freq to 0 It's always set before being used, so don't initialize it.
Signed-off-by: Viresh Kumar Signed-off-by: Eduardo Valentin --- drivers/thermal/cpu_cooling.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/thermal/cpu_cooling.c b/drivers/thermal/cpu_cooling.c index 5ae0524bed19..c7572dfc927a 100644 --- a/drivers/thermal/cpu_cooling.c +++ b/drivers/thermal/cpu_cooling.c @@ -218,7 +218,7 @@ static int cpufreq_thermal_notifier(struct notifier_block *nb, unsigned long event, void *data) { struct cpufreq_policy *policy = data; - unsigned long max_freq = 0; + unsigned long max_freq; struct cpufreq_cooling_device *cpufreq_dev; switch (event) { From 166529c9b6f91b97d771e2e7ebf748aadb239b44 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 30 Jul 2015 12:40:31 +0530 Subject: [PATCH 4/8] thermal/cpu_cooling: quit early after updating policy If a valid cpufreq_dev is found for policy->cpu, we should update the policy and quit the for loop. There is no need to keep traversing the list of cpufreq_dev's. Signed-off-by: Viresh Kumar Signed-off-by: Eduardo Valentin --- drivers/thermal/cpu_cooling.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/thermal/cpu_cooling.c b/drivers/thermal/cpu_cooling.c index c7572dfc927a..093537f00db3 100644 --- a/drivers/thermal/cpu_cooling.c +++ b/drivers/thermal/cpu_cooling.c @@ -235,6 +235,7 @@ static int cpufreq_thermal_notifier(struct notifier_block *nb, if (policy->max != max_freq) cpufreq_verify_within_limits(policy, 0, max_freq); + break; } mutex_unlock(&cooling_list_lock); break; From a24af233a1fd09002cabc05d6da248cc5656a2e1 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 30 Jul 2015 12:40:32 +0530 Subject: [PATCH 5/8] thermal/cpu_cooling: convert 'switch' block to 'if' block in notifier We just need to take care of a single event here and there is no need to increase the indentation level of most of the code (which causes lines longer than 80 columns to break). Kill the switch block.
Signed-off-by: Viresh Kumar Signed-off-by: Eduardo Valentin --- drivers/thermal/cpu_cooling.c | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/drivers/thermal/cpu_cooling.c b/drivers/thermal/cpu_cooling.c index 093537f00db3..1cf897cd993c 100644 --- a/drivers/thermal/cpu_cooling.c +++ b/drivers/thermal/cpu_cooling.c @@ -221,27 +221,21 @@ static int cpufreq_thermal_notifier(struct notifier_block *nb, unsigned long max_freq; struct cpufreq_cooling_device *cpufreq_dev; - switch (event) { - - case CPUFREQ_ADJUST: - mutex_lock(&cooling_list_lock); - list_for_each_entry(cpufreq_dev, &cpufreq_dev_list, node) { - if (!cpumask_test_cpu(policy->cpu, - &cpufreq_dev->allowed_cpus)) - continue; - - max_freq = cpufreq_dev->cpufreq_val; - - if (policy->max != max_freq) - cpufreq_verify_within_limits(policy, 0, - max_freq); - break; - } - mutex_unlock(&cooling_list_lock); - break; - default: + if (event != CPUFREQ_ADJUST) return NOTIFY_DONE; + + mutex_lock(&cooling_list_lock); + list_for_each_entry(cpufreq_dev, &cpufreq_dev_list, node) { + if (!cpumask_test_cpu(policy->cpu, &cpufreq_dev->allowed_cpus)) + continue; + + max_freq = cpufreq_dev->cpufreq_val; + + if (policy->max != max_freq) + cpufreq_verify_within_limits(policy, 0, max_freq); + break; } + mutex_unlock(&cooling_list_lock); return NOTIFY_OK; } From 59f0d21883f39d27f14408d4ca211dce80658963 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 30 Jul 2015 12:40:33 +0530 Subject: [PATCH 6/8] thermal/cpu_cooling: rename cpufreq_val as clipped_freq That's what it is for, let's name it properly.
Signed-off-by: Viresh Kumar Signed-off-by: Eduardo Valentin --- drivers/thermal/cpu_cooling.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/thermal/cpu_cooling.c b/drivers/thermal/cpu_cooling.c index 1cf897cd993c..9c146229738e 100644 --- a/drivers/thermal/cpu_cooling.c +++ b/drivers/thermal/cpu_cooling.c @@ -68,7 +68,7 @@ struct power_table { * registered cooling device. * @cpufreq_state: integer value representing the current state of cpufreq * cooling devices. - * @cpufreq_val: integer value representing the absolute value of the clipped + * @clipped_freq: integer value representing the absolute value of the clipped * frequency. * @max_level: maximum cooling level. One less than total number of valid * cpufreq frequencies. @@ -91,7 +91,7 @@ struct cpufreq_cooling_device { int id; struct thermal_cooling_device *cool_dev; unsigned int cpufreq_state; - unsigned int cpufreq_val; + unsigned int clipped_freq; unsigned int max_level; unsigned int *freq_table; /* In descending order */ struct cpumask allowed_cpus; @@ -229,7 +229,7 @@ static int cpufreq_thermal_notifier(struct notifier_block *nb, if (!cpumask_test_cpu(policy->cpu, &cpufreq_dev->allowed_cpus)) continue; - max_freq = cpufreq_dev->cpufreq_val; + max_freq = cpufreq_dev->clipped_freq; if (policy->max != max_freq) cpufreq_verify_within_limits(policy, 0, max_freq); @@ -517,7 +517,7 @@ static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev, clip_freq = cpufreq_device->freq_table[state]; cpufreq_device->cpufreq_state = state; - cpufreq_device->cpufreq_val = clip_freq; + cpufreq_device->clipped_freq = clip_freq; cpufreq_update_policy(cpu); @@ -859,7 +859,7 @@ __cpufreq_cooling_register(struct device_node *np, pr_debug("%s: freq:%u KHz\n", __func__, freq); } - cpufreq_dev->cpufreq_val = cpufreq_dev->freq_table[0]; + cpufreq_dev->clipped_freq = cpufreq_dev->freq_table[0]; cpufreq_dev->cool_dev = cool_dev; mutex_lock(&cooling_cpufreq_lock); From 
abcbcc25cb3edfc3c9af210a88c9386e353191fe Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 30 Jul 2015 12:40:34 +0530 Subject: [PATCH 7/8] thermal/cpu_cooling: rename max_freq as clipped_freq in notifier That's what it is for, let's name it properly. Signed-off-by: Viresh Kumar Signed-off-by: Eduardo Valentin --- drivers/thermal/cpu_cooling.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/thermal/cpu_cooling.c b/drivers/thermal/cpu_cooling.c index 9c146229738e..71dbede9edaa 100644 --- a/drivers/thermal/cpu_cooling.c +++ b/drivers/thermal/cpu_cooling.c @@ -218,7 +218,7 @@ static int cpufreq_thermal_notifier(struct notifier_block *nb, unsigned long event, void *data) { struct cpufreq_policy *policy = data; - unsigned long max_freq; + unsigned long clipped_freq; struct cpufreq_cooling_device *cpufreq_dev; if (event != CPUFREQ_ADJUST) @@ -229,10 +229,10 @@ static int cpufreq_thermal_notifier(struct notifier_block *nb, if (!cpumask_test_cpu(policy->cpu, &cpufreq_dev->allowed_cpus)) continue; - max_freq = cpufreq_dev->clipped_freq; + clipped_freq = cpufreq_dev->clipped_freq; - if (policy->max != max_freq) - cpufreq_verify_within_limits(policy, 0, max_freq); + if (policy->max != clipped_freq) + cpufreq_verify_within_limits(policy, 0, clipped_freq); break; } mutex_unlock(&cooling_list_lock); From 1afb9c539daebc2c8a7b33d0e0b8fc9f74671b02 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 30 Jul 2015 12:40:35 +0530 Subject: [PATCH 8/8] thermal/cpu_cooling: update policy limits if clipped_freq < policy->max policy->max is the maximum allowed frequency defined by the user and clipped_freq is the maximum that thermal constraints allow. If clipped_freq is lower than policy->max, then we need to readjust policy->max. But, if clipped_freq is greater than policy->max, we don't need to do anything. We used to call cpufreq_verify_within_limits() in this case, but it doesn't change anything.
Let's skip this unnecessary call and write a comment that explains this. Signed-off-by: Viresh Kumar Signed-off-by: Eduardo Valentin --- drivers/thermal/cpu_cooling.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/thermal/cpu_cooling.c b/drivers/thermal/cpu_cooling.c index 71dbede9edaa..620dcd405ff6 100644 --- a/drivers/thermal/cpu_cooling.c +++ b/drivers/thermal/cpu_cooling.c @@ -229,9 +229,20 @@ static int cpufreq_thermal_notifier(struct notifier_block *nb, if (!cpumask_test_cpu(policy->cpu, &cpufreq_dev->allowed_cpus)) continue; + /* + * policy->max is the maximum allowed frequency defined by user + * and clipped_freq is the maximum that thermal constraints + * allow. + * + * If clipped_freq is lower than policy->max, then we need to + * readjust policy->max. + * + * But, if clipped_freq is greater than policy->max, we don't + * need to do anything. + */ clipped_freq = cpufreq_dev->clipped_freq; - if (policy->max != clipped_freq) + if (policy->max > clipped_freq) cpufreq_verify_within_limits(policy, 0, clipped_freq); break; }