From 1c6a662f898ecd1615d25fecb8098ea646720a7a Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 10 Dec 2014 09:45:31 +0530 Subject: [PATCH 1/9] PM / OPP: replace kfree with kfree_rcu while freeing 'struct device_opp' Somehow one of the instance of freeing resources failed to use kfree_rcu() and used kfree() instead. This might cause problems as the node might be referenced by readers under rcu locks and we must wait for the rcu grace period as well. While we are at it, also update comment over 'struct device_opp' to mention why we are waiting for both rcu and srcu grace periods. Fixes: 129eec55df6a (PM / OPP Introduce APIs to remove OPPs) Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/base/power/opp.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/base/power/opp.c b/drivers/base/power/opp.c index 1bbef8e838e7..e1807268cbf2 100644 --- a/drivers/base/power/opp.c +++ b/drivers/base/power/opp.c @@ -84,7 +84,11 @@ struct dev_pm_opp { * * This is an internal data structure maintaining the link to opps attached to * a device. This structure is not meant to be shared to users as it is - * meant for book keeping and private to OPP library + * meant for book keeping and private to OPP library. + * + * Because the opp structures can be used from both rcu and srcu readers, we + * need to wait for the grace period of both of them before freeing any + * resources. And so we have used kfree_rcu() from within call_srcu() handlers. */ struct device_opp { struct list_head node; @@ -511,7 +515,7 @@ static void kfree_device_rcu(struct rcu_head *head) { struct device_opp *device_opp = container_of(head, struct device_opp, rcu_head); - kfree(device_opp); + kfree_rcu(device_opp, rcu_head); } void __dev_pm_opp_remove(struct device_opp *dev_opp, struct dev_pm_opp *opp) From 86453b473b1f68c238a6901b26158b4ca3b369bc Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 10 Dec 2014 09:45:32 +0530 Subject: [PATCH 2/9] PM / OPP: Staticize __dev_pm_opp_remove() Its a local routine and need not be accessible outside of opp.c. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/base/power/opp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/base/power/opp.c b/drivers/base/power/opp.c index e1807268cbf2..fa065d6e1731 100644 --- a/drivers/base/power/opp.c +++ b/drivers/base/power/opp.c @@ -518,7 +518,8 @@ static void kfree_device_rcu(struct rcu_head *head) kfree_rcu(device_opp, rcu_head); } -void __dev_pm_opp_remove(struct device_opp *dev_opp, struct dev_pm_opp *opp) +static void __dev_pm_opp_remove(struct device_opp *dev_opp, + struct dev_pm_opp *opp) { /* * Notify the changes in the availability of the operable From 29df0ee1b14ab5cdc83c225258f42600825f45b2 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 10 Dec 2014 09:45:33 +0530 Subject: [PATCH 3/9] PM / OPP: reuse find_device_opp() instead of duplicating code Reuse find_device_opp() in opp_set_availability() instead of duplicating code. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/base/power/opp.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/base/power/opp.c b/drivers/base/power/opp.c index fa065d6e1731..525ffb202d77 100644 --- a/drivers/base/power/opp.c +++ b/drivers/base/power/opp.c @@ -597,7 +597,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_remove); static int opp_set_availability(struct device *dev, unsigned long freq, bool availability_req) { - struct device_opp *tmp_dev_opp, *dev_opp = ERR_PTR(-ENODEV); + struct device_opp *dev_opp; struct dev_pm_opp *new_opp, *tmp_opp, *opp = ERR_PTR(-ENODEV); int r = 0; @@ -611,12 +611,7 @@ static int opp_set_availability(struct device *dev, unsigned long freq, mutex_lock(&dev_opp_list_lock); /* Find the device_opp */ - list_for_each_entry(tmp_dev_opp, &dev_opp_list, node) { - if (dev == tmp_dev_opp->dev) { - dev_opp = tmp_dev_opp; - break; - } - } + dev_opp = find_device_opp(dev); if (IS_ERR(dev_opp)) { r = PTR_ERR(dev_opp); dev_warn(dev, "%s: Device OPP not found (%d)\n", __func__, r); From 07cce74a7b259cb90029530e9549bf1d9a1b1c34 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 10 Dec 2014 09:45:34 +0530 Subject: [PATCH 4/9] PM / OPP: handle allocation of device_opp in a separate routine Get the 'device_opp' allocation code into a separate routine to keep only the necessary part in dev_pm_opp_add_dynamic(). Also do s/sizeof(struct device_opp)/sizeof(*dev_opp) and remove the print message on kzalloc() failure as checkpatch warns for that. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/base/power/opp.c | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/drivers/base/power/opp.c b/drivers/base/power/opp.c index 525ffb202d77..1150b9d2e012 100644 --- a/drivers/base/power/opp.c +++ b/drivers/base/power/opp.c @@ -386,6 +386,27 @@ struct dev_pm_opp *dev_pm_opp_find_freq_floor(struct device *dev, } EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_floor); +static struct device_opp *add_device_opp(struct device *dev) +{ + struct device_opp *dev_opp; + + /* + * Allocate a new device OPP table. In the infrequent case where a new + * device is needed to be added, we pay this penalty. + */ + dev_opp = kzalloc(sizeof(*dev_opp), GFP_KERNEL); + if (!dev_opp) + return NULL; + + dev_opp->dev = dev; + srcu_init_notifier_head(&dev_opp->srcu_head); + INIT_LIST_HEAD(&dev_opp->opp_list); + + /* Secure the device list modification */ + list_add_rcu(&dev_opp->node, &dev_opp_list); + return dev_opp; +} + static int dev_pm_opp_add_dynamic(struct device *dev, unsigned long freq, unsigned long u_volt, bool dynamic) { @@ -412,27 +433,13 @@ static int dev_pm_opp_add_dynamic(struct device *dev, unsigned long freq, /* Check for existing list for 'dev' */ dev_opp = find_device_opp(dev); if (IS_ERR(dev_opp)) { - /* - * Allocate a new device OPP table. In the infrequent case - * where a new device is needed to be added, we pay this - * penalty. - */ - dev_opp = kzalloc(sizeof(struct device_opp), GFP_KERNEL); + dev_opp = add_device_opp(dev); if (!dev_opp) { mutex_unlock(&dev_opp_list_lock); kfree(new_opp); - dev_warn(dev, - "%s: Unable to create device OPP structure\n", - __func__); return -ENOMEM; } - dev_opp->dev = dev; - srcu_init_notifier_head(&dev_opp->srcu_head); - INIT_LIST_HEAD(&dev_opp->opp_list); - - /* Secure the device list modification */ - list_add_rcu(&dev_opp->node, &dev_opp_list); head = &dev_opp->opp_list; goto list_add; } From 6ce4184d0308888dd6ac2b6ab5f8ec0b2006092e Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 10 Dec 2014 09:45:35 +0530 Subject: [PATCH 5/9] PM / OPP: do error handling at the bottom of dev_pm_opp_add_dynamic() This makes it less error prone and moves common resource deallocation at a single place. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/base/power/opp.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/base/power/opp.c b/drivers/base/power/opp.c index 1150b9d2e012..d24dd614a0bd 100644 --- a/drivers/base/power/opp.c +++ b/drivers/base/power/opp.c @@ -413,6 +413,7 @@ static int dev_pm_opp_add_dynamic(struct device *dev, unsigned long freq, struct device_opp *dev_opp = NULL; struct dev_pm_opp *opp, *new_opp; struct list_head *head; + int ret; /* allocate new OPP node */ new_opp = kzalloc(sizeof(*new_opp), GFP_KERNEL); @@ -435,9 +436,8 @@ static int dev_pm_opp_add_dynamic(struct device *dev, unsigned long freq, if (IS_ERR(dev_opp)) { dev_opp = add_device_opp(dev); if (!dev_opp) { - mutex_unlock(&dev_opp_list_lock); - kfree(new_opp); - return -ENOMEM; + ret = -ENOMEM; + goto free_opp; } head = &dev_opp->opp_list; @@ -458,15 +458,13 @@ static int dev_pm_opp_add_dynamic(struct device *dev, unsigned long freq, /* Duplicate OPPs ? */ if (new_opp->rate == opp->rate) { - int ret = opp->available && new_opp->u_volt == opp->u_volt ? + ret = opp->available && new_opp->u_volt == opp->u_volt ? 0 : -EEXIST; dev_warn(dev, "%s: duplicate OPPs detected. Existing: freq: %lu, volt: %lu, enabled: %d. New: freq: %lu, volt: %lu, enabled: %d\n", __func__, opp->rate, opp->u_volt, opp->available, new_opp->rate, new_opp->u_volt, new_opp->available); - mutex_unlock(&dev_opp_list_lock); - kfree(new_opp); - return ret; + goto free_opp; } list_add: @@ -480,6 +478,11 @@ list_add: */ srcu_notifier_call_chain(&dev_opp->srcu_head, OPP_EVENT_ADD, new_opp); return 0; + +free_opp: + mutex_unlock(&dev_opp_list_lock); + kfree(new_opp); + return ret; } /** From aa4ea34da9e7a2bc0fb98f5add3e4e52872b7d45 Mon Sep 17 00:00:00 2001 From: Ethan Zhao Date: Tue, 9 Dec 2014 10:43:19 +0900 Subject: [PATCH 6/9] intel_pstate: add kernel parameter to force loading To force loading on Oracle Sun X86 servers, provide one kernel command line parameter intel_pstate = force For those who are aware of the risk of no power capping capabily working and try to get better performance with this driver. Signed-off-by: Ethan Zhao Tested-by: Alexey Kodanev Reviewed-by: Linda Knippers Acked-by: Kristen Carlson Accardi Signed-off-by: Rafael J. Wysocki --- Documentation/kernel-parameters.txt | 9 +++++++++ drivers/cpufreq/intel_pstate.c | 6 +++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 5fdf714f40a0..d006bb22f3ca 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1446,6 +1446,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted. disable Do not enable intel_pstate as the default scaling driver for the supported processors + force + Enable intel_pstate on systems that prohibit it by default + in favor of acpi-cpufreq. Forcing the intel_pstate driver + instead of acpi-cpufreq may disable platform features, such + as thermal controls and power capping, that rely on ACPI + P-States information being indicated to OSPM and therefore + should be used with caution. This option does not work with + processors that aren't supported by the intel_pstate driver + or on platforms that use pcc-cpufreq instead of acpi-cpufreq. no_hwp Do not enable hardware P state control (HWP) if available. diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 1405b393c93d..0e841eecb743 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -948,6 +948,7 @@ static struct cpufreq_driver intel_pstate_driver = { static int __initdata no_load; static int __initdata no_hwp; +static unsigned int force_load; static int intel_pstate_msrs_not_valid(void) { @@ -1094,7 +1095,8 @@ static bool intel_pstate_platform_pwr_mgmt_exists(void) case PSS: return intel_pstate_no_acpi_pss(); case PPC: - return intel_pstate_has_acpi_ppc(); + return intel_pstate_has_acpi_ppc() && + (!force_load); } } @@ -1175,6 +1177,8 @@ static int __init intel_pstate_setup(char *str) no_load = 1; if (!strcmp(str, "no_hwp")) no_hwp = 1; + if (!strcmp(str, "force")) + force_load = 1; return 0; } early_param("intel_pstate", intel_pstate_setup); From e0d4c8f80804000eadc106bc5167b96fc6231d98 Mon Sep 17 00:00:00 2001 From: Kristen Carlson Accardi Date: Wed, 10 Dec 2014 12:39:38 -0800 Subject: [PATCH 7/9] intel_pstate: Add a few comments Add a few comments in the code which calculates busyness to clarify parts of the algorithm. Signed-off-by: Kristen Carlson Accardi Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 0e841eecb743..742eefba12c2 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -199,7 +199,14 @@ static signed int pid_calc(struct _pid *pid, int32_t busy) pid->integral += fp_error; - /* limit the integral term */ + /* + * We limit the integral here so that it will never + * get higher than 30. This prevents it from becoming + * too large an input over long periods of time and allows + * it to get factored out sooner. + * + * The value of 30 was chosen through experimentation. + */ integral_limit = int_tofp(30); if (pid->integral > integral_limit) pid->integral = integral_limit; @@ -616,6 +623,11 @@ static void intel_pstate_get_min_max(struct cpudata *cpu, int *min, int *max) if (limits.no_turbo || limits.turbo_disabled) max_perf = cpu->pstate.max_pstate; + /* + * performance can be limited by user through sysfs, by cpufreq + * policy, or by cpu specific default values determined through + * experimentation. + */ max_perf_adj = fp_toint(mul_fp(int_tofp(max_perf), limits.max_perf)); *max = clamp_t(int, max_perf_adj, cpu->pstate.min_pstate, cpu->pstate.turbo_pstate); @@ -717,11 +729,29 @@ static inline int32_t intel_pstate_get_scaled_busy(struct cpudata *cpu) u32 duration_us; u32 sample_time; + /* + * core_busy is the ratio of actual performance to max + * max_pstate is the max non turbo pstate available + * current_pstate was the pstate that was requested during + * the last sample period. + * + * We normalize core_busy, which was our actual percent + * performance to what we requested during the last sample + * period. The result will be a percentage of busy at a + * specified pstate. + */ core_busy = cpu->sample.core_pct_busy; max_pstate = int_tofp(cpu->pstate.max_pstate); current_pstate = int_tofp(cpu->pstate.current_pstate); core_busy = mul_fp(core_busy, div_fp(max_pstate, current_pstate)); + /* + * Since we have a deferred timer, it will not fire unless + * we are in C0. So, determine if the actual elapsed time + * is significantly greater (3x) than our sample interval. If it + * is, then we were idle for a long enough period of time + * to adjust our busyness. + */ sample_time = pid_params.sample_rate_ms * USEC_PER_MSEC; duration_us = (u32) ktime_us_delta(cpu->sample.time, cpu->last_sample_time); From 2a813f1aaaf00a7eb65bef8da2fe9fcec0aabaaa Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Sun, 14 Dec 2014 09:06:37 -0500 Subject: [PATCH 8/9] Revert "tools: cpupower: fix return checks for sysfs_get_idlestate_count()" This reverts commit 16b7c275c055cc36218404b5d147be7f76575087. My previous commit 16b7c275c055 ("tools: cpupower: fix return checks for sysfs_get_idlestate_count()") was not correct. After looking at the changelog for cpupower I noticed that Thomas had changed the return of sysfs_get_idlestate_count() to an unsigned int to simplify the code. The problem is really that both he (in his original change) and I (in my new change) missed the obvious that sysfs_get_idlestate_count() can't return -ENODEV. It should just return 0 for "no c-states". Fixes: 16b7c275c055 (tools: cpupower: fix return checks for ...) Signed-off-by: Prarit Bhargava Signed-off-by: Rafael J. Wysocki --- tools/power/cpupower/utils/cpuidle-info.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/power/cpupower/utils/cpuidle-info.c b/tools/power/cpupower/utils/cpuidle-info.c index 458d69b444ad..75e66de7e7a7 100644 --- a/tools/power/cpupower/utils/cpuidle-info.c +++ b/tools/power/cpupower/utils/cpuidle-info.c @@ -22,13 +22,13 @@ static void cpuidle_cpu_output(unsigned int cpu, int verbose) { - int idlestates, idlestate; + unsigned int idlestates, idlestate; char *tmp; printf(_ ("Analyzing CPU %d:\n"), cpu); idlestates = sysfs_get_idlestate_count(cpu); - if (idlestates < 1) { + if (idlestates == 0) { printf(_("CPU %u: No idle states\n"), cpu); return; } @@ -100,10 +100,10 @@ static void cpuidle_general_output(void) static void proc_cpuidle_cpu_output(unsigned int cpu) { long max_allowed_cstate = 2000000000; - int cstate, cstates; + unsigned int cstate, cstates; cstates = sysfs_get_idlestate_count(cpu); - if (cstates < 1) { + if (cstates == 0) { printf(_("CPU %u: No C-states info\n"), cpu); return; } From 7c1ac18dc02c105a199167ecc495944bd0e14d5a Mon Sep 17 00:00:00 2001 From: Kristen Carlson Accardi Date: Mon, 15 Dec 2014 15:58:24 -0800 Subject: [PATCH 9/9] MAINTAINERS: add entry for intel_pstate Add entry for intel_pstate. Signed-off-by: Kristen Carlson Accardi Signed-off-by: Rafael J. Wysocki --- MAINTAINERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 0ff630de8a6d..888d8bd8b5b7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4837,6 +4837,12 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux.git S: Supported F: drivers/idle/intel_idle.c +INTEL PSTATE DRIVER +M: Kristen Carlson Accardi +L: linux-pm@vger.kernel.org +S: Supported +F: drivers/cpufreq/intel_pstate.c + INTEL FRAMEBUFFER DRIVER (excluding 810 and 815) M: Maik Broemme L: linux-fbdev@vger.kernel.org