From bb6ab52f2befe1fb29ac198f27d8a6aadf510f81 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 31 Mar 2016 17:42:15 +0200 Subject: [PATCH 01/28] intel_pstate: Do not set utilization update hook too early The utilization update hook in the intel_pstate driver is set too early, as it only should be set after the policy has been fully initialized by the core. That may cause intel_pstate_update_util() to use incorrect data and put the CPUs into incorrect P-states as a result. To prevent that from happening, make intel_pstate_set_policy() set the utilization update hook instead of intel_pstate_init_cpu() so intel_pstate_update_util() only runs when all things have been initialized as appropriate. Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 4b644526fd59..81057e48c4de 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1103,7 +1103,6 @@ static int intel_pstate_init_cpu(unsigned int cpunum) intel_pstate_sample(cpu, 0); cpu->update_util.func = intel_pstate_update_util; - cpufreq_set_update_util_data(cpunum, &cpu->update_util); pr_debug("intel_pstate: controlling: cpu %d\n", cpunum); @@ -1122,18 +1121,29 @@ static unsigned int intel_pstate_get(unsigned int cpu_num) return get_avg_frequency(cpu); } +static void intel_pstate_set_update_util_hook(unsigned int cpu) +{ + cpufreq_set_update_util_data(cpu, &all_cpu_data[cpu]->update_util); +} + +static void intel_pstate_clear_update_util_hook(unsigned int cpu) +{ + cpufreq_set_update_util_data(cpu, NULL); + synchronize_sched(); +} + static int intel_pstate_set_policy(struct cpufreq_policy *policy) { if (!policy->cpuinfo.max_freq) return -ENODEV; + intel_pstate_clear_update_util_hook(policy->cpu); + if (policy->policy == CPUFREQ_POLICY_PERFORMANCE && policy->max >= policy->cpuinfo.max_freq) { pr_debug("intel_pstate: set performance\n"); limits = &performance_limits; - if (hwp_active) - intel_pstate_hwp_set(policy->cpus); - return 0; + goto out; } pr_debug("intel_pstate: set powersave\n"); @@ -1163,6 +1173,9 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy) limits->max_perf = div_fp(int_tofp(limits->max_perf_pct), int_tofp(100)); + out: + intel_pstate_set_update_util_hook(policy->cpu); + if (hwp_active) intel_pstate_hwp_set(policy->cpus); @@ -1187,8 +1200,7 @@ static void intel_pstate_stop_cpu(struct cpufreq_policy *policy) pr_debug("intel_pstate: CPU %d exiting\n", cpu_num); - cpufreq_set_update_util_data(cpu_num, NULL); - synchronize_sched(); + intel_pstate_clear_update_util_hook(cpu_num); if (hwp_active) return; @@ -1455,8 +1467,7 @@ out: get_online_cpus(); for_each_online_cpu(cpu) { if (all_cpu_data[cpu]) { - cpufreq_set_update_util_data(cpu, NULL); - synchronize_sched(); + intel_pstate_clear_update_util_hook(cpu); kfree(all_cpu_data[cpu]); } } From febce40febcff3ccdb33f63456ffc4cfc61640c8 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 2 Apr 2016 01:06:21 +0200 Subject: [PATCH 02/28] intel_pstate: Avoid extra invocation of intel_pstate_sample() The initialization of intel_pstate for a given CPU involves populating the fields of its struct cpudata that represent the previous sample, but currently that is done in a problematic way. Namely, intel_pstate_init_cpu() makes an extra call to intel_pstate_sample() so it reads the current register values that will be used to populate the "previous sample" record during the next invocation of intel_pstate_sample(). However, after commit a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) that doesn't work for last_sample_time, because the time value is passed to intel_pstate_sample() as an argument now. Passing 0 to it from intel_pstate_init_cpu() is problematic, because that causes cpu->last_sample_time == 0 to be visible in get_target_pstate_use_performance() (and hence the extra cpu->last_sample_time > 0 check in there) and effectively allows the first invocation of intel_pstate_sample() from intel_pstate_update_util() to happen immediately after the initialization which may lead to a significant "turn on" effect in the governor algorithm. To mitigate that issue, rework the initialization to avoid the extra intel_pstate_sample() call from intel_pstate_init_cpu(). Instead, make intel_pstate_sample() return false if it has been called with cpu->sample.time equal to zero, which will make intel_pstate_update_util() skip the sample in that case, and reset cpu->sample.time from intel_pstate_set_update_util_hook() to make the algorithm start properly every time the hook is set. Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 81057e48c4de..9ae159631f52 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -910,7 +910,14 @@ static inline bool intel_pstate_sample(struct cpudata *cpu, u64 time) cpu->prev_aperf = aperf; cpu->prev_mperf = mperf; cpu->prev_tsc = tsc; - return true; + /* + * First time this function is invoked in a given cycle, all of the + * previous sample data fields are equal to zero or stale and they must + * be populated with meaningful numbers for things to work, so assume + * that sample.time will always be reset before setting the utilization + * update hook and make the caller skip the sample then. + */ + return !!cpu->last_sample_time; } static inline int32_t get_avg_frequency(struct cpudata *cpu) @@ -984,8 +991,7 @@ static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu) * enough period of time to adjust our busyness. */ duration_ns = cpu->sample.time - cpu->last_sample_time; - if ((s64)duration_ns > pid_params.sample_rate_ns * 3 - && cpu->last_sample_time > 0) { + if ((s64)duration_ns > pid_params.sample_rate_ns * 3) { sample_ratio = div_fp(int_tofp(pid_params.sample_rate_ns), int_tofp(duration_ns)); core_busy = mul_fp(core_busy, sample_ratio); @@ -1100,7 +1106,6 @@ static int intel_pstate_init_cpu(unsigned int cpunum) intel_pstate_get_cpu_pstates(cpu); intel_pstate_busy_pid_reset(cpu); - intel_pstate_sample(cpu, 0); cpu->update_util.func = intel_pstate_update_util; @@ -1121,9 +1126,13 @@ static unsigned int intel_pstate_get(unsigned int cpu_num) return get_avg_frequency(cpu); } -static void intel_pstate_set_update_util_hook(unsigned int cpu) +static void intel_pstate_set_update_util_hook(unsigned int cpu_num) { - cpufreq_set_update_util_data(cpu, &all_cpu_data[cpu]->update_util); + struct cpudata *cpu = all_cpu_data[cpu_num]; + + /* Prevent intel_pstate_update_util() from using stale data. */ + cpu->sample.time = 0; + cpufreq_set_update_util_data(cpu_num, &cpu->update_util); } static void intel_pstate_clear_update_util_hook(unsigned int cpu) From 30a3915385f124a6e3c81df4070f531d4f84299b Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Sun, 3 Apr 2016 19:42:11 -0700 Subject: [PATCH 03/28] cpufreq: intel_pstate: fix inconsistency in setting policy limits When user sets performance policy using cpufreq interface, it is possible that because of policy->max limits, the actual performance is still limited. But the current implementation will silently switch the policy to powersave and start using powersave limits. If user modifies any limits using intel_pstate sysfs, this is actually changing powersave limits. The current implementation tracks limits under powersave and performance policy using two different variables. When policy->max is less than policy->cpuinfo.max_freq, only powersave limit variable is used. This fix causes the performance limits variable to be used always when the policy is performance. Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 9ae159631f52..c6c169a1b2fc 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1141,6 +1141,20 @@ static void intel_pstate_clear_update_util_hook(unsigned int cpu) synchronize_sched(); } +static void intel_pstate_set_performance_limits(struct perf_limits *limits) +{ + limits->no_turbo = 0; + limits->turbo_disabled = 0; + limits->max_perf_pct = 100; + limits->max_perf = int_tofp(1); + limits->min_perf_pct = 100; + limits->min_perf = int_tofp(1); + limits->max_policy_pct = 100; + limits->max_sysfs_pct = 100; + limits->min_policy_pct = 0; + limits->min_sysfs_pct = 0; +} + static int intel_pstate_set_policy(struct cpufreq_policy *policy) { if (!policy->cpuinfo.max_freq) @@ -1148,15 +1162,18 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy) intel_pstate_clear_update_util_hook(policy->cpu); - if (policy->policy == CPUFREQ_POLICY_PERFORMANCE && - policy->max >= policy->cpuinfo.max_freq) { - pr_debug("intel_pstate: set performance\n"); + if (policy->policy == CPUFREQ_POLICY_PERFORMANCE) { limits = &performance_limits; - goto out; + if (policy->max >= policy->cpuinfo.max_freq) { + pr_debug("intel_pstate: set performance\n"); + intel_pstate_set_performance_limits(limits); + goto out; + } + } else { + pr_debug("intel_pstate: set powersave\n"); + limits = &powersave_limits; } - pr_debug("intel_pstate: set powersave\n"); - limits = &powersave_limits; limits->min_policy_pct = (policy->min * 100) / policy->cpuinfo.max_freq; limits->min_policy_pct = clamp_t(int, limits->min_policy_pct, 0 , 100); limits->max_policy_pct = DIV_ROUND_UP(policy->max * 100, From 13ad7701f9d0ab7806eb91f1fe1ca43d41b31fa2 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Sun, 3 Apr 2016 13:06:46 -0700 Subject: [PATCH 04/28] cpufreq: intel_pstate: Documenation for structures No code change. Only added kernel doc style comments for structures. Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 135 +++++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index c6c169a1b2fc..8b5a415ee14a 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -64,6 +64,25 @@ static inline int ceiling_fp(int32_t x) return ret; } +/** + * struct sample - Store performance sample + * @core_pct_busy: Ratio of APERF/MPERF in percent, which is actual + * performance during last sample period + * @busy_scaled: Scaled busy value which is used to calculate next + * P state. This can be different than core_pct_busy + * to account for cpu idle period + * @aperf: Difference of actual performance frequency clock count + * read from APERF MSR between last and current sample + * @mperf: Difference of maximum performance frequency clock count + * read from MPERF MSR between last and current sample + * @tsc: Difference of time stamp counter between last and + * current sample + * @freq: Effective frequency calculated from APERF/MPERF + * @time: Current time from scheduler + * + * This structure is used in the cpudata structure to store performance sample + * data for choosing next P State. + */ struct sample { int32_t core_pct_busy; int32_t busy_scaled; @@ -74,6 +93,20 @@ struct sample { u64 time; }; +/** + * struct pstate_data - Store P state data + * @current_pstate: Current requested P state + * @min_pstate: Min P state possible for this platform + * @max_pstate: Max P state possible for this platform + * @max_pstate_physical:This is physical Max P state for a processor + * This can be higher than the max_pstate which can + * be limited by platform thermal design power limits + * @scaling: Scaling factor to convert frequency to cpufreq + * frequency units + * @turbo_pstate: Max Turbo P state possible for this platform + * + * Stores the per cpu model P state limits and current P state. + */ struct pstate_data { int current_pstate; int min_pstate; @@ -83,6 +116,19 @@ struct pstate_data { int turbo_pstate; }; +/** + * struct vid_data - Stores voltage information data + * @min: VID data for this platform corresponding to + * the lowest P state + * @max: VID data corresponding to the highest P State. + * @turbo: VID data for turbo P state + * @ratio: Ratio of (vid max - vid min) / + * (max P state - Min P State) + * + * Stores the voltage data for DVFS (Dynamic Voltage and Frequency Scaling) + * This data is used in Atom platforms, where in addition to target P state, + * the voltage data needs to be specified to select next P State. + */ struct vid_data { int min; int max; @@ -90,6 +136,18 @@ struct vid_data { int32_t ratio; }; +/** + * struct _pid - Stores PID data + * @setpoint: Target set point for busyness or performance + * @integral: Storage for accumulated error values + * @p_gain: PID proportional gain + * @i_gain: PID integral gain + * @d_gain: PID derivative gain + * @deadband: PID deadband + * @last_err: Last error storage for integral part of PID calculation + * + * Stores PID coefficients and last error for PID controller. + */ struct _pid { int setpoint; int32_t integral; @@ -100,6 +158,23 @@ struct _pid { int32_t last_err; }; +/** + * struct cpudata - Per CPU instance data storage + * @cpu: CPU number for this instance data + * @update_util: CPUFreq utility callback information + * @pstate: Stores P state limits for this CPU + * @vid: Stores VID limits for this CPU + * @pid: Stores PID parameters for this CPU + * @last_sample_time: Last Sample time + * @prev_aperf: Last APERF value read from APERF MSR + * @prev_mperf: Last MPERF value read from MPERF MSR + * @prev_tsc: Last timestamp counter (TSC) value + * @prev_cummulative_iowait: IO Wait time difference from last and + * current sample + * @sample: Storage for storing last Sample data + * + * This structure stores per CPU instance data for all CPUs. + */ struct cpudata { int cpu; @@ -118,6 +193,19 @@ struct cpudata { }; static struct cpudata **all_cpu_data; + +/** + * struct pid_adjust_policy - Stores static PID configuration data + * @sample_rate_ms: PID calculation sample rate in ms + * @sample_rate_ns: Sample rate calculation in ns + * @deadband: PID deadband + * @setpoint: PID Setpoint + * @p_gain_pct: PID proportional gain + * @i_gain_pct: PID integral gain + * @d_gain_pct: PID derivative gain + * + * Stores per CPU model static PID configuration data. + */ struct pstate_adjust_policy { int sample_rate_ms; s64 sample_rate_ns; @@ -128,6 +216,20 @@ struct pstate_adjust_policy { int i_gain_pct; }; +/** + * struct pstate_funcs - Per CPU model specific callbacks + * @get_max: Callback to get maximum non turbo effective P state + * @get_max_physical: Callback to get maximum non turbo physical P state + * @get_min: Callback to get minimum P state + * @get_turbo: Callback to get turbo P state + * @get_scaling: Callback to get frequency scaling factor + * @get_val: Callback to convert P state to actual MSR write value + * @get_vid: Callback to get VID data for Atom platforms + * @get_target_pstate: Callback to a function to calculate next P state to use + * + * Core and Atom CPU models have different way to get P State limits. This + * structure is used to store those callbacks. + */ struct pstate_funcs { int (*get_max)(void); int (*get_max_physical)(void); @@ -139,6 +241,11 @@ struct pstate_funcs { int32_t (*get_target_pstate)(struct cpudata *); }; +/** + * struct cpu_defaults- Per CPU model default config data + * @pid_policy: PID config data + * @funcs: Callback function data + */ struct cpu_defaults { struct pstate_adjust_policy pid_policy; struct pstate_funcs funcs; @@ -151,6 +258,34 @@ static struct pstate_adjust_policy pid_params; static struct pstate_funcs pstate_funcs; static int hwp_active; + +/** + * struct perf_limits - Store user and policy limits + * @no_turbo: User requested turbo state from intel_pstate sysfs + * @turbo_disabled: Platform turbo status either from msr + * MSR_IA32_MISC_ENABLE or when maximum available pstate + * matches the maximum turbo pstate + * @max_perf_pct: Effective maximum performance limit in percentage, this + * is minimum of either limits enforced by cpufreq policy + * or limits from user set limits via intel_pstate sysfs + * @min_perf_pct: Effective minimum performance limit in percentage, this + * is maximum of either limits enforced by cpufreq policy + * or limits from user set limits via intel_pstate sysfs + * @max_perf: This is a scaled value between 0 to 255 for max_perf_pct + * This value is used to limit max pstate + * @min_perf: This is a scaled value between 0 to 255 for min_perf_pct + * This value is used to limit min pstate + * @max_policy_pct: The maximum performance in percentage enforced by + * cpufreq setpolicy interface + * @max_sysfs_pct: The maximum performance in percentage enforced by + * intel pstate sysfs interface + * @min_policy_pct: The minimum performance in percentage enforced by + * cpufreq setpolicy interface + * @min_sysfs_pct: The minimum performance in percentage enforced by + * intel pstate sysfs interface + * + * Storage for user and policy defined limits. + */ struct perf_limits { int no_turbo; int turbo_disabled; From b318556479cc923970a79d6c2311138581c0db83 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 24 Mar 2016 09:51:57 +0530 Subject: [PATCH 05/28] cpufreq: dt: Drop stale comment The comment in file header doesn't hold true anymore, drop it. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq-dt.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index f951f911786e..5f8dbe640a20 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -4,9 +4,6 @@ * Copyright (C) 2014 Linaro. * Viresh Kumar * - * The OPP code in function set_target() is reused from - * drivers/cpufreq/omap-cpufreq.c - * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. From 462d8083fb2da951ad6edb9e8000c1f2578d27a9 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Sat, 26 Mar 2016 05:03:38 -0700 Subject: [PATCH 06/28] powercap: intel_rapl: Add missing Haswell model Added missing model 0x46. Tested-and-reported-by: Piotr Maksymiuk Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c index cdfd01f0adb8..8fad0a7044d3 100644 --- a/drivers/powercap/intel_rapl.c +++ b/drivers/powercap/intel_rapl.c @@ -1091,6 +1091,7 @@ static const struct x86_cpu_id rapl_ids[] __initconst = { RAPL_CPU(0x3f, rapl_defaults_hsw_server),/* Haswell servers */ RAPL_CPU(0x4f, rapl_defaults_hsw_server),/* Broadwell servers */ RAPL_CPU(0x45, rapl_defaults_core),/* Haswell ULT */ + RAPL_CPU(0x46, rapl_defaults_core),/* Haswell */ RAPL_CPU(0x47, rapl_defaults_core),/* Broadwell-H */ RAPL_CPU(0x4E, rapl_defaults_core),/* Skylake */ RAPL_CPU(0x4C, rapl_defaults_cht),/* Braswell/Cherryview */ From 8fd2910e31155d8562180cc8580bc27e3be4f4bf Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sun, 3 Apr 2016 12:23:07 +0900 Subject: [PATCH 07/28] PM / runtime: Document steps for device removal Put a reminder that during device removal drivers should revert all PM runtime changes from the probe. Signed-off-by: Krzysztof Kozlowski Acked-by: Alan Stern Signed-off-by: Rafael J. Wysocki --- Documentation/power/runtime_pm.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/power/runtime_pm.txt b/Documentation/power/runtime_pm.txt index 7328cf85236c..1fd1fbe9ce95 100644 --- a/Documentation/power/runtime_pm.txt +++ b/Documentation/power/runtime_pm.txt @@ -586,6 +586,10 @@ drivers to make their ->remove() callbacks avoid races with runtime PM directly, but also it allows of more flexibility in the handling of devices during the removal of their drivers. +Drivers in ->remove() callback should undo the runtime PM changes done +in ->probe(). Usually this means calling pm_runtime_disable(), +pm_runtime_dont_use_autosuspend() etc. + The user space can effectively disallow the driver of the device to power manage it at run time by changing the value of its /sys/devices/.../power/control attribute to "on", which causes pm_runtime_forbid() to be called. In principle, From 169b38373fd4a7d47cac0e5fdeeae90b8c076f32 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Wed, 6 Apr 2016 12:49:24 -0500 Subject: [PATCH 08/28] mailbox: pcc: Don't access an unmapped memory address space The acpi_pcc_probe() may end up accessing memory outside of the PCCT table space causing the kernel panic(). Increment the pcct_entry pointer after parsing 'HW-reduced Communications Subspace' to fix the problem. This change also enables the parsing of subtable at index 0. Signed-off-by: Shanker Donthineni Acked-by: Ashwin Chaugule Signed-off-by: Rafael J. Wysocki --- drivers/mailbox/pcc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mailbox/pcc.c b/drivers/mailbox/pcc.c index ac11a9b395d3..9d3955e6ab44 100644 --- a/drivers/mailbox/pcc.c +++ b/drivers/mailbox/pcc.c @@ -367,8 +367,6 @@ static int __init acpi_pcc_probe(void) struct acpi_generic_address *db_reg; struct acpi_pcct_hw_reduced *pcct_ss; pcc_mbox_channels[i].con_priv = pcct_entry; - pcct_entry = (struct acpi_subtable_header *) - ((unsigned long) pcct_entry + pcct_entry->length); /* If doorbell is in system memory cache the virt address */ pcct_ss = (struct acpi_pcct_hw_reduced *)pcct_entry; @@ -376,6 +374,8 @@ static int __init acpi_pcc_probe(void) if (db_reg->space_id == ACPI_ADR_SPACE_SYSTEM_MEMORY) pcc_doorbell_vaddr[i] = acpi_os_ioremap(db_reg->address, db_reg->bit_width/8); + pcct_entry = (struct acpi_subtable_header *) + ((unsigned long) pcct_entry + pcct_entry->length); } pcc_mbox_ctrl.num_chans = count; From f70415496d5ddf06fe7e0a22250d60bab2b2d7cc Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Wed, 6 Apr 2016 17:00:48 -0400 Subject: [PATCH 09/28] intel_idle: remove useless return from void function. Signed-off-by: Richard Cochran Signed-off-by: Len Brown Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index ba947df5a8c7..9d5ed3264135 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1260,8 +1260,6 @@ static void __exit intel_idle_exit(void) __unregister_cpu_notifier(&cpu_hotplug_notifier); cpu_notifier_register_done(); - - return; } module_init(intel_idle_init); From 5469c827d20ab013f43d4f5f94e101d0cf7afd2c Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Wed, 6 Apr 2016 17:00:49 -0400 Subject: [PATCH 10/28] intel_idle: Fix a helper function's return value. The function, intel_idle_cpuidle_driver_init, delivers no error codes at all. This patch changes the function to return 'void' instead of returning zero. Signed-off-by: Richard Cochran Signed-off-by: Len Brown Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 9d5ed3264135..dfa90558babf 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1111,7 +1111,7 @@ static void intel_idle_state_table_update(void) * intel_idle_cpuidle_driver_init() * allocate, initialize cpuidle_states */ -static int __init intel_idle_cpuidle_driver_init(void) +static void __init intel_idle_cpuidle_driver_init(void) { int cstate; struct cpuidle_driver *drv = &intel_idle_driver; @@ -1173,8 +1173,6 @@ static int __init intel_idle_cpuidle_driver_init(void) if (icpu->disable_promotion_to_c1e) /* each-cpu is redundant */ on_each_cpu(c1e_promotion_disable, NULL, 1); - - return 0; } From 4a3dfb3fc0fb0fc9acd36c94b7145f9c9dd4d93a Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Wed, 6 Apr 2016 17:00:50 -0400 Subject: [PATCH 11/28] intel_idle: Remove redundant initialization calls. The function, intel_idle_cpuidle_driver_init, makes calls on each CPU to auto_demotion_disable() and c1e_promotion_disable(). These calls are redundant, as intel_idle_cpu_init() does the same calls just a bit later on. They are also premature, as the driver registration may yet fail. This patch removes the redundant code. Signed-off-by: Richard Cochran Signed-off-by: Len Brown Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index dfa90558babf..cb85c4c42ff9 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1163,16 +1163,10 @@ static void __init intel_idle_cpuidle_driver_init(void) drv->state_count += 1; } - if (icpu->auto_demotion_disable_flags) - on_each_cpu(auto_demotion_disable, NULL, 1); - if (icpu->byt_auto_demotion_disable_flag) { wrmsrl(MSR_CC6_DEMOTION_POLICY_CONFIG, 0); wrmsrl(MSR_MC6_DEMOTION_POLICY_CONFIG, 0); } - - if (icpu->disable_promotion_to_c1e) /* each-cpu is redundant */ - on_each_cpu(c1e_promotion_disable, NULL, 1); } From 51319918bcc31f901646fc66348d41cf74ee0566 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Wed, 6 Apr 2016 17:00:51 -0400 Subject: [PATCH 12/28] intel_idle: Fix deallocation order on the driver exit path. In the module_exit() method, this driver first frees its per-CPU pointer, then unregisters a callback making use of the pointer. Furthermore, the function, intel_idle_cpuidle_devices_uninit, is racy against CPU hot plugging as it calls for_each_online_cpu(). This patch corrects the issues by unregistering first on the exit path while holding the hot plug lock. Signed-off-by: Richard Cochran Signed-off-by: Len Brown Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index cb85c4c42ff9..4a1de3dcbf85 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1242,16 +1242,16 @@ static int __init intel_idle_init(void) static void __exit intel_idle_exit(void) { - intel_idle_cpuidle_devices_uninit(); - cpuidle_unregister_driver(&intel_idle_driver); - cpu_notifier_register_begin(); if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE) on_each_cpu(__setup_broadcast_timer, (void *)false, 1); __unregister_cpu_notifier(&cpu_hotplug_notifier); + intel_idle_cpuidle_devices_uninit(); cpu_notifier_register_done(); + + cpuidle_unregister_driver(&intel_idle_driver); } module_init(intel_idle_init); From e9df69ccd1322e87eee10f28036fad9e6c71f8dd Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Wed, 6 Apr 2016 17:00:52 -0400 Subject: [PATCH 13/28] intel_idle: Fix dangling registration on error path. In the module_init() method, if the per-CPU allocation fails, then the active cpuidle registration is not cleaned up. This patch fixes the issue by attempting the allocation before registration, and then cleaning it up again on registration failure. Signed-off-by: Richard Cochran Signed-off-by: Len Brown Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 4a1de3dcbf85..5dd741ff4de3 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1210,19 +1210,20 @@ static int __init intel_idle_init(void) if (retval) return retval; + intel_idle_cpuidle_devices = alloc_percpu(struct cpuidle_device); + if (intel_idle_cpuidle_devices == NULL) + return -ENOMEM; + intel_idle_cpuidle_driver_init(); retval = cpuidle_register_driver(&intel_idle_driver); if (retval) { struct cpuidle_driver *drv = cpuidle_get_driver(); printk(KERN_DEBUG PREFIX "intel_idle yielding to %s", drv ? drv->name : "none"); + free_percpu(intel_idle_cpuidle_devices); return retval; } - intel_idle_cpuidle_devices = alloc_percpu(struct cpuidle_device); - if (intel_idle_cpuidle_devices == NULL) - return -ENOMEM; - cpu_notifier_register_begin(); for_each_online_cpu(i) { From ca42489d9ee3262482717c83428e087322fdc39c Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Wed, 6 Apr 2016 17:00:53 -0400 Subject: [PATCH 14/28] intel_idle: Avoid a double free of the per-CPU data. The helper function, intel_idle_cpuidle_devices_uninit, frees the globally allocated per-CPU data. However, this function is invoked from the hot plug notifier callback at a time when freeing that data is not safe. If the call to cpuidle_register_driver() should fail (say, due to lack of memory), then the driver will free its per-CPU region. On the *next* CPU_ONLINE event, the driver will happily use the region again and even free it again if the failure repeats. This patch fixes the issue by moving the call to free_percpu() outside of the helper function at the two call sites that actually need to free the per-CPU data. Signed-off-by: Richard Cochran Signed-off-by: Len Brown Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 5dd741ff4de3..0b568728867e 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1002,7 +1002,7 @@ static int __init intel_idle_probe(void) /* * intel_idle_cpuidle_devices_uninit() - * unregister, free cpuidle_devices + * Unregisters the cpuidle devices. */ static void intel_idle_cpuidle_devices_uninit(void) { @@ -1013,9 +1013,6 @@ static void intel_idle_cpuidle_devices_uninit(void) dev = per_cpu_ptr(intel_idle_cpuidle_devices, i); cpuidle_unregister_device(dev); } - - free_percpu(intel_idle_cpuidle_devices); - return; } /* @@ -1231,6 +1228,7 @@ static int __init intel_idle_init(void) if (retval) { cpu_notifier_register_done(); cpuidle_unregister_driver(&intel_idle_driver); + free_percpu(intel_idle_cpuidle_devices); return retval; } } @@ -1253,6 +1251,7 @@ static void __exit intel_idle_exit(void) cpu_notifier_register_done(); cpuidle_unregister_driver(&intel_idle_driver); + free_percpu(intel_idle_cpuidle_devices); } module_init(intel_idle_init); From 2259a819a8d37e472f08c88bc0dd22194754adb4 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Wed, 6 Apr 2016 17:00:54 -0400 Subject: [PATCH 15/28] intel_idle: Setup the timer broadcast only on successful driver load. This driver sets the broadcast tick quite early on during probe and does not clean up again in cast of failure. This patch moves the setup call after the registration, placing the on_each_cpu() calls within the global CPU lock region. Signed-off-by: Richard Cochran Signed-off-by: Len Brown Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 0b568728867e..ab34cd8b933c 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -987,16 +987,9 @@ static int __init intel_idle_probe(void) icpu = (const struct idle_cpu *)id->driver_data; cpuidle_state_table = icpu->state_table; - if (boot_cpu_has(X86_FEATURE_ARAT)) /* Always Reliable APIC Timer */ - lapic_timer_reliable_states = LAPIC_TIMER_ALWAYS_RELIABLE; - else - on_each_cpu(__setup_broadcast_timer, (void *)true, 1); - pr_debug(PREFIX "v" INTEL_IDLE_VERSION " model 0x%X\n", boot_cpu_data.x86_model); - pr_debug(PREFIX "lapic_timer_reliable_states 0x%x\n", - lapic_timer_reliable_states); return 0; } @@ -1234,8 +1227,16 @@ static int __init intel_idle_init(void) } __register_cpu_notifier(&cpu_hotplug_notifier); + if (boot_cpu_has(X86_FEATURE_ARAT)) /* Always Reliable APIC Timer */ + lapic_timer_reliable_states = LAPIC_TIMER_ALWAYS_RELIABLE; + else + on_each_cpu(__setup_broadcast_timer, (void *)true, 1); + cpu_notifier_register_done(); + pr_debug(PREFIX "lapic_timer_reliable_states 0x%x\n", + lapic_timer_reliable_states); + return 0; } From b69ef2c099c3e5f11bd5c33a9530d6522f72c9aa Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Wed, 6 Apr 2016 17:00:55 -0400 Subject: [PATCH 16/28] intel_idle: Don't overreact to a cpuidle registration failure. The helper function, intel_idle_cpu_init, registers one new device with the cpuidle layer. If the registration should fail, that function immediately calls intel_idle_cpuidle_devices_uninit() to unregister every last CPU's device. However, it makes no sense to do so, when called from the hot plug notifier callback. This patch moves the call to intel_idle_cpuidle_devices_uninit() outside of the helper function to the one call site that actually needs to perform the de-registrations. Signed-off-by: Richard Cochran Signed-off-by: Len Brown Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index ab34cd8b933c..4418cfa2a4fc 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1175,7 +1175,6 @@ static int intel_idle_cpu_init(int cpu) if (cpuidle_register_device(dev)) { pr_debug(PREFIX "cpuidle_register_device %d failed!\n", cpu); - intel_idle_cpuidle_devices_uninit(); return -EIO; } @@ -1219,6 +1218,7 @@ static int __init intel_idle_init(void) for_each_online_cpu(i) { retval = intel_idle_cpu_init(i); if (retval) { + intel_idle_cpuidle_devices_uninit(); cpu_notifier_register_done(); cpuidle_unregister_driver(&intel_idle_driver); free_percpu(intel_idle_cpuidle_devices); From 08820546e4c30c84d0a1f1a49df055e1719c07ea Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Wed, 6 Apr 2016 17:00:56 -0400 Subject: [PATCH 17/28] intel_idle: Propagate hot plug errors. If a cpuidle registration error occurs during the hot plug notifier callback, we should really inform the hot plug machinery instead of just ignoring the error. This patch changes the callback to properly return on error. Signed-off-by: Richard Cochran Signed-off-by: Len Brown Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 4418cfa2a4fc..8420ba10a434 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -818,8 +818,11 @@ static int cpu_hotplug_notify(struct notifier_block *n, * driver in this case */ dev = per_cpu_ptr(intel_idle_cpuidle_devices, hotcpu); - if (!dev->registered) - intel_idle_cpu_init(hotcpu); + if (dev->registered) + break; + + if (intel_idle_cpu_init(hotcpu)) + return NOTIFY_BAD; break; } From 3e66a9ab53641a0f7a440e56f7b35bf5d77494b3 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Wed, 6 Apr 2016 17:00:57 -0400 Subject: [PATCH 18/28] intel_idle: Clean up all registered devices on exit. This driver registers cpuidle devices when a CPU comes online, but it leaves the registrations in place when a CPU goes offline. The module exit code only unregisters the currently online CPUs, leaving the devices for offline CPUs dangling. This patch changes the driver to clean up all registrations on exit, even those from CPUs that are offline. Signed-off-by: Richard Cochran Signed-off-by: Len Brown Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 8420ba10a434..862346bd053a 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1245,12 +1245,19 @@ static int __init intel_idle_init(void) static void __exit intel_idle_exit(void) { + struct cpuidle_device *dev; + int i; + cpu_notifier_register_begin(); if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE) on_each_cpu(__setup_broadcast_timer, (void *)false, 1); __unregister_cpu_notifier(&cpu_hotplug_notifier); - intel_idle_cpuidle_devices_uninit(); + + for_each_possible_cpu(i) { + dev = per_cpu_ptr(intel_idle_cpuidle_devices, i); + cpuidle_unregister_device(dev); + } cpu_notifier_register_done(); From f9e71657c2c0a8f1c50884ab45794be2854e158e Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 6 Apr 2016 17:00:58 -0400 Subject: [PATCH 19/28] intel_idle: Add SKX support SKX is similar to BDX Signed-off-by: Len Brown Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 862346bd053a..4677c3da03a0 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -660,6 +660,35 @@ static struct cpuidle_state skl_cstates[] = { .enter = NULL } }; +static struct cpuidle_state skx_cstates[] = { + { + .name = "C1-SKX", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 2, + .target_residency = 2, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C1E-SKX", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01), + .exit_latency = 10, + .target_residency = 20, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C6-SKX", + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 133, + .target_residency = 600, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .enter = NULL } +}; + static struct cpuidle_state atom_cstates[] = { { .name = "C1E-ATM", @@ -907,6 +936,10 @@ static const struct idle_cpu idle_cpu_skl = { .disable_promotion_to_c1e = true, }; +static const struct idle_cpu idle_cpu_skx = { + .state_table = skx_cstates, + .disable_promotion_to_c1e = true, +}; static const struct idle_cpu idle_cpu_avn = { .state_table = avn_cstates, @@ -948,6 +981,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { ICPU(0x56, idle_cpu_bdw), ICPU(0x4e, idle_cpu_skl), ICPU(0x5e, idle_cpu_skl), + ICPU(0x55, idle_cpu_skx), ICPU(0x57, idle_cpu_knl), {} }; From 3ce093d4de753d6c92cc09366e29d0618a62f542 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 6 Apr 2016 17:00:59 -0400 Subject: [PATCH 20/28] intel_idle: Add KBL support KBL is similar to SKL Signed-off-by: Len Brown Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 4677c3da03a0..c6935de425fa 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -981,6 +981,8 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { ICPU(0x56, idle_cpu_bdw), ICPU(0x4e, idle_cpu_skl), ICPU(0x5e, idle_cpu_skl), + ICPU(0x8e, idle_cpu_skl), + ICPU(0x9e, idle_cpu_skl), ICPU(0x55, idle_cpu_skx), ICPU(0x57, idle_cpu_knl), {} From 8ae7225591fd15aac89769cbebb3b5ecc8b12fe5 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 6 Apr 2016 17:15:54 -0400 Subject: [PATCH 21/28] tools/power turbostat: SGX state should print only if --debug The CPUID.SGX bit was printed, even if --debug was used Signed-off-by: Len Brown Signed-off-by: Rafael J. Wysocki --- tools/power/x86/turbostat/turbostat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index ee1551b6fa01..e2440df6029e 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -3187,7 +3187,7 @@ void process_cpuid() if (debug) decode_misc_enable_msr(); - if (max_level >= 0x7) { + if (max_level >= 0x7 && debug) { int has_sgx; ecx = 0; From 5a63426e2a18775ed05b20e3bc90c68bacb1f68a Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 6 Apr 2016 17:15:55 -0400 Subject: [PATCH 22/28] tools/power turbostat: print IRTL MSRs Some processors use the Interrupt Response Time Limit (IRTL) MSR value to describe the maximum IRQ response time latency for deep package C-states. (Though others have the register, but do not use it) Lets print it out to give insight into the cases where it is used. IRTL begain in SNB, with PC3/PC6/PC7, and HSW added PC8/PC9/PC10. Signed-off-by: Len Brown Signed-off-by: Rafael J. Wysocki --- arch/x86/include/asm/msr-index.h | 8 ++++ tools/power/x86/turbostat/turbostat.c | 59 +++++++++++++++++++++++++-- 2 files changed, 64 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 3fec311f7b9a..7b27e3241e5a 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -162,6 +162,14 @@ #define MSR_PKG_C9_RESIDENCY 0x00000631 #define MSR_PKG_C10_RESIDENCY 0x00000632 +/* Interrupt Response Limit */ +#define MSR_PKGC3_IRTL 0x0000060a +#define MSR_PKGC6_IRTL 0x0000060b +#define MSR_PKGC7_IRTL 0x0000060c +#define MSR_PKGC8_IRTL 0x00000633 +#define MSR_PKGC9_IRTL 0x00000634 +#define MSR_PKGC10_IRTL 0x00000635 + /* Run Time Average Power Limiting (RAPL) Interface */ #define MSR_RAPL_POWER_UNIT 0x00000606 diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index e2440df6029e..2f60fdbf9213 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -66,6 +66,8 @@ unsigned int do_slm_cstates; unsigned int use_c1_residency_msr; unsigned int has_aperf; unsigned int has_epb; +unsigned int do_irtl_snb; +unsigned int do_irtl_hsw; unsigned int units = 1000000; /* MHz etc */ unsigned int genuine_intel; unsigned int has_invariant_tsc; @@ -1579,6 +1581,47 @@ dump_config_tdp(void) fprintf(outf, " lock=%d", (unsigned int)(msr >> 31) & 1); fprintf(outf, ")\n"); } + +unsigned int irtl_time_units[] = {1, 32, 1024, 32768, 1048576, 33554432, 0, 0 }; + +void print_irtl(void) +{ + unsigned long long msr; + + get_msr(base_cpu, MSR_PKGC3_IRTL, &msr); + fprintf(outf, "cpu%d: MSR_PKGC3_IRTL: 0x%08llx (", base_cpu, msr); + fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", + (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + + get_msr(base_cpu, MSR_PKGC6_IRTL, &msr); + fprintf(outf, "cpu%d: MSR_PKGC6_IRTL: 0x%08llx (", base_cpu, msr); + fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", + (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + + get_msr(base_cpu, MSR_PKGC7_IRTL, &msr); + fprintf(outf, "cpu%d: MSR_PKGC7_IRTL: 0x%08llx (", base_cpu, msr); + fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", + (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + + if (!do_irtl_hsw) + return; + + get_msr(base_cpu, MSR_PKGC8_IRTL, &msr); + fprintf(outf, "cpu%d: MSR_PKGC8_IRTL: 0x%08llx (", base_cpu, msr); + fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", + (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + + get_msr(base_cpu, MSR_PKGC9_IRTL, &msr); + fprintf(outf, "cpu%d: MSR_PKGC9_IRTL: 0x%08llx (", base_cpu, msr); + fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", + (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + + get_msr(base_cpu, MSR_PKGC10_IRTL, &msr); + fprintf(outf, "cpu%d: MSR_PKGC10_IRTL: 0x%08llx (", base_cpu, msr); + fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", + (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + +} void free_fd_percpu(void) { int i; @@ -2879,9 +2922,14 @@ int has_snb_msrs(unsigned int family, unsigned int model) /* * HSW adds support for additional MSRs: * - * MSR_PKG_C8_RESIDENCY 0x00000630 - * MSR_PKG_C9_RESIDENCY 0x00000631 - * MSR_PKG_C10_RESIDENCY 0x00000632 + * MSR_PKG_C8_RESIDENCY 0x00000630 + * MSR_PKG_C9_RESIDENCY 0x00000631 + * MSR_PKG_C10_RESIDENCY 0x00000632 + * + * MSR_PKGC8_IRTL 0x00000633 + * MSR_PKGC9_IRTL 0x00000634 + * MSR_PKGC10_IRTL 0x00000635 + * */ int has_hsw_msrs(unsigned int family, unsigned int model) { @@ -3254,11 +3302,13 @@ void process_cpuid() do_nhm_platform_info = do_nhm_cstates = do_smi = probe_nhm_msrs(family, model); do_snb_cstates = has_snb_msrs(family, model); + do_irtl_snb = has_snb_msrs(family, model); do_pc2 = do_snb_cstates && (pkg_cstate_limit >= PCL__2); do_pc3 = (pkg_cstate_limit >= PCL__3); do_pc6 = (pkg_cstate_limit >= PCL__6); do_pc7 = do_snb_cstates && (pkg_cstate_limit >= PCL__7); do_c8_c9_c10 = has_hsw_msrs(family, model); + do_irtl_hsw = has_hsw_msrs(family, model); do_skl_residency = has_skl_msrs(family, model); do_slm_cstates = is_slm(family, model); do_knl_cstates = is_knl(family, model); @@ -3564,6 +3614,9 @@ void turbostat_init() if (debug) for_all_cpus(print_thermal, ODD_COUNTERS); + + if (debug && do_irtl_snb) + print_irtl(); } int fork_it(char **argv) From e4085d543e256aff6606ba99ed257f7c06685f3b Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 6 Apr 2016 17:15:56 -0400 Subject: [PATCH 23/28] tools/power turbostat: initial BXT support Broxton has a lot in common with SKL Signed-off-by: Len Brown Signed-off-by: Rafael J. Wysocki --- tools/power/x86/turbostat/turbostat.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 2f60fdbf9213..96d8eafdd380 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1298,6 +1298,7 @@ int hsw_pkg_cstate_limits[16] = {PCL__0, PCL__2, PCL__3, PCL__6, PCL__7, PCL_7S, int slv_pkg_cstate_limits[16] = {PCL__0, PCL__1, PCLRSV, PCLRSV, PCL__4, PCLRSV, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV}; int amt_pkg_cstate_limits[16] = {PCL__0, PCL__1, PCL__2, PCLRSV, PCLRSV, PCLRSV, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV}; int phi_pkg_cstate_limits[16] = {PCL__0, PCL__2, PCL_6N, PCL_6R, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV}; +int bxt_pkg_cstate_limits[16] = {PCL__0, PCL__2, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV}; static void @@ -2199,6 +2200,9 @@ int probe_nhm_msrs(unsigned int family, unsigned int model) case 0x57: /* PHI */ pkg_cstate_limits = phi_pkg_cstate_limits; break; + case 0x5C: /* BXT */ + pkg_cstate_limits = bxt_pkg_cstate_limits; + break; default: return 0; } @@ -2628,6 +2632,9 @@ void rapl_probe(unsigned int family, unsigned int model) case 0x47: /* BDW */ do_rapl = RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_GFX | RAPL_PKG_POWER_INFO; break; + case 0x5C: /* BXT */ + do_rapl = RAPL_PKG | RAPL_PKG_POWER_INFO; + break; case 0x4E: /* SKL */ case 0x5E: /* SKL */ do_rapl = RAPL_PKG | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | RAPL_PKG_POWER_INFO; @@ -2914,6 +2921,7 @@ int has_snb_msrs(unsigned int family, unsigned int model) case 0x56: /* BDX-DE */ case 0x4E: /* SKL */ case 0x5E: /* SKL */ + case 0x5C: /* BXT */ return 1; } return 0; @@ -2941,6 +2949,7 @@ int has_hsw_msrs(unsigned int family, unsigned int model) case 0x3D: /* BDW */ case 0x4E: /* SKL */ case 0x5E: /* SKL */ + case 0x5C: /* BXT */ return 1; } return 0; From e8efbc80db5e824ce2382d5e65429b6b493e71e2 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 6 Apr 2016 17:15:57 -0400 Subject: [PATCH 24/28] tools/power turbostat: decode BXT TSC frequency via CPUID Hard-code BXT ART to 19200MHz, so turbostat --debug can fully enumerate TSC: CPUID(0x15): eax_crystal: 3 ebx_tsc: 186 ecx_crystal_hz: 0 TSC: 1190 MHz (19200000 Hz * 186 / 3 / 1000000) Signed-off-by: Len Brown Signed-off-by: Rafael J. Wysocki --- tools/power/x86/turbostat/turbostat.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 96d8eafdd380..b15c84323b73 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -3278,7 +3278,10 @@ void process_cpuid() switch(model) { case 0x4E: /* SKL */ case 0x5E: /* SKL */ - crystal_hz = 24000000; /* 24 MHz */ + crystal_hz = 24000000; /* 24.0 MHz */ + break; + case 0x5C: /* BXT */ + crystal_hz = 19200000; /* 19.2 MHz */ break; default: crystal_hz = 0; From ec53e594c65ab099ca784d62b6f4c191e3a4d7cc Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 6 Apr 2016 17:15:58 -0400 Subject: [PATCH 25/28] tools/power turbostat: initial SKX support SKX has a lot in common with HSX Signed-off-by: Len Brown Signed-off-by: Rafael J. Wysocki --- tools/power/x86/turbostat/turbostat.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index b15c84323b73..3b64b6bdaf90 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2188,6 +2188,7 @@ int probe_nhm_msrs(unsigned int family, unsigned int model) case 0x56: /* BDX-DE */ case 0x4E: /* SKL */ case 0x5E: /* SKL */ + case 0x55: /* SKX */ pkg_cstate_limits = hsw_pkg_cstate_limits; break; case 0x37: /* BYT */ @@ -2295,6 +2296,7 @@ int has_config_tdp(unsigned int family, unsigned int model) case 0x56: /* BDX-DE */ case 0x4E: /* SKL */ case 0x5E: /* SKL */ + case 0x55: /* SKX */ case 0x57: /* Knights Landing */ return 1; @@ -2642,6 +2644,7 @@ void rapl_probe(unsigned int family, unsigned int model) case 0x3F: /* HSX */ case 0x4F: /* BDX */ case 0x56: /* BDX-DE */ + case 0x55: /* SKX */ case 0x57: /* KNL */ do_rapl = RAPL_PKG | RAPL_DRAM | RAPL_DRAM_POWER_INFO | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | RAPL_PKG_POWER_INFO; break; @@ -2921,6 +2924,7 @@ int has_snb_msrs(unsigned int family, unsigned int model) case 0x56: /* BDX-DE */ case 0x4E: /* SKL */ case 0x5E: /* SKL */ + case 0x55: /* SKX */ case 0x5C: /* BXT */ return 1; } @@ -3280,6 +3284,9 @@ void process_cpuid() case 0x5E: /* SKL */ crystal_hz = 24000000; /* 24.0 MHz */ break; + case 0x55: /* SKX */ + crystal_hz = 25000000; /* 25.0 MHz */ + break; case 0x5C: /* BXT */ crystal_hz = 19200000; /* 19.2 MHz */ break; @@ -3694,7 +3701,7 @@ int get_and_dump_counters(void) } void print_version() { - fprintf(outf, "turbostat version 4.11 27 Feb 2016" + fprintf(outf, "turbostat version 4.12 5 Apr 2016" " - Len Brown \n"); } From cdc57272ea0a0e952c4609b56e157e4d0ec8e956 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 6 Apr 2016 17:15:59 -0400 Subject: [PATCH 26/28] tools/power turbostat: initial KBL support KBL is similar to SKL Signed-off-by: Len Brown Signed-off-by: Rafael J. Wysocki --- tools/power/x86/turbostat/turbostat.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 3b64b6bdaf90..277e0321e590 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2188,6 +2188,8 @@ int probe_nhm_msrs(unsigned int family, unsigned int model) case 0x56: /* BDX-DE */ case 0x4E: /* SKL */ case 0x5E: /* SKL */ + case 0x8E: /* KBL */ + case 0x9E: /* KBL */ case 0x55: /* SKX */ pkg_cstate_limits = hsw_pkg_cstate_limits; break; @@ -2296,6 +2298,8 @@ int has_config_tdp(unsigned int family, unsigned int model) case 0x56: /* BDX-DE */ case 0x4E: /* SKL */ case 0x5E: /* SKL */ + case 0x8E: /* KBL */ + case 0x9E: /* KBL */ case 0x55: /* SKX */ case 0x57: /* Knights Landing */ @@ -2639,6 +2643,8 @@ void rapl_probe(unsigned int family, unsigned int model) break; case 0x4E: /* SKL */ case 0x5E: /* SKL */ + case 0x8E: /* KBL */ + case 0x9E: /* KBL */ do_rapl = RAPL_PKG | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | RAPL_PKG_POWER_INFO; break; case 0x3F: /* HSX */ @@ -2924,6 +2930,8 @@ int has_snb_msrs(unsigned int family, unsigned int model) case 0x56: /* BDX-DE */ case 0x4E: /* SKL */ case 0x5E: /* SKL */ + case 0x8E: /* KBL */ + case 0x9E: /* KBL */ case 0x55: /* SKX */ case 0x5C: /* BXT */ return 1; @@ -2953,6 +2961,8 @@ int has_hsw_msrs(unsigned int family, unsigned int model) case 0x3D: /* BDW */ case 0x4E: /* SKL */ case 0x5E: /* SKL */ + case 0x8E: /* KBL */ + case 0x9E: /* KBL */ case 0x5C: /* BXT */ return 1; } @@ -2975,6 +2985,8 @@ int has_skl_msrs(unsigned int family, unsigned int model) switch (model) { case 0x4E: /* SKL */ case 0x5E: /* SKL */ + case 0x8E: /* KBL */ + case 0x9E: /* KBL */ return 1; } return 0; @@ -3282,6 +3294,8 @@ void process_cpuid() switch(model) { case 0x4E: /* SKL */ case 0x5E: /* SKL */ + case 0x8E: /* KBL */ + case 0x9E: /* KBL */ crystal_hz = 24000000; /* 24.0 MHz */ break; case 0x55: /* SKX */ From 9185e988e9d5bb70b690362e84bb2e4a9d71f2c5 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 6 Apr 2016 17:16:00 -0400 Subject: [PATCH 27/28] tools/power turbostat: work around RC6 counter wrap Sometimes the rc6 sysfs counter spontaneously resets, causing turbostat prints a very large number as it tries to calcuate % = 100 * (old - new) / interval When we see (old > new), print ***.**% instead of a bogus huge number. Note that this detection is not fool-proof, as the counter could reset several times and still result in new > old. Signed-off-by: Len Brown Signed-off-by: Rafael J. Wysocki --- tools/power/x86/turbostat/turbostat.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 277e0321e590..d2ae87d148ed 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -189,7 +189,7 @@ struct pkg_data { unsigned long long pkg_any_core_c0; unsigned long long pkg_any_gfxe_c0; unsigned long long pkg_both_core_gfxe_c0; - unsigned long long gfx_rc6_ms; + long long gfx_rc6_ms; unsigned int gfx_mhz; unsigned int package_id; unsigned int energy_pkg; /* MSR_PKG_ENERGY_STATUS */ @@ -623,8 +623,14 @@ int format_counters(struct thread_data *t, struct core_data *c, outp += sprintf(outp, "%8d", p->pkg_temp_c); /* GFXrc6 */ - if (do_gfx_rc6_ms) - outp += sprintf(outp, "%8.2f", 100.0 * p->gfx_rc6_ms / 1000.0 / interval_float); + if (do_gfx_rc6_ms) { + if (p->gfx_rc6_ms == -1) { /* detect counter reset */ + outp += sprintf(outp, " ***.**"); + } else { + outp += sprintf(outp, "%8.2f", + p->gfx_rc6_ms / 10.0 / interval_float); + } + } /* GFXMHz */ if (do_gfx_mhz) @@ -768,7 +774,12 @@ delta_package(struct pkg_data *new, struct pkg_data *old) old->pc10 = new->pc10 - old->pc10; old->pkg_temp_c = new->pkg_temp_c; - old->gfx_rc6_ms = new->gfx_rc6_ms - old->gfx_rc6_ms; + /* flag an error when rc6 counter resets/wraps */ + if (old->gfx_rc6_ms > new->gfx_rc6_ms) + old->gfx_rc6_ms = -1; + else + old->gfx_rc6_ms = new->gfx_rc6_ms - old->gfx_rc6_ms; + old->gfx_mhz = new->gfx_mhz; DELTA_WRAP32(new->energy_pkg, old->energy_pkg); From 16669befb0772eac6a76057b7010ba5b419c0e78 Mon Sep 17 00:00:00 2001 From: "Strashko, Grygorii" Date: Wed, 6 Apr 2016 14:45:53 +0300 Subject: [PATCH 28/28] PM / wakeirq: fix wakeirq setting after wakup re-configuration from sysfs Now wakeirq stops working for device if wakeup option for this device will be reconfigured through sysfs, like: echo disabled > /sys/devices/platform/extcon_usb1/power/wakeup echo enabled > /sys/devices/platform/extcon_usb1/power/wakeup Once above set of commands is executed the device's wakeup_source opject will be recreated and dev->power.wakeup->wakeirq field will contain NULL. As result, device_wakeup_arm_wake_irqs() will not arm wakeirq for the affected device. Hece, lets try to fix it in the following way: check for dev->wakeirq field when device_wakeup_attach() is called and if !NULL re-attach wakeirq to the device Signed-off-by: Grygorii Strashko Acked-by: Tony Lindgren Signed-off-by: Rafael J. Wysocki --- drivers/base/power/wakeup.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index a1e0b9ab847a..5fb7718f256c 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -246,6 +246,8 @@ static int device_wakeup_attach(struct device *dev, struct wakeup_source *ws) return -EEXIST; } dev->power.wakeup = ws; + if (dev->power.wakeirq) + device_wakeup_attach_irq(dev, dev->power.wakeirq); spin_unlock_irq(&dev->power.lock); return 0; }