Merge branches 'pm-cpufreq' and 'pm-cpuidle'

* pm-cpufreq:
  cpufreq: governor: Always schedule work on the CPU running update
  cpufreq: Always update current frequency before startig governor
  cpufreq: Introduce cpufreq_update_current_freq()
  cpufreq: Introduce cpufreq_start_governor()
  cpufreq: powernv: Add sysfs attributes to show throttle stats
  cpufreq: acpi-cpufreq: make Intel/AMD MSR access, io port access static
  cpufreq: powernv: Define per_cpu chip pointer to optimize hot-path
  cpufreq: acpi-cpufreq: Clean up hot plug notifier callback
  intel_pstate: Do not call wrmsrl_on_cpu() with disabled interrupts
  cpufreq: Make cpufreq_quick_get() safe to call

* pm-cpuidle:
  intel_idle: Support for Intel Xeon Phi Processor x200 Product Family
  intel_idle: prevent SKL-H boot failure when C8+C9+C10 enabled
  cpuidle: menu: Fall back to polling if next timer event is near
  cpuidle: menu: use high confidence factors only when considering polling
This commit is contained in:
Rafael J. Wysocki 2016-03-25 00:57:22 +01:00
commit 33068b61f8
8 changed files with 412 additions and 165 deletions

View File

@ -271,3 +271,72 @@ Description: Parameters for the CPU cache attributes
- WriteBack: data is written only to the cache line and
the modified cache line is written to main
memory only when it is replaced
What: /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats
/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/turbo_stat
/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/sub_turbo_stat
/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/unthrottle
/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/powercap
/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/overtemp
/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/supply_fault
/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/overcurrent
/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/occ_reset
Date: March 2016
Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
Linux for PowerPC mailing list <linuxppc-dev@ozlabs.org>
Description: POWERNV CPUFreq driver's frequency throttle stats directory and
attributes
'cpuX/cpufreq/throttle_stats' directory contains the CPU frequency
throttle stat attributes for the chip. The throttle stats of a cpu
is common across all the cpus belonging to a chip. Below are the
throttle attributes exported in the 'throttle_stats' directory:
- turbo_stat : This file gives the total number of times the max
frequency is throttled to lower frequency in turbo (at and above
nominal frequency) range of frequencies.
- sub_turbo_stat : This file gives the total number of times the
max frequency is throttled to lower frequency in sub-turbo(below
nominal frequency) range of frequencies.
- unthrottle : This file gives the total number of times the max
frequency is unthrottled after being throttled.
- powercap : This file gives the total number of times the max
frequency is throttled due to 'Power Capping'.
- overtemp : This file gives the total number of times the max
frequency is throttled due to 'CPU Over Temperature'.
- supply_fault : This file gives the total number of times the
max frequency is throttled due to 'Power Supply Failure'.
- overcurrent : This file gives the total number of times the
max frequency is throttled due to 'Overcurrent'.
- occ_reset : This file gives the total number of times the max
frequency is throttled due to 'OCC Reset'.
The sysfs attributes representing different throttle reasons like
powercap, overtemp, supply_fault, overcurrent and occ_reset map to
the reasons provided by OCC firmware for throttling the frequency.
What: /sys/devices/system/cpu/cpufreq/policyX/throttle_stats
/sys/devices/system/cpu/cpufreq/policyX/throttle_stats/turbo_stat
/sys/devices/system/cpu/cpufreq/policyX/throttle_stats/sub_turbo_stat
/sys/devices/system/cpu/cpufreq/policyX/throttle_stats/unthrottle
/sys/devices/system/cpu/cpufreq/policyX/throttle_stats/powercap
/sys/devices/system/cpu/cpufreq/policyX/throttle_stats/overtemp
/sys/devices/system/cpu/cpufreq/policyX/throttle_stats/supply_fault
/sys/devices/system/cpu/cpufreq/policyX/throttle_stats/overcurrent
/sys/devices/system/cpu/cpufreq/policyX/throttle_stats/occ_reset
Date: March 2016
Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
Linux for PowerPC mailing list <linuxppc-dev@ozlabs.org>
Description: POWERNV CPUFreq driver's frequency throttle stats directory and
attributes
'policyX/throttle_stats' directory and all the attributes are same as
the /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats directory and
attributes which give the frequency throttle information of the chip.

View File

@ -245,7 +245,7 @@ static unsigned extract_freq(u32 val, struct acpi_cpufreq_data *data)
}
}
u32 cpu_freq_read_intel(struct acpi_pct_register *not_used)
static u32 cpu_freq_read_intel(struct acpi_pct_register *not_used)
{
u32 val, dummy;
@ -253,7 +253,7 @@ u32 cpu_freq_read_intel(struct acpi_pct_register *not_used)
return val;
}
void cpu_freq_write_intel(struct acpi_pct_register *not_used, u32 val)
static void cpu_freq_write_intel(struct acpi_pct_register *not_used, u32 val)
{
u32 lo, hi;
@ -262,7 +262,7 @@ void cpu_freq_write_intel(struct acpi_pct_register *not_used, u32 val)
wrmsr(MSR_IA32_PERF_CTL, lo, hi);
}
u32 cpu_freq_read_amd(struct acpi_pct_register *not_used)
static u32 cpu_freq_read_amd(struct acpi_pct_register *not_used)
{
u32 val, dummy;
@ -270,12 +270,12 @@ u32 cpu_freq_read_amd(struct acpi_pct_register *not_used)
return val;
}
void cpu_freq_write_amd(struct acpi_pct_register *not_used, u32 val)
static void cpu_freq_write_amd(struct acpi_pct_register *not_used, u32 val)
{
wrmsr(MSR_AMD_PERF_CTL, val, 0);
}
u32 cpu_freq_read_io(struct acpi_pct_register *reg)
static u32 cpu_freq_read_io(struct acpi_pct_register *reg)
{
u32 val;
@ -283,7 +283,7 @@ u32 cpu_freq_read_io(struct acpi_pct_register *reg)
return val;
}
void cpu_freq_write_io(struct acpi_pct_register *reg, u32 val)
static void cpu_freq_write_io(struct acpi_pct_register *reg, u32 val)
{
acpi_os_write_port(reg->address, val, reg->bit_width);
}
@ -514,8 +514,10 @@ static int boost_notify(struct notifier_block *nb, unsigned long action,
*/
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
case CPU_DOWN_FAILED:
case CPU_DOWN_FAILED_FROZEN:
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
boost_set_msrs(acpi_cpufreq_driver.boost_enabled, cpumask);
break;

View File

@ -76,6 +76,7 @@ static inline bool has_target(void)
/* internal prototypes */
static int cpufreq_governor(struct cpufreq_policy *policy, unsigned int event);
static unsigned int __cpufreq_get(struct cpufreq_policy *policy);
static int cpufreq_start_governor(struct cpufreq_policy *policy);
/**
* Two notifier lists: the "policy" list is involved in the
@ -964,10 +965,7 @@ static int cpufreq_add_policy_cpu(struct cpufreq_policy *policy, unsigned int cp
cpumask_set_cpu(cpu, policy->cpus);
if (has_target()) {
ret = cpufreq_governor(policy, CPUFREQ_GOV_START);
if (!ret)
ret = cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
ret = cpufreq_start_governor(policy);
if (ret)
pr_err("%s: Failed to start governor\n", __func__);
}
@ -1308,10 +1306,7 @@ static void cpufreq_offline(unsigned int cpu)
/* Start governor again for active policy */
if (!policy_is_inactive(policy)) {
if (has_target()) {
ret = cpufreq_governor(policy, CPUFREQ_GOV_START);
if (!ret)
ret = cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
ret = cpufreq_start_governor(policy);
if (ret)
pr_err("%s: Failed to start governor\n", __func__);
}
@ -1401,9 +1396,17 @@ unsigned int cpufreq_quick_get(unsigned int cpu)
{
struct cpufreq_policy *policy;
unsigned int ret_freq = 0;
unsigned long flags;
if (cpufreq_driver && cpufreq_driver->setpolicy && cpufreq_driver->get)
return cpufreq_driver->get(cpu);
read_lock_irqsave(&cpufreq_driver_lock, flags);
if (cpufreq_driver && cpufreq_driver->setpolicy && cpufreq_driver->get) {
ret_freq = cpufreq_driver->get(cpu);
read_unlock_irqrestore(&cpufreq_driver_lock, flags);
return ret_freq;
}
read_unlock_irqrestore(&cpufreq_driver_lock, flags);
policy = cpufreq_cpu_get(cpu);
if (policy) {
@ -1484,6 +1487,24 @@ unsigned int cpufreq_get(unsigned int cpu)
}
EXPORT_SYMBOL(cpufreq_get);
static unsigned int cpufreq_update_current_freq(struct cpufreq_policy *policy)
{
unsigned int new_freq;
new_freq = cpufreq_driver->get(policy->cpu);
if (!new_freq)
return 0;
if (!policy->cur) {
pr_debug("cpufreq: Driver did not initialize current freq\n");
policy->cur = new_freq;
} else if (policy->cur != new_freq && has_target()) {
cpufreq_out_of_sync(policy, new_freq);
}
return new_freq;
}
static struct subsys_interface cpufreq_interface = {
.name = "cpufreq",
.subsys = &cpu_subsys,
@ -1583,9 +1604,7 @@ void cpufreq_resume(void)
policy);
} else {
down_write(&policy->rwsem);
ret = cpufreq_governor(policy, CPUFREQ_GOV_START);
if (!ret)
cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
ret = cpufreq_start_governor(policy);
up_write(&policy->rwsem);
if (ret)
@ -1593,17 +1612,6 @@ void cpufreq_resume(void)
__func__, policy);
}
}
/*
* schedule call cpufreq_update_policy() for first-online CPU, as that
* wouldn't be hotplugged-out on suspend. It will verify that the
* current freq is in sync with what we believe it to be.
*/
policy = cpufreq_cpu_get_raw(cpumask_first(cpu_online_mask));
if (WARN_ON(!policy))
return;
schedule_work(&policy->update);
}
/**
@ -1927,6 +1935,17 @@ static int cpufreq_governor(struct cpufreq_policy *policy, unsigned int event)
return ret;
}
static int cpufreq_start_governor(struct cpufreq_policy *policy)
{
int ret;
if (cpufreq_driver->get && !cpufreq_driver->setpolicy)
cpufreq_update_current_freq(policy);
ret = cpufreq_governor(policy, CPUFREQ_GOV_START);
return ret ? ret : cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
}
int cpufreq_register_governor(struct cpufreq_governor *governor)
{
int err;
@ -2063,8 +2082,10 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
return cpufreq_driver->setpolicy(new_policy);
}
if (new_policy->governor == policy->governor)
goto out;
if (new_policy->governor == policy->governor) {
pr_debug("cpufreq: governor limits update\n");
return cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
}
pr_debug("governor switch\n");
@ -2092,10 +2113,11 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
policy->governor = new_policy->governor;
ret = cpufreq_governor(policy, CPUFREQ_GOV_POLICY_INIT);
if (!ret) {
ret = cpufreq_governor(policy, CPUFREQ_GOV_START);
if (!ret)
goto out;
ret = cpufreq_start_governor(policy);
if (!ret) {
pr_debug("cpufreq: governor change\n");
return 0;
}
cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT);
}
@ -2106,14 +2128,10 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
if (cpufreq_governor(policy, CPUFREQ_GOV_POLICY_INIT))
policy->governor = NULL;
else
cpufreq_governor(policy, CPUFREQ_GOV_START);
cpufreq_start_governor(policy);
}
return ret;
out:
pr_debug("governor: change or update limits\n");
return cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
}
/**
@ -2144,19 +2162,11 @@ int cpufreq_update_policy(unsigned int cpu)
* -> ask driver for current freq and notify governors about a change
*/
if (cpufreq_driver->get && !cpufreq_driver->setpolicy) {
new_policy.cur = cpufreq_driver->get(cpu);
new_policy.cur = cpufreq_update_current_freq(policy);
if (WARN_ON(!new_policy.cur)) {
ret = -EIO;
goto unlock;
}
if (!policy->cur) {
pr_debug("Driver did not initialize current freq\n");
policy->cur = new_policy.cur;
} else {
if (policy->cur != new_policy.cur && has_target())
cpufreq_out_of_sync(policy, new_policy.cur);
}
}
ret = cpufreq_set_policy(policy, &new_policy);

View File

@ -329,7 +329,7 @@ static void dbs_irq_work(struct irq_work *irq_work)
struct policy_dbs_info *policy_dbs;
policy_dbs = container_of(irq_work, struct policy_dbs_info, irq_work);
schedule_work(&policy_dbs->work);
schedule_work_on(smp_processor_id(), &policy_dbs->work);
}
static void dbs_update_util_handler(struct update_util_data *data, u64 time,

View File

@ -134,7 +134,7 @@ struct pstate_funcs {
int (*get_min)(void);
int (*get_turbo)(void);
int (*get_scaling)(void);
void (*set)(struct cpudata*, int pstate);
u64 (*get_val)(struct cpudata*, int pstate);
void (*get_vid)(struct cpudata *);
int32_t (*get_target_pstate)(struct cpudata *);
};
@ -565,7 +565,7 @@ static int atom_get_turbo_pstate(void)
return value & 0x7F;
}
static void atom_set_pstate(struct cpudata *cpudata, int pstate)
static u64 atom_get_val(struct cpudata *cpudata, int pstate)
{
u64 val;
int32_t vid_fp;
@ -585,9 +585,7 @@ static void atom_set_pstate(struct cpudata *cpudata, int pstate)
if (pstate > cpudata->pstate.max_pstate)
vid = cpudata->vid.turbo;
val |= vid;
wrmsrl_on_cpu(cpudata->cpu, MSR_IA32_PERF_CTL, val);
return val | vid;
}
static int silvermont_get_scaling(void)
@ -711,7 +709,7 @@ static inline int core_get_scaling(void)
return 100000;
}
static void core_set_pstate(struct cpudata *cpudata, int pstate)
static u64 core_get_val(struct cpudata *cpudata, int pstate)
{
u64 val;
@ -719,7 +717,7 @@ static void core_set_pstate(struct cpudata *cpudata, int pstate)
if (limits->no_turbo && !limits->turbo_disabled)
val |= (u64)1 << 32;
wrmsrl(MSR_IA32_PERF_CTL, val);
return val;
}
static int knl_get_turbo_pstate(void)
@ -750,7 +748,7 @@ static struct cpu_defaults core_params = {
.get_min = core_get_min_pstate,
.get_turbo = core_get_turbo_pstate,
.get_scaling = core_get_scaling,
.set = core_set_pstate,
.get_val = core_get_val,
.get_target_pstate = get_target_pstate_use_performance,
},
};
@ -769,7 +767,7 @@ static struct cpu_defaults silvermont_params = {
.get_max_physical = atom_get_max_pstate,
.get_min = atom_get_min_pstate,
.get_turbo = atom_get_turbo_pstate,
.set = atom_set_pstate,
.get_val = atom_get_val,
.get_scaling = silvermont_get_scaling,
.get_vid = atom_get_vid,
.get_target_pstate = get_target_pstate_use_cpu_load,
@ -790,7 +788,7 @@ static struct cpu_defaults airmont_params = {
.get_max_physical = atom_get_max_pstate,
.get_min = atom_get_min_pstate,
.get_turbo = atom_get_turbo_pstate,
.set = atom_set_pstate,
.get_val = atom_get_val,
.get_scaling = airmont_get_scaling,
.get_vid = atom_get_vid,
.get_target_pstate = get_target_pstate_use_cpu_load,
@ -812,7 +810,7 @@ static struct cpu_defaults knl_params = {
.get_min = core_get_min_pstate,
.get_turbo = knl_get_turbo_pstate,
.get_scaling = core_get_scaling,
.set = core_set_pstate,
.get_val = core_get_val,
.get_target_pstate = get_target_pstate_use_performance,
},
};
@ -839,25 +837,24 @@ static void intel_pstate_get_min_max(struct cpudata *cpu, int *min, int *max)
*min = clamp_t(int, min_perf, cpu->pstate.min_pstate, max_perf);
}
static void intel_pstate_set_pstate(struct cpudata *cpu, int pstate, bool force)
static inline void intel_pstate_record_pstate(struct cpudata *cpu, int pstate)
{
int max_perf, min_perf;
if (force) {
update_turbo_state();
intel_pstate_get_min_max(cpu, &min_perf, &max_perf);
pstate = clamp_t(int, pstate, min_perf, max_perf);
if (pstate == cpu->pstate.current_pstate)
return;
}
trace_cpu_frequency(pstate * cpu->pstate.scaling, cpu->cpu);
cpu->pstate.current_pstate = pstate;
}
pstate_funcs.set(cpu, pstate);
static void intel_pstate_set_min_pstate(struct cpudata *cpu)
{
int pstate = cpu->pstate.min_pstate;
intel_pstate_record_pstate(cpu, pstate);
/*
* Generally, there is no guarantee that this code will always run on
* the CPU being updated, so force the register update to run on the
* right CPU.
*/
wrmsrl_on_cpu(cpu->cpu, MSR_IA32_PERF_CTL,
pstate_funcs.get_val(cpu, pstate));
}
static void intel_pstate_get_cpu_pstates(struct cpudata *cpu)
@ -870,7 +867,8 @@ static void intel_pstate_get_cpu_pstates(struct cpudata *cpu)
if (pstate_funcs.get_vid)
pstate_funcs.get_vid(cpu);
intel_pstate_set_pstate(cpu, cpu->pstate.min_pstate, false);
intel_pstate_set_min_pstate(cpu);
}
static inline void intel_pstate_calc_busy(struct cpudata *cpu)
@ -997,6 +995,21 @@ static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu)
return cpu->pstate.current_pstate - pid_calc(&cpu->pid, core_busy);
}
static inline void intel_pstate_update_pstate(struct cpudata *cpu, int pstate)
{
int max_perf, min_perf;
update_turbo_state();
intel_pstate_get_min_max(cpu, &min_perf, &max_perf);
pstate = clamp_t(int, pstate, min_perf, max_perf);
if (pstate == cpu->pstate.current_pstate)
return;
intel_pstate_record_pstate(cpu, pstate);
wrmsrl(MSR_IA32_PERF_CTL, pstate_funcs.get_val(cpu, pstate));
}
static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu)
{
int from, target_pstate;
@ -1006,7 +1019,7 @@ static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu)
target_pstate = pstate_funcs.get_target_pstate(cpu);
intel_pstate_set_pstate(cpu, target_pstate, true);
intel_pstate_update_pstate(cpu, target_pstate);
sample = &cpu->sample;
trace_pstate_sample(fp_toint(sample->core_pct_busy),
@ -1180,7 +1193,7 @@ static void intel_pstate_stop_cpu(struct cpufreq_policy *policy)
if (hwp_active)
return;
intel_pstate_set_pstate(cpu, cpu->pstate.min_pstate, false);
intel_pstate_set_min_pstate(cpu);
}
static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
@ -1255,7 +1268,7 @@ static void copy_cpu_funcs(struct pstate_funcs *funcs)
pstate_funcs.get_min = funcs->get_min;
pstate_funcs.get_turbo = funcs->get_turbo;
pstate_funcs.get_scaling = funcs->get_scaling;
pstate_funcs.set = funcs->set;
pstate_funcs.get_val = funcs->get_val;
pstate_funcs.get_vid = funcs->get_vid;
pstate_funcs.get_target_pstate = funcs->get_target_pstate;

View File

@ -44,7 +44,6 @@
static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1];
static bool rebooting, throttled, occ_reset;
static unsigned int *core_to_chip_map;
static const char * const throttle_reason[] = {
"No throttling",
@ -55,6 +54,16 @@ static const char * const throttle_reason[] = {
"OCC Reset"
};
enum throttle_reason_type {
NO_THROTTLE = 0,
POWERCAP,
CPU_OVERTEMP,
POWER_SUPPLY_FAILURE,
OVERCURRENT,
OCC_RESET_THROTTLE,
OCC_MAX_REASON
};
static struct chip {
unsigned int id;
bool throttled;
@ -62,9 +71,13 @@ static struct chip {
u8 throttle_reason;
cpumask_t mask;
struct work_struct throttle;
int throttle_turbo;
int throttle_sub_turbo;
int reason[OCC_MAX_REASON];
} *chips;
static int nr_chips;
static DEFINE_PER_CPU(struct chip *, chip_info);
/*
* Note: The set of pstates consists of contiguous integers, the
@ -196,6 +209,42 @@ static struct freq_attr *powernv_cpu_freq_attr[] = {
NULL,
};
#define throttle_attr(name, member) \
static ssize_t name##_show(struct cpufreq_policy *policy, char *buf) \
{ \
struct chip *chip = per_cpu(chip_info, policy->cpu); \
\
return sprintf(buf, "%u\n", chip->member); \
} \
\
static struct freq_attr throttle_attr_##name = __ATTR_RO(name) \
throttle_attr(unthrottle, reason[NO_THROTTLE]);
throttle_attr(powercap, reason[POWERCAP]);
throttle_attr(overtemp, reason[CPU_OVERTEMP]);
throttle_attr(supply_fault, reason[POWER_SUPPLY_FAILURE]);
throttle_attr(overcurrent, reason[OVERCURRENT]);
throttle_attr(occ_reset, reason[OCC_RESET_THROTTLE]);
throttle_attr(turbo_stat, throttle_turbo);
throttle_attr(sub_turbo_stat, throttle_sub_turbo);
static struct attribute *throttle_attrs[] = {
&throttle_attr_unthrottle.attr,
&throttle_attr_powercap.attr,
&throttle_attr_overtemp.attr,
&throttle_attr_supply_fault.attr,
&throttle_attr_overcurrent.attr,
&throttle_attr_occ_reset.attr,
&throttle_attr_turbo_stat.attr,
&throttle_attr_sub_turbo_stat.attr,
NULL,
};
static const struct attribute_group throttle_attr_grp = {
.name = "throttle_stats",
.attrs = throttle_attrs,
};
/* Helper routines */
/* Access helpers to power mgt SPR */
@ -324,34 +373,35 @@ static inline unsigned int get_nominal_index(void)
static void powernv_cpufreq_throttle_check(void *data)
{
struct chip *chip;
unsigned int cpu = smp_processor_id();
unsigned int chip_id = core_to_chip_map[cpu_core_index_of_thread(cpu)];
unsigned long pmsr;
int pmsr_pmax, i;
int pmsr_pmax;
pmsr = get_pmspr(SPRN_PMSR);
for (i = 0; i < nr_chips; i++)
if (chips[i].id == chip_id)
break;
chip = this_cpu_read(chip_info);
/* Check for Pmax Capping */
pmsr_pmax = (s8)PMSR_MAX(pmsr);
if (pmsr_pmax != powernv_pstate_info.max) {
if (chips[i].throttled)
if (chip->throttled)
goto next;
chips[i].throttled = true;
if (pmsr_pmax < powernv_pstate_info.nominal)
chip->throttled = true;
if (pmsr_pmax < powernv_pstate_info.nominal) {
pr_warn_once("CPU %d on Chip %u has Pmax reduced below nominal frequency (%d < %d)\n",
cpu, chips[i].id, pmsr_pmax,
cpu, chip->id, pmsr_pmax,
powernv_pstate_info.nominal);
trace_powernv_throttle(chips[i].id,
throttle_reason[chips[i].throttle_reason],
chip->throttle_sub_turbo++;
} else {
chip->throttle_turbo++;
}
trace_powernv_throttle(chip->id,
throttle_reason[chip->throttle_reason],
pmsr_pmax);
} else if (chips[i].throttled) {
chips[i].throttled = false;
trace_powernv_throttle(chips[i].id,
throttle_reason[chips[i].throttle_reason],
} else if (chip->throttled) {
chip->throttled = false;
trace_powernv_throttle(chip->id,
throttle_reason[chip->throttle_reason],
pmsr_pmax);
}
@ -411,6 +461,21 @@ static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy)
for (i = 0; i < threads_per_core; i++)
cpumask_set_cpu(base + i, policy->cpus);
if (!policy->driver_data) {
int ret;
ret = sysfs_create_group(&policy->kobj, &throttle_attr_grp);
if (ret) {
pr_info("Failed to create throttle stats directory for cpu %d\n",
policy->cpu);
return ret;
}
/*
* policy->driver_data is used as a flag for one-time
* creation of throttle sysfs files.
*/
policy->driver_data = policy;
}
return cpufreq_table_validate_and_show(policy, powernv_freqs);
}
@ -517,8 +582,10 @@ static int powernv_cpufreq_occ_msg(struct notifier_block *nb,
break;
if (omsg.throttle_status >= 0 &&
omsg.throttle_status <= OCC_MAX_THROTTLE_STATUS)
omsg.throttle_status <= OCC_MAX_THROTTLE_STATUS) {
chips[i].throttle_reason = omsg.throttle_status;
chips[i].reason[omsg.throttle_status]++;
}
if (!omsg.throttle_status)
chips[i].restore = true;
@ -558,47 +625,34 @@ static int init_chip_info(void)
unsigned int chip[256];
unsigned int cpu, i;
unsigned int prev_chip_id = UINT_MAX;
cpumask_t cpu_mask;
int ret = -ENOMEM;
core_to_chip_map = kcalloc(cpu_nr_cores(), sizeof(unsigned int),
GFP_KERNEL);
if (!core_to_chip_map)
goto out;
cpumask_copy(&cpu_mask, cpu_possible_mask);
for_each_cpu(cpu, &cpu_mask) {
for_each_possible_cpu(cpu) {
unsigned int id = cpu_to_chip_id(cpu);
if (prev_chip_id != id) {
prev_chip_id = id;
chip[nr_chips++] = id;
}
core_to_chip_map[cpu_core_index_of_thread(cpu)] = id;
cpumask_andnot(&cpu_mask, &cpu_mask, cpu_sibling_mask(cpu));
}
chips = kcalloc(nr_chips, sizeof(struct chip), GFP_KERNEL);
if (!chips)
goto free_chip_map;
return -ENOMEM;
for (i = 0; i < nr_chips; i++) {
chips[i].id = chip[i];
cpumask_copy(&chips[i].mask, cpumask_of_node(chip[i]));
INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn);
for_each_cpu(cpu, &chips[i].mask)
per_cpu(chip_info, cpu) = &chips[i];
}
return 0;
free_chip_map:
kfree(core_to_chip_map);
out:
return ret;
}
static inline void clean_chip_info(void)
{
kfree(chips);
kfree(core_to_chip_map);
}
static inline void unregister_all_notifiers(void)

View File

@ -196,7 +196,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);
* of points is below a threshold. If it is... then use the
* average of these 8 points as the estimated value.
*/
static void get_typical_interval(struct menu_device *data)
static unsigned int get_typical_interval(struct menu_device *data)
{
int i, divisor;
unsigned int max, thresh, avg;
@ -253,9 +253,7 @@ again:
if (likely(variance <= U64_MAX/36)) {
if ((((u64)avg*avg > variance*36) && (divisor * 4 >= INTERVALS * 3))
|| variance <= 400) {
if (data->next_timer_us > avg)
data->predicted_us = avg;
return;
return avg;
}
}
@ -269,7 +267,7 @@ again:
* with sporadic activity with a bunch of short pauses.
*/
if ((divisor * 4) <= INTERVALS * 3)
return;
return UINT_MAX;
thresh = max - 1;
goto again;
@ -286,6 +284,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
int i;
unsigned int interactivity_req;
unsigned int expected_interval;
unsigned long nr_iowaiters, cpu_load;
if (data->needs_update) {
@ -312,31 +311,42 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
data->correction_factor[data->bucket],
RESOLUTION * DECAY);
get_typical_interval(data);
expected_interval = get_typical_interval(data);
expected_interval = min(expected_interval, data->next_timer_us);
if (CPUIDLE_DRIVER_STATE_START > 0) {
struct cpuidle_state *s = &drv->states[CPUIDLE_DRIVER_STATE_START];
unsigned int polling_threshold;
/*
* We want to default to C1 (hlt), not to busy polling
* unless the timer is happening really really soon, or
* C1's exit latency exceeds the user configured limit.
*/
polling_threshold = max_t(unsigned int, 20, s->target_residency);
if (data->next_timer_us > polling_threshold &&
latency_req > s->exit_latency && !s->disabled &&
!dev->states_usage[CPUIDLE_DRIVER_STATE_START].disable)
data->last_state_idx = CPUIDLE_DRIVER_STATE_START;
else
data->last_state_idx = CPUIDLE_DRIVER_STATE_START - 1;
} else {
data->last_state_idx = CPUIDLE_DRIVER_STATE_START;
}
/*
* Performance multiplier defines a minimum predicted idle
* duration / latency ratio. Adjust the latency limit if
* necessary.
* Use the lowest expected idle interval to pick the idle state.
*/
data->predicted_us = min(data->predicted_us, expected_interval);
/*
* Use the performance multiplier and the user-configurable
* latency_req to determine the maximum exit latency.
*/
interactivity_req = data->predicted_us / performance_multiplier(nr_iowaiters, cpu_load);
if (latency_req > interactivity_req)
latency_req = interactivity_req;
if (CPUIDLE_DRIVER_STATE_START > 0) {
data->last_state_idx = CPUIDLE_DRIVER_STATE_START - 1;
/*
* We want to default to C1 (hlt), not to busy polling
* unless the timer is happening really really soon.
*/
if (interactivity_req > 20 &&
!drv->states[CPUIDLE_DRIVER_STATE_START].disabled &&
dev->states_usage[CPUIDLE_DRIVER_STATE_START].disable == 0)
data->last_state_idx = CPUIDLE_DRIVER_STATE_START;
} else {
data->last_state_idx = CPUIDLE_DRIVER_STATE_START;
}
/*
* Find the idle state with the lowest power while satisfying
* our constraints.

View File

@ -65,7 +65,7 @@
#include <asm/mwait.h>
#include <asm/msr.h>
#define INTEL_IDLE_VERSION "0.4"
#define INTEL_IDLE_VERSION "0.4.1"
#define PREFIX "intel_idle: "
static struct cpuidle_driver intel_idle_driver = {
@ -716,6 +716,26 @@ static struct cpuidle_state avn_cstates[] = {
{
.enter = NULL }
};
static struct cpuidle_state knl_cstates[] = {
{
.name = "C1-KNL",
.desc = "MWAIT 0x00",
.flags = MWAIT2flg(0x00),
.exit_latency = 1,
.target_residency = 2,
.enter = &intel_idle,
.enter_freeze = intel_idle_freeze },
{
.name = "C6-KNL",
.desc = "MWAIT 0x10",
.flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED,
.exit_latency = 120,
.target_residency = 500,
.enter = &intel_idle,
.enter_freeze = intel_idle_freeze },
{
.enter = NULL }
};
/**
* intel_idle
@ -890,6 +910,10 @@ static const struct idle_cpu idle_cpu_avn = {
.disable_promotion_to_c1e = true,
};
static const struct idle_cpu idle_cpu_knl = {
.state_table = knl_cstates,
};
#define ICPU(model, cpu) \
{ X86_VENDOR_INTEL, 6, model, X86_FEATURE_MWAIT, (unsigned long)&cpu }
@ -921,6 +945,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
ICPU(0x56, idle_cpu_bdw),
ICPU(0x4e, idle_cpu_skl),
ICPU(0x5e, idle_cpu_skl),
ICPU(0x57, idle_cpu_knl),
{}
};
MODULE_DEVICE_TABLE(x86cpu, intel_idle_ids);
@ -993,37 +1018,93 @@ static void intel_idle_cpuidle_devices_uninit(void)
return;
}
/*
* ivt_idle_state_table_update(void)
*
* Tune IVT multi-socket targets
* Assumption: num_sockets == (max_package_num + 1)
*/
static void ivt_idle_state_table_update(void)
{
/* IVT uses a different table for 1-2, 3-4, and > 4 sockets */
int cpu, package_num, num_sockets = 1;
for_each_online_cpu(cpu) {
package_num = topology_physical_package_id(cpu);
if (package_num + 1 > num_sockets) {
num_sockets = package_num + 1;
if (num_sockets > 4) {
cpuidle_state_table = ivt_cstates_8s;
return;
}
}
}
if (num_sockets > 2)
cpuidle_state_table = ivt_cstates_4s;
/* else, 1 and 2 socket systems use default ivt_cstates */
}
/*
* sklh_idle_state_table_update(void)
*
* On SKL-H (model 0x5e) disable C8 and C9 if:
* C10 is enabled and SGX disabled
*/
static void sklh_idle_state_table_update(void)
{
unsigned long long msr;
unsigned int eax, ebx, ecx, edx;
/* if PC10 disabled via cmdline intel_idle.max_cstate=7 or shallower */
if (max_cstate <= 7)
return;
/* if PC10 not present in CPUID.MWAIT.EDX */
if ((mwait_substates & (0xF << 28)) == 0)
return;
rdmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr);
/* PC10 is not enabled in PKG C-state limit */
if ((msr & 0xF) != 8)
return;
ecx = 0;
cpuid(7, &eax, &ebx, &ecx, &edx);
/* if SGX is present */
if (ebx & (1 << 2)) {
rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
/* if SGX is enabled */
if (msr & (1 << 18))
return;
}
skl_cstates[5].disabled = 1; /* C8-SKL */
skl_cstates[6].disabled = 1; /* C9-SKL */
}
/*
* intel_idle_state_table_update()
*
* Update the default state_table for this CPU-id
*
* Currently used to access tuned IVT multi-socket targets
* Assumption: num_sockets == (max_package_num + 1)
*/
void intel_idle_state_table_update(void)
static void intel_idle_state_table_update(void)
{
/* IVT uses a different table for 1-2, 3-4, and > 4 sockets */
if (boot_cpu_data.x86_model == 0x3e) { /* IVT */
int cpu, package_num, num_sockets = 1;
switch (boot_cpu_data.x86_model) {
for_each_online_cpu(cpu) {
package_num = topology_physical_package_id(cpu);
if (package_num + 1 > num_sockets) {
num_sockets = package_num + 1;
if (num_sockets > 4) {
cpuidle_state_table = ivt_cstates_8s;
return;
}
}
}
if (num_sockets > 2)
cpuidle_state_table = ivt_cstates_4s;
/* else, 1 and 2 socket systems use default ivt_cstates */
case 0x3e: /* IVT */
ivt_idle_state_table_update();
break;
case 0x5e: /* SKL-H */
sklh_idle_state_table_update();
break;
}
return;
}
/*
@ -1063,6 +1144,14 @@ static int __init intel_idle_cpuidle_driver_init(void)
if (num_substates == 0)
continue;
/* if state marked as disabled, skip it */
if (cpuidle_state_table[cstate].disabled != 0) {
pr_debug(PREFIX "state %s is disabled",
cpuidle_state_table[cstate].name);
continue;
}
if (((mwait_cstate + 1) > 2) &&
!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
mark_tsc_unstable("TSC halts in idle"