Merge back cpufreq material for 6.12

Rafael J. Wysocki 2024-09-05 13:04:20 +02:00
commit 287f97a151
10 changed files with 345 additions and 35 deletions

View File

@@ -282,9 +282,22 @@ static inline long arch_scale_freq_capacity(int cpu)
}
#define arch_scale_freq_capacity arch_scale_freq_capacity
bool arch_enable_hybrid_capacity_scale(void);
void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap,
unsigned long cap_freq, unsigned long base_freq);
unsigned long arch_scale_cpu_capacity(int cpu);
#define arch_scale_cpu_capacity arch_scale_cpu_capacity
extern void arch_set_max_freq_ratio(bool turbo_disabled);
extern void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled);
#else
static inline bool arch_enable_hybrid_capacity_scale(void) { return false; }
static inline void arch_set_cpu_capacity(int cpu, unsigned long cap,
unsigned long max_cap,
unsigned long cap_freq,
unsigned long base_freq) { }
static inline void arch_set_max_freq_ratio(bool turbo_disabled) { }
static inline void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled) { }
#endif

View File

@@ -349,9 +349,89 @@ static DECLARE_WORK(disable_freq_invariance_work,
DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale);
static DEFINE_STATIC_KEY_FALSE(arch_hybrid_cap_scale_key);
struct arch_hybrid_cpu_scale {
unsigned long capacity;
unsigned long freq_ratio;
};
static struct arch_hybrid_cpu_scale __percpu *arch_cpu_scale;
/**
* arch_enable_hybrid_capacity_scale() - Enable hybrid CPU capacity scaling
*
* Allocate memory for per-CPU data used by hybrid CPU capacity scaling,
* initialize it and set the static key controlling its code paths.
*
* Must be called before arch_set_cpu_capacity().
*/
bool arch_enable_hybrid_capacity_scale(void)
{
int cpu;
if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) {
WARN_ONCE(1, "Hybrid CPU capacity scaling already enabled");
return true;
}
arch_cpu_scale = alloc_percpu(struct arch_hybrid_cpu_scale);
if (!arch_cpu_scale)
return false;
for_each_possible_cpu(cpu) {
per_cpu_ptr(arch_cpu_scale, cpu)->capacity = SCHED_CAPACITY_SCALE;
per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio = arch_max_freq_ratio;
}
static_branch_enable(&arch_hybrid_cap_scale_key);
pr_info("Hybrid CPU capacity scaling enabled\n");
return true;
}
/**
* arch_set_cpu_capacity() - Set scale-invariance parameters for a CPU
* @cpu: Target CPU.
* @cap: Capacity of @cpu at its maximum frequency, relative to @max_cap.
* @max_cap: System-wide maximum CPU capacity.
* @cap_freq: Frequency of @cpu corresponding to @cap.
* @base_freq: Frequency of @cpu at which MPERF counts.
*
* The units in which @cap and @max_cap are expressed do not matter, so long
* as they are consistent, because the former is effectively divided by the
* latter. Analogously for @cap_freq and @base_freq.
*
* After calling this function for all CPUs, call arch_rebuild_sched_domains()
* to let the scheduler know that capacity-aware scheduling can be used going
* forward.
*/
void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap,
unsigned long cap_freq, unsigned long base_freq)
{
if (static_branch_likely(&arch_hybrid_cap_scale_key)) {
WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity,
div_u64(cap << SCHED_CAPACITY_SHIFT, max_cap));
WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio,
div_u64(cap_freq << SCHED_CAPACITY_SHIFT, base_freq));
} else {
WARN_ONCE(1, "Hybrid CPU capacity scaling not enabled");
}
}
unsigned long arch_scale_cpu_capacity(int cpu)
{
if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
return READ_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity);
return SCHED_CAPACITY_SCALE;
}
EXPORT_SYMBOL_GPL(arch_scale_cpu_capacity);
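
[Illustrative only, not part of this commit: a minimal sketch of how a driver would be expected to use the interface documented above, following the enable / set-per-CPU / rebuild order from the kernel-doc. The example_*() helpers and the performance values they return are hypothetical; intel_pstate derives the real ones from the HWP capabilities, as shown later in this commit.]

static void example_enable_capacity_scaling(void)
{
	int cpu;

	if (!arch_enable_hybrid_capacity_scale())
		return;	/* Allocation failed; keep the flat capacity view. */

	for_each_online_cpu(cpu) {
		/* @cap/@max_cap and @cap_freq/@base_freq only need consistent units. */
		arch_set_cpu_capacity(cpu, example_perf_of(cpu), example_max_perf(),
				      example_perf_of(cpu), example_base_freq_of(cpu));
	}

	/* Let the scheduler pick up the asymmetric capacities. */
	arch_rebuild_sched_domains();
}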
static void scale_freq_tick(u64 acnt, u64 mcnt)
{
-	u64 freq_scale;
+	u64 freq_scale, freq_ratio;
if (!arch_scale_freq_invariant())
return;
@@ -359,7 +439,12 @@ static void scale_freq_tick(u64 acnt, u64 mcnt)
if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
goto error;
-	if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt)
+	if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
+		freq_ratio = READ_ONCE(this_cpu_ptr(arch_cpu_scale)->freq_ratio);
+	else
+		freq_ratio = arch_max_freq_ratio;
+
+	if (check_mul_overflow(mcnt, freq_ratio, &mcnt) || !mcnt)
goto error;
freq_scale = div64_u64(acnt, mcnt);
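
[Worked example with illustrative numbers, not from the commit: with SCHED_CAPACITY_SHIFT == 10, a CPU whose cap_freq is twice its base_freq gets

	freq_ratio = (cap_freq << SCHED_CAPACITY_SHIFT) / base_freq = 2048

from arch_set_cpu_capacity(). If that CPU then runs at its base frequency for a tick, APERF and MPERF advance at the same rate (acnt == mcnt), and the code above computes

	freq_scale = (acnt << 2 * SCHED_CAPACITY_SHIFT) / (mcnt * freq_ratio)
		   = 2^20 / 2048 = 512

i.e. half of SCHED_CAPACITY_SCALE, as expected for a CPU running at half of its maximum frequency.]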

View File

@@ -575,30 +575,11 @@ unsigned int cpufreq_policy_transition_delay_us(struct cpufreq_policy *policy)
return policy->transition_delay_us;
latency = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
-	if (latency) {
-		unsigned int max_delay_us = 2 * MSEC_PER_SEC;
-
-		/*
-		 * If the platform already has high transition_latency, use it
-		 * as-is.
-		 */
-		if (latency > max_delay_us)
-			return latency;
-
-		/*
-		 * For platforms that can change the frequency very fast (< 2
-		 * us), the above formula gives a decent transition delay. But
-		 * for platforms where transition_latency is in milliseconds, it
-		 * ends up giving unrealistic values.
-		 *
-		 * Cap the default transition delay to 2 ms, which seems to be
-		 * a reasonable amount of time after which we should reevaluate
-		 * the frequency.
-		 */
-		return min(latency * LATENCY_MULTIPLIER, max_delay_us);
-	}
-
-	return LATENCY_MULTIPLIER;
+	if (latency)
+		/* Give a 50% breathing room between updates */
+		return latency + (latency >> 1);
+
+	return USEC_PER_MSEC;
}
EXPORT_SYMBOL_GPL(cpufreq_policy_transition_delay_us);
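
[Worked example based on the code above; the latency values are made up. A policy with cpuinfo.transition_latency = 20,000 ns gives latency = 20 us, so the new default transition delay is 20 + (20 >> 1) = 30 us; a policy reporting no transition latency falls back to USEC_PER_MSEC, i.e. 1000 us. For the same 20 us latency, the removed code would have returned min(20 * LATENCY_MULTIPLIER, 2 * MSEC_PER_SEC) = 2000 us, illustrating how much shorter the new default is.]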

View File

@@ -16,6 +16,7 @@
#include <linux/tick.h>
#include <linux/slab.h>
#include <linux/sched/cpufreq.h>
#include <linux/sched/smt.h>
#include <linux/list.h>
#include <linux/cpu.h>
#include <linux/cpufreq.h>
@@ -215,6 +216,7 @@ struct global_params {
* @hwp_req_cached: Cached value of the last HWP Request MSR
* @hwp_cap_cached: Cached value of the last HWP Capabilities MSR
* @last_io_update: Last time when IO wake flag was set
* @capacity_perf: Highest perf used for scale invariance
* @sched_flags: Store scheduler flags for possible cross CPU update
* @hwp_boost_min: Last HWP boosted min performance
* @suspended: Whether or not the driver has been suspended.
@@ -253,6 +255,7 @@ struct cpudata {
u64 hwp_req_cached;
u64 hwp_cap_cached;
u64 last_io_update;
unsigned int capacity_perf;
unsigned int sched_flags;
u32 hwp_boost_min;
bool suspended;
@@ -295,6 +298,7 @@ static int hwp_mode_bdw __ro_after_init;
static bool per_cpu_limits __ro_after_init;
static bool hwp_forced __ro_after_init;
static bool hwp_boost __read_mostly;
static bool hwp_is_hybrid;
static struct cpufreq_driver *intel_pstate_driver __read_mostly;
@@ -934,6 +938,139 @@ static struct freq_attr *hwp_cpufreq_attrs[] = {
NULL,
};
static struct cpudata *hybrid_max_perf_cpu __read_mostly;
/*
* Protects hybrid_max_perf_cpu, the capacity_perf fields in struct cpudata,
* and the x86 arch scale-invariance information from concurrent updates.
*/
static DEFINE_MUTEX(hybrid_capacity_lock);
static void hybrid_set_cpu_capacity(struct cpudata *cpu)
{
arch_set_cpu_capacity(cpu->cpu, cpu->capacity_perf,
hybrid_max_perf_cpu->capacity_perf,
cpu->capacity_perf,
cpu->pstate.max_pstate_physical);
pr_debug("CPU%d: perf = %u, max. perf = %u, base perf = %d\n", cpu->cpu,
cpu->capacity_perf, hybrid_max_perf_cpu->capacity_perf,
cpu->pstate.max_pstate_physical);
}
static void hybrid_clear_cpu_capacity(unsigned int cpunum)
{
arch_set_cpu_capacity(cpunum, 1, 1, 1, 1);
}
static void hybrid_get_capacity_perf(struct cpudata *cpu)
{
if (READ_ONCE(global.no_turbo)) {
cpu->capacity_perf = cpu->pstate.max_pstate_physical;
return;
}
cpu->capacity_perf = HWP_HIGHEST_PERF(READ_ONCE(cpu->hwp_cap_cached));
}
static void hybrid_set_capacity_of_cpus(void)
{
int cpunum;
for_each_online_cpu(cpunum) {
struct cpudata *cpu = all_cpu_data[cpunum];
if (cpu)
hybrid_set_cpu_capacity(cpu);
}
}
static void hybrid_update_cpu_capacity_scaling(void)
{
struct cpudata *max_perf_cpu = NULL;
unsigned int max_cap_perf = 0;
int cpunum;
for_each_online_cpu(cpunum) {
struct cpudata *cpu = all_cpu_data[cpunum];
if (!cpu)
continue;
/*
* During initialization, CPU performance at full capacity needs
* to be determined.
*/
if (!hybrid_max_perf_cpu)
hybrid_get_capacity_perf(cpu);
/*
* If hybrid_max_perf_cpu is not NULL at this point, it is
* being replaced, so don't take it into account when looking
* for the new one.
*/
if (cpu == hybrid_max_perf_cpu)
continue;
if (cpu->capacity_perf > max_cap_perf) {
max_cap_perf = cpu->capacity_perf;
max_perf_cpu = cpu;
}
}
if (max_perf_cpu) {
hybrid_max_perf_cpu = max_perf_cpu;
hybrid_set_capacity_of_cpus();
} else {
pr_info("Found no CPUs with nonzero maximum performance\n");
/* Revert to the flat CPU capacity structure. */
for_each_online_cpu(cpunum)
hybrid_clear_cpu_capacity(cpunum);
}
}
static void __hybrid_init_cpu_capacity_scaling(void)
{
hybrid_max_perf_cpu = NULL;
hybrid_update_cpu_capacity_scaling();
}
static void hybrid_init_cpu_capacity_scaling(void)
{
bool disable_itmt = false;
mutex_lock(&hybrid_capacity_lock);
/*
* If hybrid_max_perf_cpu is set at this point, the hybrid CPU capacity
* scaling has been enabled already and the driver is just changing the
* operation mode.
*/
if (hybrid_max_perf_cpu) {
__hybrid_init_cpu_capacity_scaling();
goto unlock;
}
/*
* On hybrid systems, use asym capacity instead of ITMT, but because
* the capacity of SMT threads is not deterministic even approximately,
* do not do that when SMT is in use.
*/
if (hwp_is_hybrid && !sched_smt_active() && arch_enable_hybrid_capacity_scale()) {
__hybrid_init_cpu_capacity_scaling();
disable_itmt = true;
}
unlock:
mutex_unlock(&hybrid_capacity_lock);
/*
* Disabling ITMT causes sched domains to be rebuilt to disable asym
* packing and enable asym capacity.
*/
if (disable_itmt)
sched_clear_itmt_support();
}
static void __intel_pstate_get_hwp_cap(struct cpudata *cpu)
{
u64 cap;
@@ -962,6 +1099,43 @@ static void intel_pstate_get_hwp_cap(struct cpudata *cpu)
}
}
static void hybrid_update_capacity(struct cpudata *cpu)
{
unsigned int max_cap_perf;
mutex_lock(&hybrid_capacity_lock);
if (!hybrid_max_perf_cpu)
goto unlock;
/*
* The maximum performance of the CPU may have changed, but assume
* that the performance of the other CPUs has not changed.
*/
max_cap_perf = hybrid_max_perf_cpu->capacity_perf;
intel_pstate_get_hwp_cap(cpu);
hybrid_get_capacity_perf(cpu);
/* Should hybrid_max_perf_cpu be replaced by this CPU? */
if (cpu->capacity_perf > max_cap_perf) {
hybrid_max_perf_cpu = cpu;
hybrid_set_capacity_of_cpus();
goto unlock;
}
/* If this CPU is hybrid_max_perf_cpu, should it be replaced? */
if (cpu == hybrid_max_perf_cpu && cpu->capacity_perf < max_cap_perf) {
hybrid_update_cpu_capacity_scaling();
goto unlock;
}
hybrid_set_cpu_capacity(cpu);
unlock:
mutex_unlock(&hybrid_capacity_lock);
}
static void intel_pstate_hwp_set(unsigned int cpu)
{
struct cpudata *cpu_data = all_cpu_data[cpu];
@@ -1070,6 +1244,22 @@ static void intel_pstate_hwp_offline(struct cpudata *cpu)
value |= HWP_ENERGY_PERF_PREFERENCE(HWP_EPP_POWERSAVE);
wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value);
mutex_lock(&hybrid_capacity_lock);
if (!hybrid_max_perf_cpu) {
mutex_unlock(&hybrid_capacity_lock);
return;
}
if (hybrid_max_perf_cpu == cpu)
hybrid_update_cpu_capacity_scaling();
mutex_unlock(&hybrid_capacity_lock);
/* Reset the capacity of the CPU going offline to the initial value. */
hybrid_clear_cpu_capacity(cpu->cpu);
}
#define POWER_CTL_EE_ENABLE 1
@@ -1165,21 +1355,46 @@ static void __intel_pstate_update_max_freq(struct cpudata *cpudata,
static void intel_pstate_update_limits(unsigned int cpu)
{
struct cpufreq_policy *policy = cpufreq_cpu_acquire(cpu);
struct cpudata *cpudata;
if (!policy)
return;
-	__intel_pstate_update_max_freq(all_cpu_data[cpu], policy);
+	cpudata = all_cpu_data[cpu];
+
+	__intel_pstate_update_max_freq(cpudata, policy);
/* Prevent the driver from being unregistered now. */
mutex_lock(&intel_pstate_driver_lock);
cpufreq_cpu_release(policy);
hybrid_update_capacity(cpudata);
mutex_unlock(&intel_pstate_driver_lock);
}
static void intel_pstate_update_limits_for_all(void)
{
int cpu;
-	for_each_possible_cpu(cpu)
-		intel_pstate_update_limits(cpu);
+	for_each_possible_cpu(cpu) {
+		struct cpufreq_policy *policy = cpufreq_cpu_acquire(cpu);
+
+		if (!policy)
+			continue;
+
+		__intel_pstate_update_max_freq(all_cpu_data[cpu], policy);
+
+		cpufreq_cpu_release(policy);
+	}
mutex_lock(&hybrid_capacity_lock);
if (hybrid_max_perf_cpu)
__hybrid_init_cpu_capacity_scaling();
mutex_unlock(&hybrid_capacity_lock);
}
/************************** sysfs begin ************************/
@@ -1618,6 +1833,13 @@ static void intel_pstate_notify_work(struct work_struct *work)
__intel_pstate_update_max_freq(cpudata, policy);
cpufreq_cpu_release(policy);
/*
* The driver will not be unregistered while this function is
* running, so update the capacity without acquiring the driver
* lock.
*/
hybrid_update_capacity(cpudata);
}
wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_STATUS, 0);
@@ -2034,8 +2256,10 @@ static void intel_pstate_get_cpu_pstates(struct cpudata *cpu)
if (pstate_funcs.get_cpu_scaling) {
cpu->pstate.scaling = pstate_funcs.get_cpu_scaling(cpu->cpu);
-		if (cpu->pstate.scaling != perf_ctl_scaling)
+		if (cpu->pstate.scaling != perf_ctl_scaling) {
			intel_pstate_hybrid_hwp_adjust(cpu);
+			hwp_is_hybrid = true;
+		}
} else {
cpu->pstate.scaling = perf_ctl_scaling;
}
@@ -2425,6 +2649,10 @@ static const struct x86_cpu_id intel_pstate_cpu_oob_ids[] __initconst = {
X86_MATCH(INTEL_ICELAKE_X, core_funcs),
X86_MATCH(INTEL_SAPPHIRERAPIDS_X, core_funcs),
X86_MATCH(INTEL_EMERALDRAPIDS_X, core_funcs),
X86_MATCH(INTEL_GRANITERAPIDS_D, core_funcs),
X86_MATCH(INTEL_GRANITERAPIDS_X, core_funcs),
X86_MATCH(INTEL_ATOM_CRESTMONT, core_funcs),
X86_MATCH(INTEL_ATOM_CRESTMONT_X, core_funcs),
{}
};
#endif
@@ -2703,6 +2931,8 @@ static int intel_pstate_cpu_online(struct cpufreq_policy *policy)
*/
intel_pstate_hwp_reenable(cpu);
cpu->suspended = false;
hybrid_update_capacity(cpu);
}
return 0;
@@ -3143,6 +3373,8 @@ static int intel_pstate_register_driver(struct cpufreq_driver *driver)
global.min_perf_pct = min_perf_pct_min();
hybrid_init_cpu_capacity_scaling();
return 0;
}

View File

@@ -238,4 +238,5 @@ bail_noprops:
module_init(maple_cpufreq_init);
MODULE_DESCRIPTION("cpufreq driver for Maple 970FX/970MP boards");
MODULE_LICENSE("GPL");

View File

@@ -269,5 +269,6 @@ static void __exit pas_cpufreq_exit(void)
module_init(pas_cpufreq_init);
module_exit(pas_cpufreq_exit);
MODULE_DESCRIPTION("cpufreq driver for PA Semi PWRficient");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Egor Martovetsky <egor@pasemi.com>, Olof Johansson <olof@lixom.net>");

View File

@@ -671,4 +671,5 @@ static int __init g5_cpufreq_init(void)
module_init(g5_cpufreq_init);
MODULE_DESCRIPTION("cpufreq driver for SMU & 970FX based G5 Macs");
MODULE_LICENSE("GPL");

View File

@@ -1160,5 +1160,6 @@ static void __exit powernv_cpufreq_exit(void)
}
module_exit(powernv_cpufreq_exit);
MODULE_DESCRIPTION("cpufreq driver for IBM/OpenPOWER powernv systems");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Vaidyanathan Srinivasan <svaidy at linux.vnet.ibm.com>");

View File

@@ -168,5 +168,6 @@ static void __exit cbe_cpufreq_exit(void)
module_init(cbe_cpufreq_init);
module_exit(cbe_cpufreq_exit);
MODULE_DESCRIPTION("cpufreq driver for Cell BE processors");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Christian Krafft <krafft@de.ibm.com>");

View File

@@ -577,12 +577,6 @@ static inline unsigned long cpufreq_scale(unsigned long old, u_int div,
#define CPUFREQ_POLICY_POWERSAVE (1)
#define CPUFREQ_POLICY_PERFORMANCE (2)
-/*
- * The polling frequency depends on the capability of the processor. Default
- * polling frequency is 1000 times the transition latency of the processor.
- */
-#define LATENCY_MULTIPLIER (1000)
struct cpufreq_governor {
char name[CPUFREQ_NAME_LEN];
int (*init)(struct cpufreq_policy *policy);