Merge back cpufreq material for 6.12

Rafael J. Wysocki 2024-09-05 13:04:20 +02:00
commit 287f97a151
10 changed files with 345 additions and 35 deletions

View File

@@ -282,9 +282,22 @@ static inline long arch_scale_freq_capacity(int cpu)
}
#define arch_scale_freq_capacity arch_scale_freq_capacity
bool arch_enable_hybrid_capacity_scale(void);
void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap,
unsigned long cap_freq, unsigned long base_freq);
unsigned long arch_scale_cpu_capacity(int cpu);
#define arch_scale_cpu_capacity arch_scale_cpu_capacity
extern void arch_set_max_freq_ratio(bool turbo_disabled);
extern void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled);
#else
static inline bool arch_enable_hybrid_capacity_scale(void) { return false; }
static inline void arch_set_cpu_capacity(int cpu, unsigned long cap,
unsigned long max_cap,
unsigned long cap_freq,
unsigned long base_freq) { }
static inline void arch_set_max_freq_ratio(bool turbo_disabled) { }
static inline void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled) { }
#endif

View File

@@ -349,9 +349,89 @@ static DECLARE_WORK(disable_freq_invariance_work,
DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale);
static DEFINE_STATIC_KEY_FALSE(arch_hybrid_cap_scale_key);
struct arch_hybrid_cpu_scale {
unsigned long capacity;
unsigned long freq_ratio;
};
static struct arch_hybrid_cpu_scale __percpu *arch_cpu_scale;
/**
* arch_enable_hybrid_capacity_scale() - Enable hybrid CPU capacity scaling
*
* Allocate memory for per-CPU data used by hybrid CPU capacity scaling,
* initialize it and set the static key controlling its code paths.
*
* Must be called before arch_set_cpu_capacity().
*/
bool arch_enable_hybrid_capacity_scale(void)
{
int cpu;
if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) {
WARN_ONCE(1, "Hybrid CPU capacity scaling already enabled");
return true;
}
arch_cpu_scale = alloc_percpu(struct arch_hybrid_cpu_scale);
if (!arch_cpu_scale)
return false;
for_each_possible_cpu(cpu) {
per_cpu_ptr(arch_cpu_scale, cpu)->capacity = SCHED_CAPACITY_SCALE;
per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio = arch_max_freq_ratio;
}
static_branch_enable(&arch_hybrid_cap_scale_key);
pr_info("Hybrid CPU capacity scaling enabled\n");
return true;
}
/**
* arch_set_cpu_capacity() - Set scale-invariance parameters for a CPU
* @cpu: Target CPU.
* @cap: Capacity of @cpu at its maximum frequency, relative to @max_cap.
* @max_cap: System-wide maximum CPU capacity.
* @cap_freq: Frequency of @cpu corresponding to @cap.
* @base_freq: Frequency of @cpu at which MPERF counts.
*
* The units in which @cap and @max_cap are expressed do not matter, so long
* as they are consistent, because the former is effectively divided by the
* latter. Analogously for @cap_freq and @base_freq.
*
* After calling this function for all CPUs, call arch_rebuild_sched_domains()
* to let the scheduler know that capacity-aware scheduling can be used going
* forward.
*/
void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap,
unsigned long cap_freq, unsigned long base_freq)
{
if (static_branch_likely(&arch_hybrid_cap_scale_key)) {
WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity,
div_u64(cap << SCHED_CAPACITY_SHIFT, max_cap));
WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio,
div_u64(cap_freq << SCHED_CAPACITY_SHIFT, base_freq));
} else {
WARN_ONCE(1, "Hybrid CPU capacity scaling not enabled");
}
}
unsigned long arch_scale_cpu_capacity(int cpu)
{
if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
return READ_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity);
return SCHED_CAPACITY_SCALE;
}
EXPORT_SYMBOL_GPL(arch_scale_cpu_capacity);
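
[Illustrative only, not part of this commit: a minimal sketch of how a driver would be expected to use the interface documented above, following the enable / set-per-CPU / rebuild order from the kernel-doc. The example_*() helpers and the performance values they return are hypothetical; intel_pstate derives the real ones from the HWP capabilities, as shown later in this commit.]

static void example_enable_capacity_scaling(void)
{
	int cpu;

	if (!arch_enable_hybrid_capacity_scale())
		return;	/* Allocation failed; keep the flat capacity view. */

	for_each_online_cpu(cpu) {
		/* @cap/@max_cap and @cap_freq/@base_freq only need consistent units. */
		arch_set_cpu_capacity(cpu, example_perf_of(cpu), example_max_perf(),
				      example_perf_of(cpu), example_base_freq_of(cpu));
	}

	/* Let the scheduler pick up the asymmetric capacities. */
	arch_rebuild_sched_domains();
}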
static void scale_freq_tick(u64 acnt, u64 mcnt)
{
-	u64 freq_scale;
+	u64 freq_scale, freq_ratio;
if (!arch_scale_freq_invariant())
return;
@@ -359,7 +439,12 @@ static void scale_freq_tick(u64 acnt, u64 mcnt)
if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
goto error;
-	if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt)
+	if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
+		freq_ratio = READ_ONCE(this_cpu_ptr(arch_cpu_scale)->freq_ratio);
+	else
+		freq_ratio = arch_max_freq_ratio;
+
+	if (check_mul_overflow(mcnt, freq_ratio, &mcnt) || !mcnt)
goto error;
freq_scale = div64_u64(acnt, mcnt);
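
[Worked example with illustrative numbers, not from the commit: with SCHED_CAPACITY_SHIFT == 10, a CPU whose cap_freq is twice its base_freq gets

	freq_ratio = (cap_freq << SCHED_CAPACITY_SHIFT) / base_freq = 2048

from arch_set_cpu_capacity(). If that CPU then runs at its base frequency for a tick, APERF and MPERF advance at the same rate (acnt == mcnt), and the code above computes

	freq_scale = (acnt << 2 * SCHED_CAPACITY_SHIFT) / (mcnt * freq_ratio)
		   = 2^20 / 2048 = 512

i.e. half of SCHED_CAPACITY_SCALE, as expected for a CPU running at half of its maximum frequency.]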

View File

@@ -575,30 +575,11 @@ unsigned int cpufreq_policy_transition_delay_us(struct cpufreq_policy *policy)
return policy->transition_delay_us;
latency = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
-	if (latency) {
-		unsigned int max_delay_us = 2 * MSEC_PER_SEC;
-
-		/*
-		 * If the platform already has high transition_latency, use it
-		 * as-is.
-		 */
-		if (latency > max_delay_us)
-			return latency;
-
-		/*
-		 * For platforms that can change the frequency very fast (< 2
-		 * us), the above formula gives a decent transition delay. But
-		 * for platforms where transition_latency is in milliseconds, it
-		 * ends up giving unrealistic values.
-		 *
-		 * Cap the default transition delay to 2 ms, which seems to be
-		 * a reasonable amount of time after which we should reevaluate
-		 * the frequency.
-		 */
-		return min(latency * LATENCY_MULTIPLIER, max_delay_us);
-	}
-
-	return LATENCY_MULTIPLIER;
+	if (latency)
+		/* Give a 50% breathing room between updates */
+		return latency + (latency >> 1);
+
+	return USEC_PER_MSEC;
}
EXPORT_SYMBOL_GPL(cpufreq_policy_transition_delay_us);
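
[Worked example based on the code above; the latency values are made up. A policy with cpuinfo.transition_latency = 20,000 ns gives latency = 20 us, so the new default transition delay is 20 + (20 >> 1) = 30 us; a policy reporting no transition latency falls back to USEC_PER_MSEC, i.e. 1000 us. For the same 20 us latency, the removed code would have returned min(20 * LATENCY_MULTIPLIER, 2 * MSEC_PER_SEC) = 2000 us, illustrating how much shorter the new default is.]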

View File

@@ -16,6 +16,7 @@
#include <linux/tick.h>
#include <linux/slab.h>
#include <linux/sched/cpufreq.h>
#include <linux/sched/smt.h>
#include <linux/list.h>
#include <linux/cpu.h>
#include <linux/cpufreq.h>
@@ -215,6 +216,7 @@ struct global_params {
* @hwp_req_cached: Cached value of the last HWP Request MSR
* @hwp_cap_cached: Cached value of the last HWP Capabilities MSR
* @last_io_update: Last time when IO wake flag was set
* @capacity_perf: Highest perf used for scale invariance
* @sched_flags: Store scheduler flags for possible cross CPU update
* @hwp_boost_min: Last HWP boosted min performance
* @suspended: Whether or not the driver has been suspended.
@@ -253,6 +255,7 @@ struct cpudata {
u64 hwp_req_cached;
u64 hwp_cap_cached;
u64 last_io_update;
unsigned int capacity_perf;
unsigned int sched_flags;
u32 hwp_boost_min;
bool suspended;
@@ -295,6 +298,7 @@ static int hwp_mode_bdw __ro_after_init;
static bool per_cpu_limits __ro_after_init;
static bool hwp_forced __ro_after_init;
static bool hwp_boost __read_mostly;
static bool hwp_is_hybrid;
static struct cpufreq_driver *intel_pstate_driver __read_mostly;
@@ -934,6 +938,139 @@ static struct freq_attr *hwp_cpufreq_attrs[] = {
NULL,
};
static struct cpudata *hybrid_max_perf_cpu __read_mostly;
/*
* Protects hybrid_max_perf_cpu, the capacity_perf fields in struct cpudata,
* and the x86 arch scale-invariance information from concurrent updates.
*/
static DEFINE_MUTEX(hybrid_capacity_lock);
static void hybrid_set_cpu_capacity(struct cpudata *cpu)
{
arch_set_cpu_capacity(cpu->cpu, cpu->capacity_perf,
hybrid_max_perf_cpu->capacity_perf,
cpu->capacity_perf,
cpu->pstate.max_pstate_physical);
pr_debug("CPU%d: perf = %u, max. perf = %u, base perf = %d\n", cpu->cpu,
cpu->capacity_perf, hybrid_max_perf_cpu->capacity_perf,
cpu->pstate.max_pstate_physical);
}
static void hybrid_clear_cpu_capacity(unsigned int cpunum)
{
arch_set_cpu_capacity(cpunum, 1, 1, 1, 1);
}
static void hybrid_get_capacity_perf(struct cpudata *cpu)
{
if (READ_ONCE(global.no_turbo)) {
cpu->capacity_perf = cpu->pstate.max_pstate_physical;
return;
}
cpu->capacity_perf = HWP_HIGHEST_PERF(READ_ONCE(cpu->hwp_cap_cached));
}
static void hybrid_set_capacity_of_cpus(void)
{
int cpunum;
for_each_online_cpu(cpunum) {
struct cpudata *cpu = all_cpu_data[cpunum];
if (cpu)
hybrid_set_cpu_capacity(cpu);
}
}
static void hybrid_update_cpu_capacity_scaling(void)
{
struct cpudata *max_perf_cpu = NULL;
unsigned int max_cap_perf = 0;
int cpunum;
for_each_online_cpu(cpunum) {
struct cpudata *cpu = all_cpu_data[cpunum];
if (!cpu)
continue;
/*
* During initialization, CPU performance at full capacity needs
* to be determined.
*/
if (!hybrid_max_perf_cpu)
hybrid_get_capacity_perf(cpu);
/*
* If hybrid_max_perf_cpu is not NULL at this point, it is
* being replaced, so don't take it into account when looking
* for the new one.
*/
if (cpu == hybrid_max_perf_cpu)
continue;
if (cpu->capacity_perf > max_cap_perf) {
max_cap_perf = cpu->capacity_perf;
max_perf_cpu = cpu;
}
}
if (max_perf_cpu) {
hybrid_max_perf_cpu = max_perf_cpu;
hybrid_set_capacity_of_cpus();
} else {
pr_info("Found no CPUs with nonzero maximum performance\n");
/* Revert to the flat CPU capacity structure. */
for_each_online_cpu(cpunum)
hybrid_clear_cpu_capacity(cpunum);
}
}
static void __hybrid_init_cpu_capacity_scaling(void)
{
hybrid_max_perf_cpu = NULL;
hybrid_update_cpu_capacity_scaling();
}
static void hybrid_init_cpu_capacity_scaling(void)
{
bool disable_itmt = false;
mutex_lock(&hybrid_capacity_lock);
/*
* If hybrid_max_perf_cpu is set at this point, the hybrid CPU capacity
* scaling has been enabled already and the driver is just changing the
* operation mode.
*/
if (hybrid_max_perf_cpu) {
__hybrid_init_cpu_capacity_scaling();
goto unlock;
}
/*
* On hybrid systems, use asym capacity instead of ITMT, but because
* the capacity of SMT threads is not deterministic even approximately,
* do not do that when SMT is in use.
*/
if (hwp_is_hybrid && !sched_smt_active() && arch_enable_hybrid_capacity_scale()) {
__hybrid_init_cpu_capacity_scaling();
disable_itmt = true;
}
unlock:
mutex_unlock(&hybrid_capacity_lock);
/*
* Disabling ITMT causes sched domains to be rebuilt to disable asym
* packing and enable asym capacity.
*/
if (disable_itmt)
sched_clear_itmt_support();
}
static void __intel_pstate_get_hwp_cap(struct cpudata *cpu)
{
u64 cap;
@@ -962,6 +1099,43 @@ static void intel_pstate_get_hwp_cap(struct cpudata *cpu)
}
}
static void hybrid_update_capacity(struct cpudata *cpu)
{
unsigned int max_cap_perf;
mutex_lock(&hybrid_capacity_lock);
if (!hybrid_max_perf_cpu)
goto unlock;
/*
* The maximum performance of the CPU may have changed, but assume
* that the performance of the other CPUs has not changed.
*/
max_cap_perf = hybrid_max_perf_cpu->capacity_perf;
intel_pstate_get_hwp_cap(cpu);
hybrid_get_capacity_perf(cpu);
/* Should hybrid_max_perf_cpu be replaced by this CPU? */
if (cpu->capacity_perf > max_cap_perf) {
hybrid_max_perf_cpu = cpu;
hybrid_set_capacity_of_cpus();
goto unlock;
}
/* If this CPU is hybrid_max_perf_cpu, should it be replaced? */
if (cpu == hybrid_max_perf_cpu && cpu->capacity_perf < max_cap_perf) {
hybrid_update_cpu_capacity_scaling();
goto unlock;
}
hybrid_set_cpu_capacity(cpu);
unlock:
mutex_unlock(&hybrid_capacity_lock);
}
static void intel_pstate_hwp_set(unsigned int cpu)
{
struct cpudata *cpu_data = all_cpu_data[cpu];
@@ -1070,6 +1244,22 @@ static void intel_pstate_hwp_offline(struct cpudata *cpu)
value |= HWP_ENERGY_PERF_PREFERENCE(HWP_EPP_POWERSAVE);
wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value);
mutex_lock(&hybrid_capacity_lock);
if (!hybrid_max_perf_cpu) {
mutex_unlock(&hybrid_capacity_lock);
return;
}
if (hybrid_max_perf_cpu == cpu)
hybrid_update_cpu_capacity_scaling();
mutex_unlock(&hybrid_capacity_lock);
/* Reset the capacity of the CPU going offline to the initial value. */
hybrid_clear_cpu_capacity(cpu->cpu);
}
#define POWER_CTL_EE_ENABLE 1
@@ -1165,21 +1355,46 @@ static void __intel_pstate_update_max_freq(struct cpudata *cpudata,
static void intel_pstate_update_limits(unsigned int cpu)
{
struct cpufreq_policy *policy = cpufreq_cpu_acquire(cpu);
struct cpudata *cpudata;
if (!policy)
return;
-	__intel_pstate_update_max_freq(all_cpu_data[cpu], policy);
+	cpudata = all_cpu_data[cpu];
+
+	__intel_pstate_update_max_freq(cpudata, policy);
/* Prevent the driver from being unregistered now. */
mutex_lock(&intel_pstate_driver_lock);
cpufreq_cpu_release(policy);
hybrid_update_capacity(cpudata);
mutex_unlock(&intel_pstate_driver_lock);
}
static void intel_pstate_update_limits_for_all(void)
{
int cpu;
-	for_each_possible_cpu(cpu)
-		intel_pstate_update_limits(cpu);
+	for_each_possible_cpu(cpu) {
+		struct cpufreq_policy *policy = cpufreq_cpu_acquire(cpu);
+
+		if (!policy)
+			continue;
+
+		__intel_pstate_update_max_freq(all_cpu_data[cpu], policy);
+
+		cpufreq_cpu_release(policy);
+	}
mutex_lock(&hybrid_capacity_lock);
if (hybrid_max_perf_cpu)
__hybrid_init_cpu_capacity_scaling();
mutex_unlock(&hybrid_capacity_lock);
}
/************************** sysfs begin ************************/
@@ -1618,6 +1833,13 @@ static void intel_pstate_notify_work(struct work_struct *work)
__intel_pstate_update_max_freq(cpudata, policy);
cpufreq_cpu_release(policy);
/*
* The driver will not be unregistered while this function is
* running, so update the capacity without acquiring the driver
* lock.
*/
hybrid_update_capacity(cpudata);
}
wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_STATUS, 0);
@@ -2034,8 +2256,10 @@ static void intel_pstate_get_cpu_pstates(struct cpudata *cpu)
if (pstate_funcs.get_cpu_scaling) {
cpu->pstate.scaling = pstate_funcs.get_cpu_scaling(cpu->cpu);
-		if (cpu->pstate.scaling != perf_ctl_scaling)
+		if (cpu->pstate.scaling != perf_ctl_scaling) {
			intel_pstate_hybrid_hwp_adjust(cpu);
+			hwp_is_hybrid = true;
+		}
} else {
cpu->pstate.scaling = perf_ctl_scaling;
}
@@ -2425,6 +2649,10 @@ static const struct x86_cpu_id intel_pstate_cpu_oob_ids[] __initconst = {
X86_MATCH(INTEL_ICELAKE_X, core_funcs),
X86_MATCH(INTEL_SAPPHIRERAPIDS_X, core_funcs),
X86_MATCH(INTEL_EMERALDRAPIDS_X, core_funcs),
X86_MATCH(INTEL_GRANITERAPIDS_D, core_funcs),
X86_MATCH(INTEL_GRANITERAPIDS_X, core_funcs),
X86_MATCH(INTEL_ATOM_CRESTMONT, core_funcs),
X86_MATCH(INTEL_ATOM_CRESTMONT_X, core_funcs),
{}
};
#endif
@@ -2703,6 +2931,8 @@ static int intel_pstate_cpu_online(struct cpufreq_policy *policy)
*/
intel_pstate_hwp_reenable(cpu);
cpu->suspended = false;
hybrid_update_capacity(cpu);
}
return 0;
@@ -3143,6 +3373,8 @@ static int intel_pstate_register_driver(struct cpufreq_driver *driver)
global.min_perf_pct = min_perf_pct_min();
hybrid_init_cpu_capacity_scaling();
return 0;
}

View File

@@ -238,4 +238,5 @@ bail_noprops:
module_init(maple_cpufreq_init);
MODULE_DESCRIPTION("cpufreq driver for Maple 970FX/970MP boards");
MODULE_LICENSE("GPL");

View File

@@ -269,5 +269,6 @@ static void __exit pas_cpufreq_exit(void)
module_init(pas_cpufreq_init);
module_exit(pas_cpufreq_exit);
MODULE_DESCRIPTION("cpufreq driver for PA Semi PWRficient");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Egor Martovetsky <egor@pasemi.com>, Olof Johansson <olof@lixom.net>");

View File

@@ -671,4 +671,5 @@ static int __init g5_cpufreq_init(void)
module_init(g5_cpufreq_init);
MODULE_DESCRIPTION("cpufreq driver for SMU & 970FX based G5 Macs");
MODULE_LICENSE("GPL");

View File

@@ -1160,5 +1160,6 @@ static void __exit powernv_cpufreq_exit(void)
}
module_exit(powernv_cpufreq_exit);
MODULE_DESCRIPTION("cpufreq driver for IBM/OpenPOWER powernv systems");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Vaidyanathan Srinivasan <svaidy at linux.vnet.ibm.com>");

View File

@@ -168,5 +168,6 @@ static void __exit cbe_cpufreq_exit(void)
module_init(cbe_cpufreq_init);
module_exit(cbe_cpufreq_exit);
MODULE_DESCRIPTION("cpufreq driver for Cell BE processors");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Christian Krafft <krafft@de.ibm.com>");

View File

@@ -577,12 +577,6 @@ static inline unsigned long cpufreq_scale(unsigned long old, u_int div,
#define CPUFREQ_POLICY_POWERSAVE (1)
#define CPUFREQ_POLICY_PERFORMANCE (2)
-/*
- * The polling frequency depends on the capability of the processor. Default
- * polling frequency is 1000 times the transition latency of the processor.
- */
-#define LATENCY_MULTIPLIER (1000)
struct cpufreq_governor {
char name[CPUFREQ_NAME_LEN];
int (*init)(struct cpufreq_policy *policy);