2017-11-07 16:30:08 +00:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
2017-05-31 16:59:28 +00:00
|
|
|
/*
|
|
|
|
* Arch specific cpu topology information
|
|
|
|
*
|
|
|
|
* Copyright (C) 2016, ARM Ltd.
|
|
|
|
* Written by: Juri Lelli, ARM Ltd.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/acpi.h>
|
2022-07-04 10:15:53 +00:00
|
|
|
#include <linux/cacheinfo.h>
|
2017-05-31 16:59:28 +00:00
|
|
|
#include <linux/cpu.h>
|
|
|
|
#include <linux/cpufreq.h>
|
|
|
|
#include <linux/device.h>
|
|
|
|
#include <linux/of.h>
|
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/sched/topology.h>
|
2018-07-20 13:32:32 +00:00
|
|
|
#include <linux/cpuset.h>
|
2019-06-27 19:52:58 +00:00
|
|
|
#include <linux/cpumask.h>
|
|
|
|
#include <linux/init.h>
|
2021-06-15 08:57:50 +00:00
|
|
|
#include <linux/rcupdate.h>
|
2019-06-27 19:52:58 +00:00
|
|
|
#include <linux/sched.h>
|
2023-12-11 10:48:49 +00:00
|
|
|
#include <linux/units.h>
|
2017-05-31 16:59:28 +00:00
|
|
|
|
2022-04-27 08:08:06 +00:00
|
|
|
#define CREATE_TRACE_POINTS
|
|
|
|
#include <trace/events/thermal_pressure.h>
|
|
|
|
|
2021-06-15 08:57:50 +00:00
|
|
|
static DEFINE_PER_CPU(struct scale_freq_data __rcu *, sft_data);
|
2021-03-10 02:51:04 +00:00
|
|
|
static struct cpumask scale_freq_counters_mask;
|
|
|
|
static bool scale_freq_invariant;
|
2023-12-11 10:48:49 +00:00
|
|
|
DEFINE_PER_CPU(unsigned long, capacity_freq_ref) = 1;
|
|
|
|
EXPORT_PER_CPU_SYMBOL_GPL(capacity_freq_ref);
|
2021-03-10 02:51:04 +00:00
|
|
|
|
|
|
|
static bool supports_scale_freq_counters(const struct cpumask *cpus)
|
|
|
|
{
|
|
|
|
return cpumask_subset(cpus, &scale_freq_counters_mask);
|
|
|
|
}
|
|
|
|
|
arch_topology, arm, arm64: define arch_scale_freq_invariant()
arch_scale_freq_invariant() is used by schedutil to determine whether
the scheduler's load-tracking signals are frequency invariant. Its
definition is overridable, though by default it is hardcoded to 'true'
if arch_scale_freq_capacity() is defined ('false' otherwise).
This behaviour is not overridden on arm, arm64 and other users of the
generic arch topology driver, which is somewhat precarious:
arch_scale_freq_capacity() will always be defined, yet not all cpufreq
drivers are guaranteed to drive the frequency invariance scale factor
setting. In other words, the load-tracking signals may very well *not*
be frequency invariant.
Now that cpufreq can be queried on whether the current driver is driving
the Frequency Invariance (FI) scale setting, the current situation can
be improved. This combines the query of whether cpufreq supports the
setting of the frequency scale factor, with whether all online CPUs are
counter-based FI enabled.
While cpufreq FI enablement applies at system level, for all CPUs,
counter-based FI support could also be used for only a subset of CPUs to
set the invariance scale factor. Therefore, if cpufreq-based FI support
is present, we consider the system to be invariant. If missing, we
require all online CPUs to be counter-based FI enabled in order for the
full system to be considered invariant.
If the system ends up not being invariant, a new condition is needed in
the counter initialization code that disables all scale factor setting
based on counters.
Precedence of counters over cpufreq use is not important here. The
invariant status is only given to the system if all CPUs have at least
one method of setting the frequency scale factor.
Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Ionela Voinescu <ionela.voinescu@arm.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2020-09-01 20:55:49 +00:00
|
|
|
bool topology_scale_freq_invariant(void)
|
|
|
|
{
|
|
|
|
return cpufreq_supports_freq_invariance() ||
|
2021-03-10 02:51:04 +00:00
|
|
|
supports_scale_freq_counters(cpu_online_mask);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Move the cached scale_freq_invariant flag to @status, but only when the
 * flag actually differs and the system-wide state agrees with @status.
 */
static void update_scale_freq_invariant(bool status)
{
	bool differs = scale_freq_invariant != status;

	/*
	 * The task scheduler behaves differently depending on whether
	 * frequency invariance is available (through cpufreq or through
	 * counters). When counter setup or teardown changes that status,
	 * the scheduling domains are rebuilt so that the new information
	 * reaches them.
	 */
	if (differs && topology_scale_freq_invariant() == status) {
		scale_freq_invariant = status;
		rebuild_sched_domains_energy();
	}
}
|
|
|
|
|
|
|
|
void topology_set_scale_freq_source(struct scale_freq_data *data,
|
|
|
|
const struct cpumask *cpus)
|
|
|
|
{
|
|
|
|
struct scale_freq_data *sfd;
|
|
|
|
int cpu;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Avoid calling rebuild_sched_domains() unnecessarily if FIE is
|
|
|
|
* supported by cpufreq.
|
|
|
|
*/
|
|
|
|
if (cpumask_empty(&scale_freq_counters_mask))
|
|
|
|
scale_freq_invariant = topology_scale_freq_invariant();
|
|
|
|
|
2021-06-15 08:57:50 +00:00
|
|
|
rcu_read_lock();
|
|
|
|
|
2021-03-10 02:51:04 +00:00
|
|
|
for_each_cpu(cpu, cpus) {
|
2021-06-15 08:57:50 +00:00
|
|
|
sfd = rcu_dereference(*per_cpu_ptr(&sft_data, cpu));
|
2021-03-10 02:51:04 +00:00
|
|
|
|
|
|
|
/* Use ARCH provided counters whenever possible */
|
|
|
|
if (!sfd || sfd->source != SCALE_FREQ_SOURCE_ARCH) {
|
2021-06-15 08:57:50 +00:00
|
|
|
rcu_assign_pointer(per_cpu(sft_data, cpu), data);
|
2021-03-10 02:51:04 +00:00
|
|
|
cpumask_set_cpu(cpu, &scale_freq_counters_mask);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-06-15 08:57:50 +00:00
|
|
|
rcu_read_unlock();
|
|
|
|
|
2021-03-10 02:51:04 +00:00
|
|
|
update_scale_freq_invariant(true);
|
arch_topology, arm, arm64: define arch_scale_freq_invariant()
arch_scale_freq_invariant() is used by schedutil to determine whether
the scheduler's load-tracking signals are frequency invariant. Its
definition is overridable, though by default it is hardcoded to 'true'
if arch_scale_freq_capacity() is defined ('false' otherwise).
This behaviour is not overridden on arm, arm64 and other users of the
generic arch topology driver, which is somewhat precarious:
arch_scale_freq_capacity() will always be defined, yet not all cpufreq
drivers are guaranteed to drive the frequency invariance scale factor
setting. In other words, the load-tracking signals may very well *not*
be frequency invariant.
Now that cpufreq can be queried on whether the current driver is driving
the Frequency Invariance (FI) scale setting, the current situation can
be improved. This combines the query of whether cpufreq supports the
setting of the frequency scale factor, with whether all online CPUs are
counter-based FI enabled.
While cpufreq FI enablement applies at system level, for all CPUs,
counter-based FI support could also be used for only a subset of CPUs to
set the invariance scale factor. Therefore, if cpufreq-based FI support
is present, we consider the system to be invariant. If missing, we
require all online CPUs to be counter-based FI enabled in order for the
full system to be considered invariant.
If the system ends up not being invariant, a new condition is needed in
the counter initialization code that disables all scale factor setting
based on counters.
Precedence of counters over cpufreq use is not important here. The
invariant status is only given to the system if all CPUs have at least
one method of setting the frequency scale factor.
Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Ionela Voinescu <ionela.voinescu@arm.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2020-09-01 20:55:49 +00:00
|
|
|
}
|
2021-03-10 02:55:27 +00:00
|
|
|
EXPORT_SYMBOL_GPL(topology_set_scale_freq_source);
|
arch_topology, arm, arm64: define arch_scale_freq_invariant()
arch_scale_freq_invariant() is used by schedutil to determine whether
the scheduler's load-tracking signals are frequency invariant. Its
definition is overridable, though by default it is hardcoded to 'true'
if arch_scale_freq_capacity() is defined ('false' otherwise).
This behaviour is not overridden on arm, arm64 and other users of the
generic arch topology driver, which is somewhat precarious:
arch_scale_freq_capacity() will always be defined, yet not all cpufreq
drivers are guaranteed to drive the frequency invariance scale factor
setting. In other words, the load-tracking signals may very well *not*
be frequency invariant.
Now that cpufreq can be queried on whether the current driver is driving
the Frequency Invariance (FI) scale setting, the current situation can
be improved. This combines the query of whether cpufreq supports the
setting of the frequency scale factor, with whether all online CPUs are
counter-based FI enabled.
While cpufreq FI enablement applies at system level, for all CPUs,
counter-based FI support could also be used for only a subset of CPUs to
set the invariance scale factor. Therefore, if cpufreq-based FI support
is present, we consider the system to be invariant. If missing, we
require all online CPUs to be counter-based FI enabled in order for the
full system to be considered invariant.
If the system ends up not being invariant, a new condition is needed in
the counter initialization code that disables all scale factor setting
based on counters.
Precedence of counters over cpufreq use is not important here. The
invariant status is only given to the system if all CPUs have at least
one method of setting the frequency scale factor.
Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Ionela Voinescu <ionela.voinescu@arm.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2020-09-01 20:55:49 +00:00
|
|
|
|
2021-03-10 02:51:04 +00:00
|
|
|
void topology_clear_scale_freq_source(enum scale_freq_source source,
|
|
|
|
const struct cpumask *cpus)
|
arm64: use activity monitors for frequency invariance
The Frequency Invariance Engine (FIE) is providing a frequency
scaling correction factor that helps achieve more accurate
load-tracking.
So far, for arm and arm64 platforms, this scale factor has been
obtained based on the ratio between the current frequency and the
maximum supported frequency recorded by the cpufreq policy. The
setting of this scale factor is triggered from cpufreq drivers by
calling arch_set_freq_scale. The current frequency used in computation
is the frequency requested by a governor, but it may not be the
frequency that was implemented by the platform.
This correction factor can also be obtained using a core counter and a
constant counter to get information on the performance (frequency based
only) obtained in a period of time. This will more accurately reflect
the actual current frequency of the CPU, compared with the alternative
implementation that reflects the request of a performance level from
the OS.
Therefore, implement arch_scale_freq_tick to use activity monitors, if
present, for the computation of the frequency scale factor.
The use of AMU counters depends on:
- CONFIG_ARM64_AMU_EXTN - depents on the AMU extension being present
- CONFIG_CPU_FREQ - the current frequency obtained using counter
information is divided by the maximum frequency obtained from the
cpufreq policy.
While it is possible to have a combination of CPUs in the system with
and without support for activity monitors, the use of counters for
frequency invariance is only enabled for a CPU if all related CPUs
(CPUs in the same frequency domain) support and have enabled the core
and constant activity monitor counters. In this way, there is a clear
separation between the policies for which arch_set_freq_scale (cpufreq
based FIE) is used, and the policies for which arch_scale_freq_tick
(counter based FIE) is used to set the frequency scale factor. For
this purpose, a late_initcall_sync is registered to trigger validation
work for policies that will enable or disable the use of AMU counters
for frequency invariance. If CONFIG_CPU_FREQ is not defined, the use
of counters is enabled on all CPUs only if all possible CPUs correctly
support the necessary counters.
Signed-off-by: Ionela Voinescu <ionela.voinescu@arm.com>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Acked-by: Sudeep Holla <sudeep.holla@arm.com>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2020-03-05 09:06:26 +00:00
|
|
|
{
|
2021-03-10 02:51:04 +00:00
|
|
|
struct scale_freq_data *sfd;
|
|
|
|
int cpu;
|
|
|
|
|
2021-06-15 08:57:50 +00:00
|
|
|
rcu_read_lock();
|
|
|
|
|
2021-03-10 02:51:04 +00:00
|
|
|
for_each_cpu(cpu, cpus) {
|
2021-06-15 08:57:50 +00:00
|
|
|
sfd = rcu_dereference(*per_cpu_ptr(&sft_data, cpu));
|
2021-03-10 02:51:04 +00:00
|
|
|
|
|
|
|
if (sfd && sfd->source == source) {
|
2021-06-15 08:57:50 +00:00
|
|
|
rcu_assign_pointer(per_cpu(sft_data, cpu), NULL);
|
2021-03-10 02:51:04 +00:00
|
|
|
cpumask_clear_cpu(cpu, &scale_freq_counters_mask);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-06-15 08:57:50 +00:00
|
|
|
rcu_read_unlock();
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Make sure all references to previous sft_data are dropped to avoid
|
|
|
|
* use-after-free races.
|
|
|
|
*/
|
|
|
|
synchronize_rcu();
|
|
|
|
|
2021-03-10 02:51:04 +00:00
|
|
|
update_scale_freq_invariant(false);
|
arm64: use activity monitors for frequency invariance
The Frequency Invariance Engine (FIE) is providing a frequency
scaling correction factor that helps achieve more accurate
load-tracking.
So far, for arm and arm64 platforms, this scale factor has been
obtained based on the ratio between the current frequency and the
maximum supported frequency recorded by the cpufreq policy. The
setting of this scale factor is triggered from cpufreq drivers by
calling arch_set_freq_scale. The current frequency used in computation
is the frequency requested by a governor, but it may not be the
frequency that was implemented by the platform.
This correction factor can also be obtained using a core counter and a
constant counter to get information on the performance (frequency based
only) obtained in a period of time. This will more accurately reflect
the actual current frequency of the CPU, compared with the alternative
implementation that reflects the request of a performance level from
the OS.
Therefore, implement arch_scale_freq_tick to use activity monitors, if
present, for the computation of the frequency scale factor.
The use of AMU counters depends on:
- CONFIG_ARM64_AMU_EXTN - depents on the AMU extension being present
- CONFIG_CPU_FREQ - the current frequency obtained using counter
information is divided by the maximum frequency obtained from the
cpufreq policy.
While it is possible to have a combination of CPUs in the system with
and without support for activity monitors, the use of counters for
frequency invariance is only enabled for a CPU if all related CPUs
(CPUs in the same frequency domain) support and have enabled the core
and constant activity monitor counters. In this way, there is a clear
separation between the policies for which arch_set_freq_scale (cpufreq
based FIE) is used, and the policies for which arch_scale_freq_tick
(counter based FIE) is used to set the frequency scale factor. For
this purpose, a late_initcall_sync is registered to trigger validation
work for policies that will enable or disable the use of AMU counters
for frequency invariance. If CONFIG_CPU_FREQ is not defined, the use
of counters is enabled on all CPUs only if all possible CPUs correctly
support the necessary counters.
Signed-off-by: Ionela Voinescu <ionela.voinescu@arm.com>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Acked-by: Sudeep Holla <sudeep.holla@arm.com>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2020-03-05 09:06:26 +00:00
|
|
|
}
|
2021-03-10 02:55:27 +00:00
|
|
|
EXPORT_SYMBOL_GPL(topology_clear_scale_freq_source);
|
2021-03-10 02:51:04 +00:00
|
|
|
|
|
|
|
void topology_scale_freq_tick(void)
|
|
|
|
{
|
2021-06-15 08:57:50 +00:00
|
|
|
struct scale_freq_data *sfd = rcu_dereference_sched(*this_cpu_ptr(&sft_data));
|
2021-03-10 02:51:04 +00:00
|
|
|
|
|
|
|
if (sfd)
|
|
|
|
sfd->set_freq_scale();
|
|
|
|
}
|
|
|
|
|
2021-03-10 02:46:40 +00:00
|
|
|
DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
|
2021-03-10 02:55:27 +00:00
|
|
|
EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale);
|
2017-05-31 16:59:28 +00:00
|
|
|
|
2020-09-24 12:30:15 +00:00
|
|
|
void topology_set_freq_scale(const struct cpumask *cpus, unsigned long cur_freq,
|
|
|
|
unsigned long max_freq)
|
2017-05-31 16:59:28 +00:00
|
|
|
{
|
2017-09-26 16:41:10 +00:00
|
|
|
unsigned long scale;
|
|
|
|
int i;
|
|
|
|
|
2020-09-01 20:55:45 +00:00
|
|
|
if (WARN_ON_ONCE(!cur_freq || !max_freq))
|
|
|
|
return;
|
|
|
|
|
arm64: use activity monitors for frequency invariance
The Frequency Invariance Engine (FIE) is providing a frequency
scaling correction factor that helps achieve more accurate
load-tracking.
So far, for arm and arm64 platforms, this scale factor has been
obtained based on the ratio between the current frequency and the
maximum supported frequency recorded by the cpufreq policy. The
setting of this scale factor is triggered from cpufreq drivers by
calling arch_set_freq_scale. The current frequency used in computation
is the frequency requested by a governor, but it may not be the
frequency that was implemented by the platform.
This correction factor can also be obtained using a core counter and a
constant counter to get information on the performance (frequency based
only) obtained in a period of time. This will more accurately reflect
the actual current frequency of the CPU, compared with the alternative
implementation that reflects the request of a performance level from
the OS.
Therefore, implement arch_scale_freq_tick to use activity monitors, if
present, for the computation of the frequency scale factor.
The use of AMU counters depends on:
- CONFIG_ARM64_AMU_EXTN - depents on the AMU extension being present
- CONFIG_CPU_FREQ - the current frequency obtained using counter
information is divided by the maximum frequency obtained from the
cpufreq policy.
While it is possible to have a combination of CPUs in the system with
and without support for activity monitors, the use of counters for
frequency invariance is only enabled for a CPU if all related CPUs
(CPUs in the same frequency domain) support and have enabled the core
and constant activity monitor counters. In this way, there is a clear
separation between the policies for which arch_set_freq_scale (cpufreq
based FIE) is used, and the policies for which arch_scale_freq_tick
(counter based FIE) is used to set the frequency scale factor. For
this purpose, a late_initcall_sync is registered to trigger validation
work for policies that will enable or disable the use of AMU counters
for frequency invariance. If CONFIG_CPU_FREQ is not defined, the use
of counters is enabled on all CPUs only if all possible CPUs correctly
support the necessary counters.
Signed-off-by: Ionela Voinescu <ionela.voinescu@arm.com>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Acked-by: Sudeep Holla <sudeep.holla@arm.com>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2020-03-05 09:06:26 +00:00
|
|
|
/*
|
|
|
|
* If the use of counters for FIE is enabled, just return as we don't
|
|
|
|
* want to update the scale factor with information from CPUFREQ.
|
|
|
|
* Instead the scale factor will be updated from arch_scale_freq_tick.
|
|
|
|
*/
|
2021-03-10 02:51:04 +00:00
|
|
|
if (supports_scale_freq_counters(cpus))
|
arm64: use activity monitors for frequency invariance
The Frequency Invariance Engine (FIE) is providing a frequency
scaling correction factor that helps achieve more accurate
load-tracking.
So far, for arm and arm64 platforms, this scale factor has been
obtained based on the ratio between the current frequency and the
maximum supported frequency recorded by the cpufreq policy. The
setting of this scale factor is triggered from cpufreq drivers by
calling arch_set_freq_scale. The current frequency used in computation
is the frequency requested by a governor, but it may not be the
frequency that was implemented by the platform.
This correction factor can also be obtained using a core counter and a
constant counter to get information on the performance (frequency based
only) obtained in a period of time. This will more accurately reflect
the actual current frequency of the CPU, compared with the alternative
implementation that reflects the request of a performance level from
the OS.
Therefore, implement arch_scale_freq_tick to use activity monitors, if
present, for the computation of the frequency scale factor.
The use of AMU counters depends on:
- CONFIG_ARM64_AMU_EXTN - depents on the AMU extension being present
- CONFIG_CPU_FREQ - the current frequency obtained using counter
information is divided by the maximum frequency obtained from the
cpufreq policy.
While it is possible to have a combination of CPUs in the system with
and without support for activity monitors, the use of counters for
frequency invariance is only enabled for a CPU if all related CPUs
(CPUs in the same frequency domain) support and have enabled the core
and constant activity monitor counters. In this way, there is a clear
separation between the policies for which arch_set_freq_scale (cpufreq
based FIE) is used, and the policies for which arch_scale_freq_tick
(counter based FIE) is used to set the frequency scale factor. For
this purpose, a late_initcall_sync is registered to trigger validation
work for policies that will enable or disable the use of AMU counters
for frequency invariance. If CONFIG_CPU_FREQ is not defined, the use
of counters is enabled on all CPUs only if all possible CPUs correctly
support the necessary counters.
Signed-off-by: Ionela Voinescu <ionela.voinescu@arm.com>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Acked-by: Sudeep Holla <sudeep.holla@arm.com>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2020-03-05 09:06:26 +00:00
|
|
|
return;
|
|
|
|
|
2017-09-26 16:41:10 +00:00
|
|
|
scale = (cur_freq << SCHED_CAPACITY_SHIFT) / max_freq;
|
|
|
|
|
|
|
|
for_each_cpu(i, cpus)
|
2021-03-10 02:46:40 +00:00
|
|
|
per_cpu(arch_freq_scale, i) = scale;
|
2017-05-31 16:59:28 +00:00
|
|
|
}
|
|
|
|
|
2017-09-26 16:41:11 +00:00
|
|
|
DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE;
|
2021-08-09 19:16:01 +00:00
|
|
|
EXPORT_PER_CPU_SYMBOL_GPL(cpu_scale);
|
2017-05-31 16:59:28 +00:00
|
|
|
|
2017-05-31 16:59:31 +00:00
|
|
|
void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity)
|
2017-05-31 16:59:28 +00:00
|
|
|
{
|
|
|
|
per_cpu(cpu_scale, cpu) = capacity;
|
|
|
|
}
|
|
|
|
|
2020-07-12 16:59:15 +00:00
|
|
|
DEFINE_PER_CPU(unsigned long, thermal_pressure);
|
|
|
|
|
2021-11-09 19:57:10 +00:00
|
|
|
/**
|
|
|
|
* topology_update_thermal_pressure() - Update thermal pressure for CPUs
|
|
|
|
* @cpus : The related CPUs for which capacity has been reduced
|
|
|
|
* @capped_freq : The maximum allowed frequency that CPUs can run at
|
|
|
|
*
|
|
|
|
* Update the value of thermal pressure for all @cpus in the mask. The
|
|
|
|
* cpumask should include all (online+offline) affected CPUs, to avoid
|
|
|
|
* operating on stale data when hot-plug is used for some CPUs. The
|
|
|
|
* @capped_freq reflects the currently allowed max CPUs frequency due to
|
|
|
|
* thermal capping. It might be also a boost frequency value, which is bigger
|
2023-12-11 10:48:49 +00:00
|
|
|
* than the internal 'capacity_freq_ref' max frequency. In such case the
|
|
|
|
* pressure value should simply be removed, since this is an indication that
|
|
|
|
* there is no thermal throttling. The @capped_freq must be provided in kHz.
|
2021-11-09 19:57:10 +00:00
|
|
|
*/
|
|
|
|
void topology_update_thermal_pressure(const struct cpumask *cpus,
|
|
|
|
unsigned long capped_freq)
|
|
|
|
{
|
2021-11-09 19:57:14 +00:00
|
|
|
unsigned long max_capacity, capacity, th_pressure;
|
2021-11-09 19:57:10 +00:00
|
|
|
u32 max_freq;
|
|
|
|
int cpu;
|
|
|
|
|
|
|
|
cpu = cpumask_first(cpus);
|
|
|
|
max_capacity = arch_scale_cpu_capacity(cpu);
|
2023-12-11 10:48:49 +00:00
|
|
|
max_freq = arch_scale_freq_ref(cpu);
|
2021-11-09 19:57:10 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Handle properly the boost frequencies, which should simply clean
|
|
|
|
* the thermal pressure value.
|
|
|
|
*/
|
|
|
|
if (max_freq <= capped_freq)
|
|
|
|
capacity = max_capacity;
|
|
|
|
else
|
|
|
|
capacity = mult_frac(max_capacity, capped_freq, max_freq);
|
|
|
|
|
2021-11-09 19:57:14 +00:00
|
|
|
th_pressure = max_capacity - capacity;
|
|
|
|
|
2022-04-27 08:08:06 +00:00
|
|
|
trace_thermal_pressure_update(cpu, th_pressure);
|
|
|
|
|
2021-11-09 19:57:14 +00:00
|
|
|
for_each_cpu(cpu, cpus)
|
|
|
|
WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure);
|
2021-11-09 19:57:10 +00:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(topology_update_thermal_pressure);
|
|
|
|
|
2017-05-31 16:59:28 +00:00
|
|
|
static ssize_t cpu_capacity_show(struct device *dev,
|
|
|
|
struct device_attribute *attr,
|
|
|
|
char *buf)
|
|
|
|
{
|
|
|
|
struct cpu *cpu = container_of(dev, struct cpu, dev);
|
|
|
|
|
drivers core: Use sysfs_emit and sysfs_emit_at for show(device *...) functions
Convert the various sprintf fmaily calls in sysfs device show functions
to sysfs_emit and sysfs_emit_at for PAGE_SIZE buffer safety.
Done with:
$ spatch -sp-file sysfs_emit_dev.cocci --in-place --max-width=80 .
And cocci script:
$ cat sysfs_emit_dev.cocci
@@
identifier d_show;
identifier dev, attr, buf;
@@
ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf)
{
<...
return
- sprintf(buf,
+ sysfs_emit(buf,
...);
...>
}
@@
identifier d_show;
identifier dev, attr, buf;
@@
ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf)
{
<...
return
- snprintf(buf, PAGE_SIZE,
+ sysfs_emit(buf,
...);
...>
}
@@
identifier d_show;
identifier dev, attr, buf;
@@
ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf)
{
<...
return
- scnprintf(buf, PAGE_SIZE,
+ sysfs_emit(buf,
...);
...>
}
@@
identifier d_show;
identifier dev, attr, buf;
expression chr;
@@
ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf)
{
<...
return
- strcpy(buf, chr);
+ sysfs_emit(buf, chr);
...>
}
@@
identifier d_show;
identifier dev, attr, buf;
identifier len;
@@
ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf)
{
<...
len =
- sprintf(buf,
+ sysfs_emit(buf,
...);
...>
return len;
}
@@
identifier d_show;
identifier dev, attr, buf;
identifier len;
@@
ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf)
{
<...
len =
- snprintf(buf, PAGE_SIZE,
+ sysfs_emit(buf,
...);
...>
return len;
}
@@
identifier d_show;
identifier dev, attr, buf;
identifier len;
@@
ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf)
{
<...
len =
- scnprintf(buf, PAGE_SIZE,
+ sysfs_emit(buf,
...);
...>
return len;
}
@@
identifier d_show;
identifier dev, attr, buf;
identifier len;
@@
ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf)
{
<...
- len += scnprintf(buf + len, PAGE_SIZE - len,
+ len += sysfs_emit_at(buf, len,
...);
...>
return len;
}
@@
identifier d_show;
identifier dev, attr, buf;
expression chr;
@@
ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf)
{
...
- strcpy(buf, chr);
- return strlen(buf);
+ return sysfs_emit(buf, chr);
}
Signed-off-by: Joe Perches <joe@perches.com>
Link: https://lore.kernel.org/r/3d033c33056d88bbe34d4ddb62afd05ee166ab9a.1600285923.git.joe@perches.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2020-09-16 20:40:39 +00:00
|
|
|
return sysfs_emit(buf, "%lu\n", topology_get_cpu_scale(cpu->dev.id));
|
2017-05-31 16:59:28 +00:00
|
|
|
}
|
|
|
|
|
2018-07-20 13:32:32 +00:00
|
|
|
static void update_topology_flags_workfn(struct work_struct *work);
|
|
|
|
static DECLARE_WORK(update_topology_flags_work, update_topology_flags_workfn);
|
|
|
|
|
arch_topology: Make cpu_capacity sysfs node as read-only
If user updates any cpu's cpu_capacity, then the new value is going to
be applied to all its online sibling cpus. But this need not to be correct
always, as sibling cpus (in ARM, same micro architecture cpus) would have
different cpu_capacity with different performance characteristics.
So, updating the user supplied cpu_capacity to all cpu siblings
is not correct.
And another problem is, current code assumes that 'all cpus in a cluster
or with same package_id (core_siblings), would have same cpu_capacity'.
But with commit '5bdd2b3f0f8 ("arm64: topology: add support to remove
cpu topology sibling masks")', when a cpu hotplugged out, the cpu
information gets cleared in its sibling cpus. So, user supplied
cpu_capacity would be applied to only online sibling cpus at the time.
After that, if any cpu hotplugged in, it would have different cpu_capacity
than its siblings, which breaks the above assumption.
So, instead of mucking around the core sibling mask for user supplied
value, use device-tree to set cpu capacity. And make the cpu_capacity
node as read-only to know the asymmetry between cpus in the system.
While at it, remove cpu_scale_mutex usage, which used for sysfs write
protection.
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Quentin Perret <quentin.perret@arm.com>
Reviewed-by: Quentin Perret <quentin.perret@arm.com>
Acked-by: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Lingutla Chandrasekhar <clingutla@codeaurora.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2019-04-01 04:24:41 +00:00
|
|
|
static DEVICE_ATTR_RO(cpu_capacity);
|
2017-05-31 16:59:28 +00:00
|
|
|
|
|
|
|
static int register_cpu_capacity_sysctl(void)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
struct device *cpu;
|
|
|
|
|
|
|
|
for_each_possible_cpu(i) {
|
|
|
|
cpu = get_cpu_device(i);
|
|
|
|
if (!cpu) {
|
|
|
|
pr_err("%s: too early to get CPU%d device!\n",
|
|
|
|
__func__, i);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
device_create_file(cpu, &dev_attr_cpu_capacity);
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
subsys_initcall(register_cpu_capacity_sysctl);
|
|
|
|
|
2018-07-20 13:32:32 +00:00
|
|
|
static int update_topology;
|
|
|
|
|
|
|
|
int topology_update_cpu_topology(void)
|
|
|
|
{
|
|
|
|
return update_topology;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Updating the sched_domains can't be done directly from cpufreq callbacks
|
|
|
|
* due to locking, so queue the work for later.
|
|
|
|
*/
|
|
|
|
static void update_topology_flags_workfn(struct work_struct *work)
|
|
|
|
{
|
|
|
|
update_topology = 1;
|
|
|
|
rebuild_sched_domains();
|
|
|
|
pr_debug("sched_domain hierarchy rebuilt, flags updated\n");
|
|
|
|
update_topology = 0;
|
|
|
|
}
|
|
|
|
|
2017-05-31 16:59:28 +00:00
|
|
|
static u32 *raw_capacity;
|
2017-06-23 09:25:33 +00:00
|
|
|
|
2017-10-10 07:34:56 +00:00
|
|
|
static int free_raw_capacity(void)
|
2017-06-23 09:25:33 +00:00
|
|
|
{
|
|
|
|
kfree(raw_capacity);
|
|
|
|
raw_capacity = NULL;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
2017-05-31 16:59:28 +00:00
|
|
|
|
2017-05-31 16:59:31 +00:00
|
|
|
void topology_normalize_cpu_scale(void)
|
2017-05-31 16:59:28 +00:00
|
|
|
{
|
|
|
|
u64 capacity;
|
2020-01-13 03:48:15 +00:00
|
|
|
u64 capacity_scale;
|
2017-05-31 16:59:28 +00:00
|
|
|
int cpu;
|
|
|
|
|
2017-06-23 09:25:33 +00:00
|
|
|
if (!raw_capacity)
|
2017-05-31 16:59:28 +00:00
|
|
|
return;
|
|
|
|
|
2020-01-13 03:48:15 +00:00
|
|
|
capacity_scale = 1;
|
2017-05-31 16:59:28 +00:00
|
|
|
for_each_possible_cpu(cpu) {
|
2023-12-11 10:48:49 +00:00
|
|
|
capacity = raw_capacity[cpu] * per_cpu(capacity_freq_ref, cpu);
|
2020-01-13 03:48:15 +00:00
|
|
|
capacity_scale = max(capacity, capacity_scale);
|
|
|
|
}
|
|
|
|
|
|
|
|
pr_debug("cpu_capacity: capacity_scale=%llu\n", capacity_scale);
|
|
|
|
for_each_possible_cpu(cpu) {
|
2023-12-11 10:48:49 +00:00
|
|
|
capacity = raw_capacity[cpu] * per_cpu(capacity_freq_ref, cpu);
|
2020-01-13 03:48:15 +00:00
|
|
|
capacity = div64_u64(capacity << SCHED_CAPACITY_SHIFT,
|
|
|
|
capacity_scale);
|
2017-05-31 16:59:31 +00:00
|
|
|
topology_set_cpu_scale(cpu, capacity);
|
2017-05-31 16:59:28 +00:00
|
|
|
pr_debug("cpu_capacity: CPU%d cpu_capacity=%lu\n",
|
2019-06-17 15:00:17 +00:00
|
|
|
cpu, topology_get_cpu_scale(cpu));
|
2017-05-31 16:59:28 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-06-23 09:25:32 +00:00
|
|
|
/*
 * Read the "capacity-dmips-mhz" property of @cpu_node and record it as the
 * raw capacity of @cpu.
 *
 * The raw capacity table is allocated on the first successful read.  If the
 * property is missing, or the allocation fails, parsing is marked as failed:
 * the table is released (when present) and every later call returns false
 * without doing anything.
 *
 * Return: true when the raw capacity of @cpu was recorded, false otherwise.
 */
bool __init topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu)
{
	static bool cap_parsing_failed;
	struct clk *cpu_clk;
	u32 cpu_capacity;
	int ret;

	if (cap_parsing_failed)
		return false;

	ret = of_property_read_u32(cpu_node, "capacity-dmips-mhz",
				   &cpu_capacity);
	if (ret) {
		/* Only complain when earlier CPUs did provide the property. */
		if (raw_capacity) {
			pr_err("cpu_capacity: missing %pOF raw capacity\n",
				cpu_node);
			pr_err("cpu_capacity: partial information: fallback to 1024 for all CPUs\n");
		}
		cap_parsing_failed = true;
		free_raw_capacity();
		return false;
	}

	if (!raw_capacity) {
		raw_capacity = kcalloc(num_possible_cpus(),
				       sizeof(*raw_capacity),
				       GFP_KERNEL);
		if (!raw_capacity) {
			cap_parsing_failed = true;
			return false;
		}
	}

	raw_capacity[cpu] = cpu_capacity;
	pr_debug("cpu_capacity: %pOF cpu_capacity=%u (raw)\n",
		cpu_node, raw_capacity[cpu]);

	/*
	 * Update capacity_freq_ref so early boot CPU capacities can be
	 * computed.  With a non-clk CPU DVFS mechanism there is no way to get
	 * the frequency at this point; the initial capacity_freq_ref value is
	 * kept, which amounts to assuming all CPUs run at the same frequency.
	 */
	cpu_clk = of_clk_get(cpu_node, 0);
	if (!PTR_ERR_OR_ZERO(cpu_clk)) {
		per_cpu(capacity_freq_ref, cpu) =
			clk_get_rate(cpu_clk) / HZ_PER_KHZ;
		clk_put(cpu_clk);
	}

	return true;
}
|
|
|
|
|
2022-03-10 14:54:50 +00:00
|
|
|
#ifdef CONFIG_ACPI_CPPC_LIB
#include <acpi/cppc_acpi.h>

/*
 * Initialize the CPU capacities from the CPPC performance capabilities.
 *
 * For every possible CPU the highest performance value is used as the raw
 * capacity, and capacity_freq_ref is set from that value through
 * cppc_perf_to_khz().  The raw capacities are then divided by the largest
 * one, after a left shift by SCHED_CAPACITY_SHIFT, and stored with
 * topology_set_cpu_scale().  Finally a sched_domain rebuild is scheduled.
 *
 * Nothing is stored with topology_set_cpu_scale() when:
 *  - acpi_cpc_valid() reports false or the raw table cannot be allocated;
 *  - a CPU has no readable capabilities, or its highest performance is lower
 *    than its nominal or lowest performance;
 *  - every CPU reports a highest performance of 0, which would otherwise be
 *    used as the divisor.
 *
 * The raw capacity table is released before returning.
 */
void topology_init_cpu_capacity_cppc(void)
{
	u64 capacity, capacity_scale = 0;
	struct cppc_perf_caps perf_caps;
	int cpu;

	if (likely(!acpi_cpc_valid()))
		return;

	raw_capacity = kcalloc(num_possible_cpus(), sizeof(*raw_capacity),
			       GFP_KERNEL);
	if (!raw_capacity)
		return;

	for_each_possible_cpu(cpu) {
		if (!cppc_get_perf_caps(cpu, &perf_caps) &&
		    (perf_caps.highest_perf >= perf_caps.nominal_perf) &&
		    (perf_caps.highest_perf >= perf_caps.lowest_perf)) {
			raw_capacity[cpu] = perf_caps.highest_perf;
			capacity_scale = max_t(u64, capacity_scale, raw_capacity[cpu]);

			per_cpu(capacity_freq_ref, cpu) = cppc_perf_to_khz(&perf_caps, raw_capacity[cpu]);

			pr_debug("cpu_capacity: CPU%d cpu_capacity=%u (raw).\n",
				 cpu, raw_capacity[cpu]);
			continue;
		}

		pr_err("cpu_capacity: CPU%d missing/invalid highest performance.\n", cpu);
		pr_err("cpu_capacity: partial information: fallback to 1024 for all CPUs\n");
		goto exit;
	}

	/*
	 * The checks above accept highest_perf == 0.  If every CPU reports 0
	 * the scale is still 0 and must not be used as a divisor.
	 */
	if (!capacity_scale) {
		pr_err("cpu_capacity: highest performance is 0 on all CPUs: fallback to 1024 for all CPUs\n");
		goto exit;
	}

	for_each_possible_cpu(cpu) {
		capacity = raw_capacity[cpu];
		capacity = div64_u64(capacity << SCHED_CAPACITY_SHIFT,
				     capacity_scale);
		topology_set_cpu_scale(cpu, capacity);
		pr_debug("cpu_capacity: CPU%d cpu_capacity=%lu\n",
			cpu, topology_get_cpu_scale(cpu));
	}

	schedule_work(&update_topology_flags_work);
	pr_debug("cpu_capacity: cpu_capacity initialization done\n");

exit:
	free_raw_capacity();
}
#endif
|
|
|
|
|
2017-05-31 16:59:28 +00:00
|
|
|
#ifdef CONFIG_CPU_FREQ
|
2018-02-13 02:06:40 +00:00
|
|
|
static cpumask_var_t cpus_to_visit;
|
|
|
|
static void parsing_done_workfn(struct work_struct *work);
|
|
|
|
static DECLARE_WORK(parsing_done_work, parsing_done_workfn);
|
2017-05-31 16:59:28 +00:00
|
|
|
|
2018-02-13 02:06:40 +00:00
|
|
|
static int
|
2017-05-31 16:59:28 +00:00
|
|
|
init_cpu_capacity_callback(struct notifier_block *nb,
|
|
|
|
unsigned long val,
|
|
|
|
void *data)
|
|
|
|
{
|
|
|
|
struct cpufreq_policy *policy = data;
|
|
|
|
int cpu;
|
|
|
|
|
2017-06-23 09:25:34 +00:00
|
|
|
if (!raw_capacity)
|
2017-05-31 16:59:28 +00:00
|
|
|
return 0;
|
|
|
|
|
2019-07-23 06:14:06 +00:00
|
|
|
if (val != CPUFREQ_CREATE_POLICY)
|
2017-06-23 09:25:31 +00:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
pr_debug("cpu_capacity: init cpu capacity for CPUs [%*pbl] (to_visit=%*pbl)\n",
|
|
|
|
cpumask_pr_args(policy->related_cpus),
|
|
|
|
cpumask_pr_args(cpus_to_visit));
|
|
|
|
|
|
|
|
cpumask_andnot(cpus_to_visit, cpus_to_visit, policy->related_cpus);
|
|
|
|
|
2020-01-13 03:48:15 +00:00
|
|
|
for_each_cpu(cpu, policy->related_cpus)
|
2023-12-11 10:48:49 +00:00
|
|
|
per_cpu(capacity_freq_ref, cpu) = policy->cpuinfo.max_freq;
|
2017-06-23 09:25:31 +00:00
|
|
|
|
|
|
|
if (cpumask_empty(cpus_to_visit)) {
|
|
|
|
topology_normalize_cpu_scale();
|
2018-07-20 13:32:32 +00:00
|
|
|
schedule_work(&update_topology_flags_work);
|
2017-06-23 09:25:33 +00:00
|
|
|
free_raw_capacity();
|
2017-06-23 09:25:31 +00:00
|
|
|
pr_debug("cpu_capacity: parsing done\n");
|
|
|
|
schedule_work(&parsing_done_work);
|
|
|
|
}
|
|
|
|
|
2017-05-31 16:59:28 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-02-13 02:06:40 +00:00
|
|
|
static struct notifier_block init_cpu_capacity_notifier = {
|
2017-05-31 16:59:28 +00:00
|
|
|
.notifier_call = init_cpu_capacity_callback,
|
|
|
|
};
|
|
|
|
|
|
|
|
static int __init register_cpufreq_notifier(void)
|
|
|
|
{
|
2017-09-26 16:41:06 +00:00
|
|
|
int ret;
|
|
|
|
|
2017-05-31 16:59:28 +00:00
|
|
|
/*
|
2022-03-10 14:54:50 +00:00
|
|
|
* On ACPI-based systems skip registering cpufreq notifier as cpufreq
|
|
|
|
* information is not needed for cpu capacity initialization.
|
2017-05-31 16:59:28 +00:00
|
|
|
*/
|
2017-05-31 16:59:29 +00:00
|
|
|
if (!acpi_disabled || !raw_capacity)
|
2017-05-31 16:59:28 +00:00
|
|
|
return -EINVAL;
|
|
|
|
|
2019-05-27 12:27:03 +00:00
|
|
|
if (!alloc_cpumask_var(&cpus_to_visit, GFP_KERNEL))
|
2017-05-31 16:59:28 +00:00
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
cpumask_copy(cpus_to_visit, cpu_possible_mask);
|
|
|
|
|
2017-09-26 16:41:06 +00:00
|
|
|
ret = cpufreq_register_notifier(&init_cpu_capacity_notifier,
|
|
|
|
CPUFREQ_POLICY_NOTIFIER);
|
|
|
|
|
|
|
|
if (ret)
|
|
|
|
free_cpumask_var(cpus_to_visit);
|
|
|
|
|
|
|
|
return ret;
|
2017-05-31 16:59:28 +00:00
|
|
|
}
|
|
|
|
core_initcall(register_cpufreq_notifier);
|
|
|
|
|
2018-02-13 02:06:40 +00:00
|
|
|
static void parsing_done_workfn(struct work_struct *work)
|
2017-05-31 16:59:28 +00:00
|
|
|
{
|
|
|
|
cpufreq_unregister_notifier(&init_cpu_capacity_notifier,
|
|
|
|
CPUFREQ_POLICY_NOTIFIER);
|
2017-09-26 16:41:06 +00:00
|
|
|
free_cpumask_var(cpus_to_visit);
|
2017-05-31 16:59:28 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
#else
|
|
|
|
core_initcall(free_raw_capacity);
|
|
|
|
#endif
|
2019-06-27 19:52:58 +00:00
|
|
|
|
|
|
|
#if defined(CONFIG_ARM64) || defined(CONFIG_RISCV)
|
2020-01-17 01:52:52 +00:00
|
|
|
/*
|
|
|
|
* This function returns the logic cpu number of the node.
|
|
|
|
* There are basically three kinds of return values:
|
|
|
|
* (1) logic cpu number which is > 0.
|
|
|
|
* (2) -ENODEV when the device tree(DT) node is valid and found in the DT but
|
|
|
|
* there is no possible logical CPU in the kernel to match. This happens
|
|
|
|
* when CONFIG_NR_CPUS is configure to be smaller than the number of
|
|
|
|
* CPU nodes in DT. We need to just ignore this case.
|
|
|
|
* (3) -1 if the node does not exist in the device tree
|
|
|
|
*/
|
2019-06-27 19:52:58 +00:00
|
|
|
static int __init get_cpu_for_node(struct device_node *node)
|
|
|
|
{
|
|
|
|
struct device_node *cpu_node;
|
|
|
|
int cpu;
|
|
|
|
|
|
|
|
cpu_node = of_parse_phandle(node, "cpu", 0);
|
|
|
|
if (!cpu_node)
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
cpu = of_cpu_node_to_id(cpu_node);
|
|
|
|
if (cpu >= 0)
|
|
|
|
topology_parse_cpu_capacity(cpu_node, cpu);
|
|
|
|
else
|
2020-01-17 01:52:52 +00:00
|
|
|
pr_info("CPU node for %pOF exist but the possible cpu range is :%*pbl\n",
|
|
|
|
cpu_node, cpumask_pr_args(cpu_possible_mask));
|
2019-06-27 19:52:58 +00:00
|
|
|
|
|
|
|
of_node_put(cpu_node);
|
|
|
|
return cpu;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Fill in the cpu_topology entries described by a "coreN" node.
 *
 * The core either has "threadN" children, each referring to one CPU, or it
 * refers to a CPU itself.  Every CPU found gets @package_id, @cluster_id and
 * @core_id; CPUs found through a thread node also get the thread index.
 *
 * Return: 0 on success, -EINVAL when a thread or a leaf core has no usable
 * CPU (other than the ignored -ENODEV case) or when a core has both threads
 * and a CPU.
 */
static int __init parse_core(struct device_node *core, int package_id,
			     int cluster_id, int core_id)
{
	char name[20];
	bool leaf = true;
	struct device_node *thread;
	int cpu;
	int i;

	for (i = 0; ; i++) {
		snprintf(name, sizeof(name), "thread%d", i);
		thread = of_get_child_by_name(core, name);
		if (!thread)
			break;

		leaf = false;
		cpu = get_cpu_for_node(thread);
		if (cpu >= 0) {
			cpu_topology[cpu].package_id = package_id;
			cpu_topology[cpu].cluster_id = cluster_id;
			cpu_topology[cpu].core_id = core_id;
			cpu_topology[cpu].thread_id = i;
		} else if (cpu != -ENODEV) {
			pr_err("%pOF: Can't get CPU for thread\n", thread);
			of_node_put(thread);
			return -EINVAL;
		}
		of_node_put(thread);
	}

	cpu = get_cpu_for_node(core);
	if (cpu < 0) {
		/* A core with threads does not need a CPU of its own. */
		if (leaf && cpu != -ENODEV) {
			pr_err("%pOF: Can't get CPU for leaf core\n", core);
			return -EINVAL;
		}
		return 0;
	}

	if (!leaf) {
		pr_err("%pOF: Core has both threads and CPU\n",
		       core);
		return -EINVAL;
	}

	cpu_topology[cpu].package_id = package_id;
	cpu_topology[cpu].cluster_id = cluster_id;
	cpu_topology[cpu].core_id = core_id;

	return 0;
}
|
|
|
|
|
2022-07-04 10:16:03 +00:00
|
|
|
/*
 * Parse a cluster node: first its "clusterN" children (recursively), then
 * its "coreN" children.
 *
 * @depth is 0 for the node handed in by parse_socket() and grows by one per
 * nesting level.  Cores are rejected at depth 0 and in clusters that also
 * contain child clusters.
 *
 * Return: 0 on success, otherwise the first error reported by a child or
 * -EINVAL for a malformed layout.
 */
static int __init parse_cluster(struct device_node *cluster, int package_id,
				int cluster_id, int depth)
{
	char name[20];
	bool leaf = true;
	bool has_cores = false;
	struct device_node *child;
	int core_id = 0;
	int i, ret;

	/*
	 * Child clusters come first.  Information about the nesting of
	 * clusters is currently ignored and the scheduler is presented with
	 * a flat list of them.
	 */
	for (i = 0; ; i++) {
		snprintf(name, sizeof(name), "cluster%d", i);
		child = of_get_child_by_name(cluster, name);
		if (!child)
			break;

		leaf = false;
		ret = parse_cluster(child, package_id, i, depth + 1);
		if (depth > 0)
			pr_warn("Topology for clusters of clusters not yet supported\n");
		of_node_put(child);
		if (ret != 0)
			return ret;
	}

	/* Then the cores. */
	for (i = 0; ; i++) {
		snprintf(name, sizeof(name), "core%d", i);
		child = of_get_child_by_name(cluster, name);
		if (!child)
			break;

		has_cores = true;

		if (depth == 0) {
			pr_err("%pOF: cpu-map children should be clusters\n",
			       child);
			of_node_put(child);
			return -EINVAL;
		}

		if (leaf) {
			ret = parse_core(child, package_id, cluster_id,
					 core_id++);
		} else {
			pr_err("%pOF: Non-leaf cluster with core %s\n",
			       cluster, name);
			ret = -EINVAL;
		}

		of_node_put(child);
		if (ret != 0)
			return ret;
	}

	if (leaf && !has_cores)
		pr_warn("%pOF: empty cluster\n", cluster);

	return 0;
}
|
|
|
|
|
2022-07-04 10:16:03 +00:00
|
|
|
/*
 * Parse the "socketN" children of @socket, using N as the package id.
 *
 * When no "socket0" child exists, @socket itself is parsed as the root
 * cluster of package 0.
 *
 * Return: 0 on success, otherwise the first error from parse_cluster().
 */
static int __init parse_socket(struct device_node *socket)
{
	char name[20];
	struct device_node *child;
	bool has_socket = false;
	int package_id;

	for (package_id = 0; ; package_id++) {
		int ret;

		snprintf(name, sizeof(name), "socket%d", package_id);
		child = of_get_child_by_name(socket, name);
		if (!child)
			break;

		has_socket = true;
		ret = parse_cluster(child, package_id, -1, 0);
		of_node_put(child);
		if (ret != 0)
			return ret;
	}

	/* Every socket that was found parsed successfully. */
	if (has_socket)
		return 0;

	return parse_cluster(socket, 0, -1, 0);
}
|
|
|
|
|
2019-06-27 19:52:58 +00:00
|
|
|
/*
 * Build the CPU topology from the "cpu-map" node found under "/cpus".
 *
 * Return: 0 when "/cpus" or "cpu-map" is absent or when parsing succeeds,
 * the error from parse_socket() when parsing fails, and -EINVAL when a
 * possible CPU is left without a package id.
 */
static int __init parse_dt_topology(void)
{
	struct device_node *cpus, *map;
	int ret = 0;
	int cpu;

	cpus = of_find_node_by_path("/cpus");
	if (!cpus) {
		pr_err("No CPU information found in DT\n");
		return 0;
	}

	/*
	 * When a topology is provided, cpu-map is essentially a root cluster
	 * with restricted subnodes.
	 */
	map = of_get_child_by_name(cpus, "cpu-map");
	if (!map)
		goto out;

	ret = parse_socket(map);
	if (ret != 0)
		goto out_map;

	topology_normalize_cpu_scale();

	/*
	 * Check that all cores are in the topology; the SMP code will only
	 * mark cores described in the DT as possible.
	 */
	for_each_possible_cpu(cpu) {
		if (cpu_topology[cpu].package_id < 0) {
			ret = -EINVAL;
			break;
		}
	}

out_map:
	of_node_put(map);
out:
	of_node_put(cpus);
	return ret;
}
|
2019-06-27 19:52:59 +00:00
|
|
|
#endif
|
2019-06-27 19:52:58 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* cpu topology table
|
|
|
|
*/
|
|
|
|
struct cpu_topology cpu_topology[NR_CPUS];
|
|
|
|
EXPORT_SYMBOL_GPL(cpu_topology);
|
|
|
|
|
|
|
|
/*
 * Return the core-group cpumask of @cpu.
 *
 * Starts from the cpumask of the NUMA node of @cpu, replaces it with the
 * package (core) siblings when those are a subset of it, then with the LLC
 * siblings when the last level cache is valid and those are a subset of the
 * current choice.  With CONFIG_SCHED_CLUSTER enabled, the result is finally
 * widened to the cluster siblings when it is a subset of them.
 */
const struct cpumask *cpu_coregroup_mask(int cpu)
|
|
|
|
{
|
|
|
|
const cpumask_t *core_mask = cpumask_of_node(cpu_to_node(cpu));
|
|
|
|
|
|
|
|
/* Find the smaller of NUMA, core or LLC siblings */
|
|
|
|
if (cpumask_subset(&cpu_topology[cpu].core_sibling, core_mask)) {
|
|
|
|
/* not numa in package, let's use the package siblings */
|
|
|
|
core_mask = &cpu_topology[cpu].core_sibling;
|
|
|
|
}
|
2022-07-04 10:15:54 +00:00
|
|
|
|
|
|
|
if (last_level_cache_is_valid(cpu)) {
|
2019-06-27 19:52:58 +00:00
|
|
|
if (cpumask_subset(&cpu_topology[cpu].llc_sibling, core_mask))
|
|
|
|
core_mask = &cpu_topology[cpu].llc_sibling;
|
|
|
|
}
|
|
|
|
|
topology: make core_mask include at least cluster_siblings
Ampere Altra defines CPU clusters in the ACPI PPTT. They share a Snoop
Control Unit, but have no shared CPU-side last level cache.
cpu_coregroup_mask() will return a cpumask with weight 1, while
cpu_clustergroup_mask() will return a cpumask with weight 2.
As a result, build_sched_domain() will BUG() once per CPU with:
BUG: arch topology borken
the CLS domain not a subset of the MC domain
The MC level cpumask is then extended to that of the CLS child, and is
later removed entirely as redundant. This sched domain topology is an
improvement over previous topologies, or those built without
SCHED_CLUSTER, particularly for certain latency sensitive workloads.
With the current scheduler model and heuristics, this is a desirable
default topology for Ampere Altra and Altra Max system.
Rather than create a custom sched domains topology structure and
introduce new logic in arch/arm64 to detect these systems, update the
core_mask so coregroup is never a subset of clustergroup, extending it
to cluster_siblings if necessary. Only do this if CONFIG_SCHED_CLUSTER
is enabled to avoid also changing the topology (MC) when
CONFIG_SCHED_CLUSTER is disabled.
This has the added benefit over a custom topology of working for both
symmetric and asymmetric topologies. It does not address systems where
the CLUSTER topology is above a populated MC topology, but these are not
considered today and can be addressed separately if and when they
appear.
The final sched domain topology for a 2 socket Ampere Altra system is
unchanged with or without CONFIG_SCHED_CLUSTER, and the BUG is avoided:
For CPU0:
CONFIG_SCHED_CLUSTER=y
CLS [0-1]
DIE [0-79]
NUMA [0-159]
CONFIG_SCHED_CLUSTER is not set
DIE [0-79]
NUMA [0-159]
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: D. Scott Phillips <scott@os.amperecomputing.com>
Cc: Ilkka Koskinen <ilkka@os.amperecomputing.com>
Cc: <stable@vger.kernel.org> # 5.16.x
Suggested-by: Barry Song <song.bao.hua@hisilicon.com>
Reviewed-by: Barry Song <song.bao.hua@hisilicon.com>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Acked-by: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Darren Hart <darren@os.amperecomputing.com>
Link: https://lore.kernel.org/r/c8fe9fce7c86ed56b4c455b8c902982dc2303868.1649696956.git.darren@os.amperecomputing.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2022-04-11 20:53:34 +00:00
|
|
|
/*
|
|
|
|
* For systems with no shared cpu-side LLC but with clusters defined,
|
|
|
|
* extend core_mask to cluster_siblings. The sched domain builder will
|
|
|
|
* then remove MC as redundant with CLS if SCHED_CLUSTER is enabled.
|
|
|
|
*/
|
|
|
|
if (IS_ENABLED(CONFIG_SCHED_CLUSTER) &&
|
|
|
|
cpumask_subset(core_mask, &cpu_topology[cpu].cluster_sibling))
|
|
|
|
core_mask = &cpu_topology[cpu].cluster_sibling;
|
|
|
|
|
2019-06-27 19:52:58 +00:00
|
|
|
return core_mask;
|
|
|
|
}
|
|
|
|
|
2021-09-24 08:51:02 +00:00
|
|
|
const struct cpumask *cpu_clustergroup_mask(int cpu)
|
|
|
|
{
|
2022-07-04 10:16:01 +00:00
|
|
|
/*
|
|
|
|
* Forbid cpu_clustergroup_mask() to span more or the same CPUs as
|
|
|
|
* cpu_coregroup_mask().
|
|
|
|
*/
|
|
|
|
if (cpumask_subset(cpu_coregroup_mask(cpu),
|
|
|
|
&cpu_topology[cpu].cluster_sibling))
|
2022-09-05 12:26:15 +00:00
|
|
|
return topology_sibling_cpumask(cpu);
|
2022-07-04 10:16:01 +00:00
|
|
|
|
2021-09-24 08:51:02 +00:00
|
|
|
return &cpu_topology[cpu].cluster_sibling;
|
|
|
|
}
|
|
|
|
|
2019-06-27 19:52:58 +00:00
|
|
|
/*
 * Update the sibling masks of @cpuid and of every online CPU related to it.
 *
 * detect_cache_attributes() is called first for @cpuid; a failure other than
 * -ENOENT is only logged.  Then, for each online CPU, both CPUs are marked
 * in each other's mask at every level they share, in this order:
 *  - llc_sibling, when last_level_cache_is_shared() says so;
 *  - core_sibling, when the package ids match;
 *  - cluster_sibling, when the cluster ids also match and are >= 0;
 *  - thread_sibling, when the core ids also match.
 * A mismatch at the package, cluster or core level skips the remaining
 * levels for that CPU.
 */
void update_siblings_masks(unsigned int cpuid)
|
|
|
|
{
|
|
|
|
struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid];
|
2022-07-20 12:55:40 +00:00
|
|
|
int cpu, ret;
|
|
|
|
|
|
|
|
ret = detect_cache_attributes(cpuid);
|
2022-08-05 23:07:36 +00:00
|
|
|
if (ret && ret != -ENOENT)
|
arch_topology: Build cacheinfo from primary CPU
commit 3fcbf1c77d08 ("arch_topology: Fix cache attributes detection
in the CPU hotplug path")
adds a call to detect_cache_attributes() to populate the cacheinfo
before updating the siblings mask. detect_cache_attributes() allocates
memory and can take the PPTT mutex (on ACPI platforms). On PREEMPT_RT
kernels, on secondary CPUs, this triggers a:
'BUG: sleeping function called from invalid context' [1]
as the code is executed with preemption and interrupts disabled.
The primary CPU was previously storing the cache information using
the now removed (struct cpu_topology).llc_id:
commit 5b8dc787ce4a ("arch_topology: Drop LLC identifier stash from
the CPU topology")
allocate_cache_info() tries to build the cacheinfo from the primary
CPU prior secondary CPUs boot, if the DT/ACPI description
contains cache information.
If allocate_cache_info() fails, then fallback to the current state
for the cacheinfo allocation. [1] will be triggered in such case.
When unplugging a CPU, the cacheinfo memory cannot be freed. If it
was, then the memory would be allocated early by the re-plugged
CPU and would trigger [1].
Note that populate_cache_leaves() might be called multiple times
due to populate_leaves being moved up. This is required since
detect_cache_attributes() might be called with per_cpu_cacheinfo(cpu)
being allocated but not populated.
[1]:
| BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:46
| in_atomic(): 1, irqs_disabled(): 128, non_block: 0, pid: 0, name: swapper/111
| preempt_count: 1, expected: 0
| RCU nest depth: 1, expected: 1
| 3 locks held by swapper/111/0:
| #0: (&pcp->lock){+.+.}-{3:3}, at: get_page_from_freelist+0x218/0x12c8
| #1: (rcu_read_lock){....}-{1:3}, at: rt_spin_trylock+0x48/0xf0
| #2: (&zone->lock){+.+.}-{3:3}, at: rmqueue_bulk+0x64/0xa80
| irq event stamp: 0
| hardirqs last enabled at (0): 0x0
| hardirqs last disabled at (0): copy_process+0x5dc/0x1ab8
| softirqs last enabled at (0): copy_process+0x5dc/0x1ab8
| softirqs last disabled at (0): 0x0
| Preemption disabled at:
| migrate_enable+0x30/0x130
| CPU: 111 PID: 0 Comm: swapper/111 Tainted: G W 6.0.0-rc4-rt6-[...]
| Call trace:
| __kmalloc+0xbc/0x1e8
| detect_cache_attributes+0x2d4/0x5f0
| update_siblings_masks+0x30/0x368
| store_cpu_topology+0x78/0xb8
| secondary_start_kernel+0xd0/0x198
| __secondary_switched+0xb0/0xb4
Signed-off-by: Pierre Gondois <pierre.gondois@arm.com>
Reviewed-by: Sudeep Holla <sudeep.holla@arm.com>
Acked-by: Palmer Dabbelt <palmer@rivosinc.com>
Link: https://lore.kernel.org/r/20230104183033.755668-7-pierre.gondois@arm.com
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
2023-01-04 18:30:29 +00:00
|
|
|
pr_info("Early cacheinfo allocation failed, ret = %d\n", ret);
|
2019-06-27 19:52:58 +00:00
|
|
|
|
|
|
|
|
/* update core and thread sibling masks */
|
|
|
|
for_each_online_cpu(cpu) {
|
|
|
|
cpu_topo = &cpu_topology[cpu];
|
|
|
|
|
2022-07-04 10:15:54 +00:00
|
|
|
if (last_level_cache_is_shared(cpu, cpuid)) {
|
2019-06-27 19:52:58 +00:00
|
|
|
cpumask_set_cpu(cpu, &cpuid_topo->llc_sibling);
|
|
|
|
cpumask_set_cpu(cpuid, &cpu_topo->llc_sibling);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (cpuid_topo->package_id != cpu_topo->package_id)
|
|
|
|
continue;
|
|
|
|
|
2022-07-04 10:15:57 +00:00
|
|
|
cpumask_set_cpu(cpuid, &cpu_topo->core_sibling);
|
|
|
|
cpumask_set_cpu(cpu, &cpuid_topo->core_sibling);
|
|
|
|
|
|
|
|
if (cpuid_topo->cluster_id != cpu_topo->cluster_id)
|
|
|
|
continue;
|
|
|
|
|
2022-07-04 10:15:58 +00:00
|
|
|
if (cpuid_topo->cluster_id >= 0) {
|
2021-09-24 08:51:02 +00:00
|
|
|
cpumask_set_cpu(cpu, &cpuid_topo->cluster_sibling);
|
|
|
|
cpumask_set_cpu(cpuid, &cpu_topo->cluster_sibling);
|
|
|
|
}
|
|
|
|
|
2019-06-27 19:52:58 +00:00
|
|
|
if (cpuid_topo->core_id != cpu_topo->core_id)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
cpumask_set_cpu(cpuid, &cpu_topo->thread_sibling);
|
|
|
|
cpumask_set_cpu(cpu, &cpuid_topo->thread_sibling);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void clear_cpu_topology(int cpu)
|
|
|
|
{
|
|
|
|
struct cpu_topology *cpu_topo = &cpu_topology[cpu];
|
|
|
|
|
|
|
|
cpumask_clear(&cpu_topo->llc_sibling);
|
|
|
|
cpumask_set_cpu(cpu, &cpu_topo->llc_sibling);
|
|
|
|
|
2021-09-24 08:51:02 +00:00
|
|
|
cpumask_clear(&cpu_topo->cluster_sibling);
|
|
|
|
cpumask_set_cpu(cpu, &cpu_topo->cluster_sibling);
|
|
|
|
|
2019-06-27 19:52:58 +00:00
|
|
|
cpumask_clear(&cpu_topo->core_sibling);
|
|
|
|
cpumask_set_cpu(cpu, &cpu_topo->core_sibling);
|
|
|
|
cpumask_clear(&cpu_topo->thread_sibling);
|
|
|
|
cpumask_set_cpu(cpu, &cpu_topo->thread_sibling);
|
|
|
|
}
|
|
|
|
|
2019-06-27 19:52:59 +00:00
|
|
|
/*
 * Put the topology entry of every possible CPU back to its unknown state:
 * all ids are set to -1 and the sibling masks are reset through
 * clear_cpu_topology().
 */
void __init reset_cpu_topology(void)
{
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		struct cpu_topology *topo = &cpu_topology[cpu];

		topo->package_id = -1;
		topo->cluster_id = -1;
		topo->core_id = -1;
		topo->thread_id = -1;

		clear_cpu_topology(cpu);
	}
}
|
|
|
|
|
|
|
|
void remove_cpu_topology(unsigned int cpu)
|
|
|
|
{
|
|
|
|
int sibling;
|
|
|
|
|
|
|
|
for_each_cpu(sibling, topology_core_cpumask(cpu))
|
|
|
|
cpumask_clear_cpu(cpu, topology_core_cpumask(sibling));
|
|
|
|
for_each_cpu(sibling, topology_sibling_cpumask(cpu))
|
|
|
|
cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling));
|
2021-11-10 09:58:56 +00:00
|
|
|
for_each_cpu(sibling, topology_cluster_cpumask(cpu))
|
|
|
|
cpumask_clear_cpu(cpu, topology_cluster_cpumask(sibling));
|
2019-06-27 19:52:58 +00:00
|
|
|
for_each_cpu(sibling, topology_llc_cpumask(cpu))
|
|
|
|
cpumask_clear_cpu(cpu, topology_llc_cpumask(sibling));
|
|
|
|
|
|
|
|
clear_cpu_topology(cpu);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Weak default for ACPI topology parsing: does nothing.
 *
 * Return: always 0.
 */
__weak int __init parse_acpi_topology(void)
{
	return 0;
}
|
|
|
|
|
2019-06-27 19:52:59 +00:00
|
|
|
#if defined(CONFIG_ARM64) || defined(CONFIG_RISCV)
|
2019-06-27 19:52:58 +00:00
|
|
|
void __init init_cpu_topology(void)
|
|
|
|
{
|
arch_topology: Build cacheinfo from primary CPU
commit 3fcbf1c77d08 ("arch_topology: Fix cache attributes detection
in the CPU hotplug path")
adds a call to detect_cache_attributes() to populate the cacheinfo
before updating the siblings mask. detect_cache_attributes() allocates
memory and can take the PPTT mutex (on ACPI platforms). On PREEMPT_RT
kernels, on secondary CPUs, this triggers a:
'BUG: sleeping function called from invalid context' [1]
as the code is executed with preemption and interrupts disabled.
The primary CPU was previously storing the cache information using
the now removed (struct cpu_topology).llc_id:
commit 5b8dc787ce4a ("arch_topology: Drop LLC identifier stash from
the CPU topology")
allocate_cache_info() tries to build the cacheinfo from the primary
CPU prior secondary CPUs boot, if the DT/ACPI description
contains cache information.
If allocate_cache_info() fails, then fallback to the current state
for the cacheinfo allocation. [1] will be triggered in such case.
When unplugging a CPU, the cacheinfo memory cannot be freed. If it
was, then the memory would be allocated early by the re-plugged
CPU and would trigger [1].
Note that populate_cache_leaves() might be called multiple times
due to populate_leaves being moved up. This is required since
detect_cache_attributes() might be called with per_cpu_cacheinfo(cpu)
being allocated but not populated.
[1]:
| BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:46
| in_atomic(): 1, irqs_disabled(): 128, non_block: 0, pid: 0, name: swapper/111
| preempt_count: 1, expected: 0
| RCU nest depth: 1, expected: 1
| 3 locks held by swapper/111/0:
| #0: (&pcp->lock){+.+.}-{3:3}, at: get_page_from_freelist+0x218/0x12c8
| #1: (rcu_read_lock){....}-{1:3}, at: rt_spin_trylock+0x48/0xf0
| #2: (&zone->lock){+.+.}-{3:3}, at: rmqueue_bulk+0x64/0xa80
| irq event stamp: 0
| hardirqs last enabled at (0): 0x0
| hardirqs last disabled at (0): copy_process+0x5dc/0x1ab8
| softirqs last enabled at (0): copy_process+0x5dc/0x1ab8
| softirqs last disabled at (0): 0x0
| Preemption disabled at:
| migrate_enable+0x30/0x130
| CPU: 111 PID: 0 Comm: swapper/111 Tainted: G W 6.0.0-rc4-rt6-[...]
| Call trace:
| __kmalloc+0xbc/0x1e8
| detect_cache_attributes+0x2d4/0x5f0
| update_siblings_masks+0x30/0x368
| store_cpu_topology+0x78/0xb8
| secondary_start_kernel+0xd0/0x198
| __secondary_switched+0xb0/0xb4
Signed-off-by: Pierre Gondois <pierre.gondois@arm.com>
Reviewed-by: Sudeep Holla <sudeep.holla@arm.com>
Acked-by: Palmer Dabbelt <palmer@rivosinc.com>
Link: https://lore.kernel.org/r/20230104183033.755668-7-pierre.gondois@arm.com
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
2023-01-04 18:30:29 +00:00
|
|
|
int cpu, ret;
|
2022-07-04 10:15:53 +00:00
|
|
|
|
2019-06-27 19:52:58 +00:00
|
|
|
reset_cpu_topology();
|
2022-07-04 10:15:53 +00:00
|
|
|
ret = parse_acpi_topology();
|
|
|
|
if (!ret)
|
|
|
|
ret = of_have_populated_dt() && parse_dt_topology();
|
2019-06-27 19:52:58 +00:00
|
|
|
|
2022-07-04 10:15:53 +00:00
|
|
|
if (ret) {
|
|
|
|
/*
|
|
|
|
* Discard anything that was parsed if we hit an error so we
|
2023-04-12 18:57:59 +00:00
|
|
|
* don't use partial information. But do not return yet to give
|
|
|
|
* arch-specific early cache level detection a chance to run.
|
2022-07-04 10:15:53 +00:00
|
|
|
*/
|
2019-06-27 19:52:58 +00:00
|
|
|
reset_cpu_topology();
|
2022-07-04 10:15:53 +00:00
|
|
|
}
|
arch_topology: Build cacheinfo from primary CPU
commit 3fcbf1c77d08 ("arch_topology: Fix cache attributes detection
in the CPU hotplug path")
adds a call to detect_cache_attributes() to populate the cacheinfo
before updating the siblings mask. detect_cache_attributes() allocates
memory and can take the PPTT mutex (on ACPI platforms). On PREEMPT_RT
kernels, on secondary CPUs, this triggers a:
'BUG: sleeping function called from invalid context' [1]
as the code is executed with preemption and interrupts disabled.
The primary CPU was previously storing the cache information using
the now removed (struct cpu_topology).llc_id:
commit 5b8dc787ce4a ("arch_topology: Drop LLC identifier stash from
the CPU topology")
allocate_cache_info() tries to build the cacheinfo from the primary
CPU prior to secondary CPUs booting, if the DT/ACPI description
contains cache information.
If allocate_cache_info() fails, then fall back to the current state
for the cacheinfo allocation. [1] will be triggered in such a case.
When unplugging a CPU, the cacheinfo memory cannot be freed. If it
was, then the memory would be allocated early by the re-plugged
CPU and would trigger [1].
Note that populate_cache_leaves() might be called multiple times
due to populate_leaves being moved up. This is required since
detect_cache_attributes() might be called with per_cpu_cacheinfo(cpu)
being allocated but not populated.
[1]:
| BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:46
| in_atomic(): 1, irqs_disabled(): 128, non_block: 0, pid: 0, name: swapper/111
| preempt_count: 1, expected: 0
| RCU nest depth: 1, expected: 1
| 3 locks held by swapper/111/0:
| #0: (&pcp->lock){+.+.}-{3:3}, at: get_page_from_freelist+0x218/0x12c8
| #1: (rcu_read_lock){....}-{1:3}, at: rt_spin_trylock+0x48/0xf0
| #2: (&zone->lock){+.+.}-{3:3}, at: rmqueue_bulk+0x64/0xa80
| irq event stamp: 0
| hardirqs last enabled at (0): 0x0
| hardirqs last disabled at (0): copy_process+0x5dc/0x1ab8
| softirqs last enabled at (0): copy_process+0x5dc/0x1ab8
| softirqs last disabled at (0): 0x0
| Preemption disabled at:
| migrate_enable+0x30/0x130
| CPU: 111 PID: 0 Comm: swapper/111 Tainted: G W 6.0.0-rc4-rt6-[...]
| Call trace:
| __kmalloc+0xbc/0x1e8
| detect_cache_attributes+0x2d4/0x5f0
| update_siblings_masks+0x30/0x368
| store_cpu_topology+0x78/0xb8
| secondary_start_kernel+0xd0/0x198
| __secondary_switched+0xb0/0xb4
Signed-off-by: Pierre Gondois <pierre.gondois@arm.com>
Reviewed-by: Sudeep Holla <sudeep.holla@arm.com>
Acked-by: Palmer Dabbelt <palmer@rivosinc.com>
Link: https://lore.kernel.org/r/20230104183033.755668-7-pierre.gondois@arm.com
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
2023-01-04 18:30:29 +00:00
|
|
|
|
|
|
|
for_each_possible_cpu(cpu) {
|
|
|
|
ret = fetch_cache_info(cpu);
|
2023-04-14 08:14:51 +00:00
|
|
|
if (!ret)
|
|
|
|
continue;
|
|
|
|
else if (ret != -ENOENT)
|
arch_topology: Build cacheinfo from primary CPU
commit 3fcbf1c77d08 ("arch_topology: Fix cache attributes detection
in the CPU hotplug path")
adds a call to detect_cache_attributes() to populate the cacheinfo
before updating the siblings mask. detect_cache_attributes() allocates
memory and can take the PPTT mutex (on ACPI platforms). On PREEMPT_RT
kernels, on secondary CPUs, this triggers a:
'BUG: sleeping function called from invalid context' [1]
as the code is executed with preemption and interrupts disabled.
The primary CPU was previously storing the cache information using
the now removed (struct cpu_topology).llc_id:
commit 5b8dc787ce4a ("arch_topology: Drop LLC identifier stash from
the CPU topology")
allocate_cache_info() tries to build the cacheinfo from the primary
CPU prior to secondary CPUs booting, if the DT/ACPI description
contains cache information.
If allocate_cache_info() fails, then fall back to the current state
for the cacheinfo allocation. [1] will be triggered in such a case.
When unplugging a CPU, the cacheinfo memory cannot be freed. If it
was, then the memory would be allocated early by the re-plugged
CPU and would trigger [1].
Note that populate_cache_leaves() might be called multiple times
due to populate_leaves being moved up. This is required since
detect_cache_attributes() might be called with per_cpu_cacheinfo(cpu)
being allocated but not populated.
[1]:
| BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:46
| in_atomic(): 1, irqs_disabled(): 128, non_block: 0, pid: 0, name: swapper/111
| preempt_count: 1, expected: 0
| RCU nest depth: 1, expected: 1
| 3 locks held by swapper/111/0:
| #0: (&pcp->lock){+.+.}-{3:3}, at: get_page_from_freelist+0x218/0x12c8
| #1: (rcu_read_lock){....}-{1:3}, at: rt_spin_trylock+0x48/0xf0
| #2: (&zone->lock){+.+.}-{3:3}, at: rmqueue_bulk+0x64/0xa80
| irq event stamp: 0
| hardirqs last enabled at (0): 0x0
| hardirqs last disabled at (0): copy_process+0x5dc/0x1ab8
| softirqs last enabled at (0): copy_process+0x5dc/0x1ab8
| softirqs last disabled at (0): 0x0
| Preemption disabled at:
| migrate_enable+0x30/0x130
| CPU: 111 PID: 0 Comm: swapper/111 Tainted: G W 6.0.0-rc4-rt6-[...]
| Call trace:
| __kmalloc+0xbc/0x1e8
| detect_cache_attributes+0x2d4/0x5f0
| update_siblings_masks+0x30/0x368
| store_cpu_topology+0x78/0xb8
| secondary_start_kernel+0xd0/0x198
| __secondary_switched+0xb0/0xb4
Signed-off-by: Pierre Gondois <pierre.gondois@arm.com>
Reviewed-by: Sudeep Holla <sudeep.holla@arm.com>
Acked-by: Palmer Dabbelt <palmer@rivosinc.com>
Link: https://lore.kernel.org/r/20230104183033.755668-7-pierre.gondois@arm.com
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
2023-01-04 18:30:29 +00:00
|
|
|
pr_err("Early cacheinfo failed, ret = %d\n", ret);
|
2023-04-14 08:14:51 +00:00
|
|
|
return;
|
arch_topology: Build cacheinfo from primary CPU
commit 3fcbf1c77d08 ("arch_topology: Fix cache attributes detection
in the CPU hotplug path")
adds a call to detect_cache_attributes() to populate the cacheinfo
before updating the siblings mask. detect_cache_attributes() allocates
memory and can take the PPTT mutex (on ACPI platforms). On PREEMPT_RT
kernels, on secondary CPUs, this triggers a:
'BUG: sleeping function called from invalid context' [1]
as the code is executed with preemption and interrupts disabled.
The primary CPU was previously storing the cache information using
the now removed (struct cpu_topology).llc_id:
commit 5b8dc787ce4a ("arch_topology: Drop LLC identifier stash from
the CPU topology")
allocate_cache_info() tries to build the cacheinfo from the primary
CPU prior to secondary CPUs booting, if the DT/ACPI description
contains cache information.
If allocate_cache_info() fails, then fall back to the current state
for the cacheinfo allocation. [1] will be triggered in such a case.
When unplugging a CPU, the cacheinfo memory cannot be freed. If it
was, then the memory would be allocated early by the re-plugged
CPU and would trigger [1].
Note that populate_cache_leaves() might be called multiple times
due to populate_leaves being moved up. This is required since
detect_cache_attributes() might be called with per_cpu_cacheinfo(cpu)
being allocated but not populated.
[1]:
| BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:46
| in_atomic(): 1, irqs_disabled(): 128, non_block: 0, pid: 0, name: swapper/111
| preempt_count: 1, expected: 0
| RCU nest depth: 1, expected: 1
| 3 locks held by swapper/111/0:
| #0: (&pcp->lock){+.+.}-{3:3}, at: get_page_from_freelist+0x218/0x12c8
| #1: (rcu_read_lock){....}-{1:3}, at: rt_spin_trylock+0x48/0xf0
| #2: (&zone->lock){+.+.}-{3:3}, at: rmqueue_bulk+0x64/0xa80
| irq event stamp: 0
| hardirqs last enabled at (0): 0x0
| hardirqs last disabled at (0): copy_process+0x5dc/0x1ab8
| softirqs last enabled at (0): copy_process+0x5dc/0x1ab8
| softirqs last disabled at (0): 0x0
| Preemption disabled at:
| migrate_enable+0x30/0x130
| CPU: 111 PID: 0 Comm: swapper/111 Tainted: G W 6.0.0-rc4-rt6-[...]
| Call trace:
| __kmalloc+0xbc/0x1e8
| detect_cache_attributes+0x2d4/0x5f0
| update_siblings_masks+0x30/0x368
| store_cpu_topology+0x78/0xb8
| secondary_start_kernel+0xd0/0x198
| __secondary_switched+0xb0/0xb4
Signed-off-by: Pierre Gondois <pierre.gondois@arm.com>
Reviewed-by: Sudeep Holla <sudeep.holla@arm.com>
Acked-by: Palmer Dabbelt <palmer@rivosinc.com>
Link: https://lore.kernel.org/r/20230104183033.755668-7-pierre.gondois@arm.com
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
2023-01-04 18:30:29 +00:00
|
|
|
}
|
2019-06-27 19:52:58 +00:00
|
|
|
}
|
2022-07-15 17:51:55 +00:00
|
|
|
|
|
|
|
void store_cpu_topology(unsigned int cpuid)
|
|
|
|
{
|
|
|
|
struct cpu_topology *cpuid_topo = &cpu_topology[cpuid];
|
|
|
|
|
|
|
|
if (cpuid_topo->package_id != -1)
|
|
|
|
goto topology_populated;
|
|
|
|
|
|
|
|
cpuid_topo->thread_id = -1;
|
|
|
|
cpuid_topo->core_id = cpuid;
|
|
|
|
cpuid_topo->package_id = cpu_to_node(cpuid);
|
|
|
|
|
|
|
|
pr_debug("CPU%u: package %d core %d thread %d\n",
|
|
|
|
cpuid, cpuid_topo->package_id, cpuid_topo->core_id,
|
|
|
|
cpuid_topo->thread_id);
|
|
|
|
|
|
|
|
topology_populated:
|
|
|
|
update_siblings_masks(cpuid);
|
|
|
|
}
|
2019-06-27 19:52:58 +00:00
|
|
|
#endif
|