Scheduler changes for v6.10

Merge tag 'sched-core-2024-05-13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:

 - Add cpufreq pressure feedback for the scheduler

 - Rework misfit load-balancing wrt affinity restrictions

 - Clean up and simplify the code around ::overutilized and ::overload access

 - Simplify sched_balance_newidle()

 - Bump SCHEDSTAT_VERSION to 16 due to a cleanup of CPU_MAX_IDLE_TYPES
   handling that changed the output

 - Rework & clean up <asm/vtime.h> interactions wrt arch_vtime_task_switch()

 - Reorganize, clean up and unify most of the higher level scheduler
   balancing function names around the sched_balance_*() prefix

 - Simplify the balancing flag code (sched_balance_running)

 - Miscellaneous cleanups & fixes

* tag 'sched-core-2024-05-13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (50 commits)
  sched/pelt: Remove shift of thermal clock
  sched/cpufreq: Rename arch_update_thermal_pressure() => arch_update_hw_pressure()
  thermal/cpufreq: Remove arch_update_thermal_pressure()
  sched/cpufreq: Take cpufreq feedback into account
  cpufreq: Add a cpufreq pressure feedback for the scheduler
  sched/fair: Fix update of rd->sg_overutilized
  sched/vtime: Do not include <asm/vtime.h> header
  s390/irq,nmi: Include <asm/vtime.h> header directly
  s390/vtime: Remove unused __ARCH_HAS_VTIME_TASK_SWITCH leftover
  sched/vtime: Get rid of generic vtime_task_switch() implementation
  sched/vtime: Remove confusing arch_vtime_task_switch() declaration
  sched/balancing: Simplify the sg_status bitmask and use separate ->overloaded and ->overutilized flags
  sched/fair: Rename set_rd_overutilized_status() to set_rd_overutilized()
  sched/fair: Rename SG_OVERLOAD to SG_OVERLOADED
  sched/fair: Rename {set|get}_rd_overload() to {set|get}_rd_overloaded()
  sched/fair: Rename root_domain::overload to ::overloaded
  sched/fair: Use helper functions to access root_domain::overload
  sched/fair: Check root_domain::overload value before update
  sched/fair: Combine EAS check with root_domain::overutilized access
  sched/fair: Simplify the continue_balancing logic in sched_balance_newidle()
  ...

Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in: commit 6e5a0c30b6
@@ -5826,6 +5826,7 @@
			but is useful for debugging and performance tuning.

	sched_thermal_decay_shift=
			[Deprecated]
			[KNL, SMP] Set a decay shift for scheduler thermal
			pressure signal. Thermal pressure signal follows the
			default decay period of other scheduler pelt
@@ -31,21 +31,21 @@ is treated as one entity. The load of a group is defined as the sum of the
load of each of its member CPUs, and only when the load of a group becomes
out of balance are tasks moved between groups.

In kernel/sched/core.c, trigger_load_balance() is run periodically on each CPU
through scheduler_tick(). It raises a softirq after the next regularly scheduled
In kernel/sched/core.c, sched_balance_trigger() is run periodically on each CPU
through sched_tick(). It raises a softirq after the next regularly scheduled
rebalancing event for the current runqueue has arrived. The actual load
balancing workhorse, run_rebalance_domains()->rebalance_domains(), is then run
balancing workhorse, sched_balance_softirq()->sched_balance_domains(), is then run
in softirq context (SCHED_SOFTIRQ).

The latter function takes two arguments: the runqueue of current CPU and whether
the CPU was idle at the time the scheduler_tick() happened and iterates over all
the CPU was idle at the time the sched_tick() happened and iterates over all
sched domains our CPU is on, starting from its base domain and going up the ->parent
chain. While doing that, it checks to see if the current domain has exhausted its
rebalance interval. If so, it runs load_balance() on that domain. It then checks
rebalance interval. If so, it runs sched_balance_rq() on that domain. It then checks
the parent sched_domain (if it exists), and the parent of the parent and so
forth.

Initially, load_balance() finds the busiest group in the current sched domain.
Initially, sched_balance_rq() finds the busiest group in the current sched domain.
If it succeeds, it looks for the busiest runqueue of all the CPUs' runqueues in
that group. If it manages to find such a runqueue, it locks both our initial
CPU's runqueue and the newly found busiest one and starts moving tasks from it
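The documentation hunk above describes a simple nested walk: a periodic tick raises a softirq, and the softirq handler climbs the ->parent chain of sched domains, rebalancing any domain whose interval has expired. The following userspace sketch models only that control flow; the struct fields, the now_ms() helper and the stub toy_balance_rq() are illustrative assumptions, not the kernel's actual data structures:

    #include <stdio.h>
    #include <stdbool.h>
    #include <time.h>

    /* Minimal stand-in for a sched domain: only the fields the walk needs. */
    struct toy_domain {
            const char *name;
            struct toy_domain *parent;     /* next-higher level, NULL at the top */
            unsigned long interval_ms;     /* rebalance interval for this level  */
            unsigned long last_balance_ms; /* when this level last rebalanced    */
    };

    static unsigned long now_ms(void)
    {
            struct timespec ts;
            clock_gettime(CLOCK_MONOTONIC, &ts);
            return (unsigned long)(ts.tv_sec * 1000 + ts.tv_nsec / 1000000);
    }

    /* Stub for sched_balance_rq(): here it only reports what it would do. */
    static void toy_balance_rq(struct toy_domain *sd, bool cpu_idle)
    {
            printf("balancing %s domain (cpu %s)\n", sd->name,
                   cpu_idle ? "idle" : "busy");
    }

    /* Models the domain walk: base domain -> parent -> ... */
    static void toy_balance_domains(struct toy_domain *base, bool cpu_idle)
    {
            unsigned long now = now_ms();

            for (struct toy_domain *sd = base; sd; sd = sd->parent) {
                    if (now - sd->last_balance_ms >= sd->interval_ms) {
                            toy_balance_rq(sd, cpu_idle);
                            sd->last_balance_ms = now;
                    }
            }
    }

    int main(void)
    {
            struct toy_domain pkg = { "package", NULL, 64, 0 };
            struct toy_domain smt = { "SMT", &pkg, 8, 0 };

            toy_balance_domains(&smt, true); /* one "softirq" pass */
            return 0;
    }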
@@ -2,6 +2,11 @@
Scheduler Statistics
====================

Version 16 of schedstats changed the order of definitions within
'enum cpu_idle_type', which changed the order of [CPU_MAX_IDLE_TYPES]
columns in show_schedstat(). In particular the position of CPU_IDLE
and __CPU_NOT_IDLE changed places. The size of the array is unchanged.

Version 15 of schedstats dropped counters for some sched_yield:
yld_exp_empty, yld_act_empty and yld_both_empty. Otherwise, it is
identical to version 14.

@@ -72,53 +77,53 @@ domain<N> <cpumask> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
The first field is a bit mask indicating what cpus this domain operates over.

The next 24 are a variety of load_balance() statistics in grouped into types
The next 24 are a variety of sched_balance_rq() statistics in grouped into types
of idleness (idle, busy, and newly idle):

    1)  # of times in this domain load_balance() was called when the
    1)  # of times in this domain sched_balance_rq() was called when the
        cpu was idle
    2)  # of times in this domain load_balance() checked but found
    2)  # of times in this domain sched_balance_rq() checked but found
        the load did not require balancing when the cpu was idle
    3)  # of times in this domain load_balance() tried to move one or
    3)  # of times in this domain sched_balance_rq() tried to move one or
        more tasks and failed, when the cpu was idle
    4)  sum of imbalances discovered (if any) with each call to
        load_balance() in this domain when the cpu was idle
        sched_balance_rq() in this domain when the cpu was idle
    5)  # of times in this domain pull_task() was called when the cpu
        was idle
    6)  # of times in this domain pull_task() was called even though
        the target task was cache-hot when idle
    7)  # of times in this domain load_balance() was called but did
    7)  # of times in this domain sched_balance_rq() was called but did
        not find a busier queue while the cpu was idle
    8)  # of times in this domain a busier queue was found while the
        cpu was idle but no busier group was found
    9)  # of times in this domain load_balance() was called when the
    9)  # of times in this domain sched_balance_rq() was called when the
        cpu was busy
   10)  # of times in this domain load_balance() checked but found the
   10)  # of times in this domain sched_balance_rq() checked but found the
        load did not require balancing when busy
   11)  # of times in this domain load_balance() tried to move one or
   11)  # of times in this domain sched_balance_rq() tried to move one or
        more tasks and failed, when the cpu was busy
   12)  sum of imbalances discovered (if any) with each call to
        load_balance() in this domain when the cpu was busy
        sched_balance_rq() in this domain when the cpu was busy
   13)  # of times in this domain pull_task() was called when busy
   14)  # of times in this domain pull_task() was called even though the
        target task was cache-hot when busy
   15)  # of times in this domain load_balance() was called but did not
   15)  # of times in this domain sched_balance_rq() was called but did not
        find a busier queue while the cpu was busy
   16)  # of times in this domain a busier queue was found while the cpu
        was busy but no busier group was found

   17)  # of times in this domain load_balance() was called when the
   17)  # of times in this domain sched_balance_rq() was called when the
        cpu was just becoming idle
   18)  # of times in this domain load_balance() checked but found the
   18)  # of times in this domain sched_balance_rq() checked but found the
        load did not require balancing when the cpu was just becoming idle
   19)  # of times in this domain load_balance() tried to move one or more
   19)  # of times in this domain sched_balance_rq() tried to move one or more
        tasks and failed, when the cpu was just becoming idle
   20)  sum of imbalances discovered (if any) with each call to
        load_balance() in this domain when the cpu was just becoming idle
        sched_balance_rq() in this domain when the cpu was just becoming idle
   21)  # of times in this domain pull_task() was called when newly idle
   22)  # of times in this domain pull_task() was called even though the
        target task was cache-hot when just becoming idle
   23)  # of times in this domain load_balance() was called but did not
   23)  # of times in this domain sched_balance_rq() was called but did not
        find a busier queue while the cpu was just becoming idle
   24)  # of times in this domain a busier queue was found while the cpu
        was just becoming idle but no busier group was found
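Since the 24 per-domain columns above are just whitespace-separated counters on the "domain<N>" lines of /proc/schedstat, a small reader makes the layout concrete. This is only an illustration, not part of the patch: it prints the sched_balance_rq() call counts for the three idleness types (columns 1, 9 and 17) and assumes the field order documented above.

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            FILE *f = fopen("/proc/schedstat", "r");
            char line[1024];

            if (!f) {
                    perror("fopen /proc/schedstat");
                    return 1;
            }

            while (fgets(line, sizeof(line), f)) {
                    char name[32], mask[64];
                    unsigned long long col[24] = { 0 };

                    /* domain lines look like: "domain<N> <cpumask> c1 c2 ..." */
                    if (sscanf(line, "%31s %63s"
                               " %llu %llu %llu %llu %llu %llu %llu %llu"
                               " %llu %llu %llu %llu %llu %llu %llu %llu"
                               " %llu %llu %llu %llu %llu %llu %llu %llu",
                               name, mask,
                               &col[0], &col[1], &col[2], &col[3], &col[4], &col[5], &col[6], &col[7],
                               &col[8], &col[9], &col[10], &col[11], &col[12], &col[13], &col[14], &col[15],
                               &col[16], &col[17], &col[18], &col[19], &col[20], &col[21], &col[22], &col[23]) < 26)
                            continue;

                    if (strncmp(name, "domain", 6))
                            continue;

                    printf("%s %s: balance calls idle=%llu busy=%llu newidle=%llu\n",
                           name, mask, col[0], col[8], col[16]);
            }

            fclose(f);
            return 0;
    }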
@@ -34,17 +34,17 @@ CPU共享。任意两个组的CPU掩码的交集不一定为空,如果是这
调度域中的负载均衡发生在调度组中。也就是说,每个组被视为一个实体。组的负载被定义为它
管辖的每个CPU的负载之和。仅当组的负载不均衡后,任务才在组之间发生迁移。

在kernel/sched/core.c中,trigger_load_balance()在每个CPU上通过scheduler_tick()
在kernel/sched/core.c中,sched_balance_trigger()在每个CPU上通过sched_tick()
周期执行。在当前运行队列下一个定期调度再平衡事件到达后,它引发一个软中断。负载均衡真正
的工作由run_rebalance_domains()->rebalance_domains()完成,在软中断上下文中执行
的工作由sched_balance_softirq()->sched_balance_domains()完成,在软中断上下文中执行
(SCHED_SOFTIRQ)。

后一个函数有两个入参:当前CPU的运行队列、它在scheduler_tick()调用时是否空闲。函数会从
后一个函数有两个入参:当前CPU的运行队列、它在sched_tick()调用时是否空闲。函数会从
当前CPU所在的基调度域开始迭代执行,并沿着parent指针链向上进入更高层级的调度域。在迭代
过程中,函数会检查当前调度域是否已经耗尽了再平衡的时间间隔,如果是,它在该调度域运行
load_balance()。接下来它检查父调度域(如果存在),再后来父调度域的父调度域,以此类推。
sched_balance_rq()。接下来它检查父调度域(如果存在),再后来父调度域的父调度域,以此类推。

起初,load_balance()查找当前调度域中最繁忙的调度组。如果成功,在该调度组管辖的全部CPU
起初,sched_balance_rq()查找当前调度域中最繁忙的调度组。如果成功,在该调度组管辖的全部CPU
的运行队列中找出最繁忙的运行队列。如能找到,对当前的CPU运行队列和新找到的最繁忙运行
队列均加锁,并把任务从最繁忙队列中迁移到当前CPU上。被迁移的任务数量等于在先前迭代执行
中计算出的该调度域的调度组的不均衡值。
@@ -75,42 +75,42 @@ domain<N> <cpumask> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
繁忙,新空闲):

    1)  当CPU空闲时,load_balance()在这个调度域中被调用了#次
    2)  当CPU空闲时,load_balance()在这个调度域中被调用,但是发现负载无需
    1)  当CPU空闲时,sched_balance_rq()在这个调度域中被调用了#次
    2)  当CPU空闲时,sched_balance_rq()在这个调度域中被调用,但是发现负载无需
        均衡#次
    3)  当CPU空闲时,load_balance()在这个调度域中被调用,试图迁移1个或更多
    3)  当CPU空闲时,sched_balance_rq()在这个调度域中被调用,试图迁移1个或更多
        任务且失败了#次
    4)  当CPU空闲时,load_balance()在这个调度域中被调用,发现不均衡(如果有)
    4)  当CPU空闲时,sched_balance_rq()在这个调度域中被调用,发现不均衡(如果有)
        #次
    5)  当CPU空闲时,pull_task()在这个调度域中被调用#次
    6)  当CPU空闲时,尽管目标任务是热缓存状态,pull_task()依然被调用#次
    7)  当CPU空闲时,load_balance()在这个调度域中被调用,未能找到更繁忙的
    7)  当CPU空闲时,sched_balance_rq()在这个调度域中被调用,未能找到更繁忙的
        队列#次
    8)  当CPU空闲时,在调度域中找到了更繁忙的队列,但未找到更繁忙的调度组
        #次
    9)  当CPU繁忙时,load_balance()在这个调度域中被调用了#次
   10)  当CPU繁忙时,load_balance()在这个调度域中被调用,但是发现负载无需
    9)  当CPU繁忙时,sched_balance_rq()在这个调度域中被调用了#次
   10)  当CPU繁忙时,sched_balance_rq()在这个调度域中被调用,但是发现负载无需
        均衡#次
   11)  当CPU繁忙时,load_balance()在这个调度域中被调用,试图迁移1个或更多
   11)  当CPU繁忙时,sched_balance_rq()在这个调度域中被调用,试图迁移1个或更多
        任务且失败了#次
   12)  当CPU繁忙时,load_balance()在这个调度域中被调用,发现不均衡(如果有)
   12)  当CPU繁忙时,sched_balance_rq()在这个调度域中被调用,发现不均衡(如果有)
        #次
   13)  当CPU繁忙时,pull_task()在这个调度域中被调用#次
   14)  当CPU繁忙时,尽管目标任务是热缓存状态,pull_task()依然被调用#次
   15)  当CPU繁忙时,load_balance()在这个调度域中被调用,未能找到更繁忙的
   15)  当CPU繁忙时,sched_balance_rq()在这个调度域中被调用,未能找到更繁忙的
        队列#次
   16)  当CPU繁忙时,在调度域中找到了更繁忙的队列,但未找到更繁忙的调度组
        #次
   17)  当CPU新空闲时,load_balance()在这个调度域中被调用了#次
   18)  当CPU新空闲时,load_balance()在这个调度域中被调用,但是发现负载无需
   17)  当CPU新空闲时,sched_balance_rq()在这个调度域中被调用了#次
   18)  当CPU新空闲时,sched_balance_rq()在这个调度域中被调用,但是发现负载无需
        均衡#次
   19)  当CPU新空闲时,load_balance()在这个调度域中被调用,试图迁移1个或更多
   19)  当CPU新空闲时,sched_balance_rq()在这个调度域中被调用,试图迁移1个或更多
        任务且失败了#次
   20)  当CPU新空闲时,load_balance()在这个调度域中被调用,发现不均衡(如果有)
   20)  当CPU新空闲时,sched_balance_rq()在这个调度域中被调用,发现不均衡(如果有)
        #次
   21)  当CPU新空闲时,pull_task()在这个调度域中被调用#次
   22)  当CPU新空闲时,尽管目标任务是热缓存状态,pull_task()依然被调用#次
   23)  当CPU新空闲时,load_balance()在这个调度域中被调用,未能找到更繁忙的
   23)  当CPU新空闲时,sched_balance_rq()在这个调度域中被调用,未能找到更繁忙的
        队列#次
   24)  当CPU新空闲时,在调度域中找到了更繁忙的队列,但未找到更繁忙的调度组
        #次
@@ -22,9 +22,9 @@
/* Enable topology flag updates */
#define arch_update_cpu_topology topology_update_cpu_topology

/* Replace task scheduler's default thermal pressure API */
#define arch_scale_thermal_pressure topology_get_thermal_pressure
#define arch_update_thermal_pressure topology_update_thermal_pressure
/* Replace task scheduler's default HW pressure API */
#define arch_scale_hw_pressure topology_get_hw_pressure
#define arch_update_hw_pressure topology_update_hw_pressure

#else

@@ -42,7 +42,7 @@
 * can take this difference into account during load balance. A per cpu
 * structure is preferred because each CPU updates its own cpu_capacity field
 * during the load balance except for idle cores. One idle core is selected
 * to run the rebalance_domains for all idle cores and the cpu_capacity can be
 * to run the sched_balance_domains for all idle cores and the cpu_capacity can be
 * updated during this sequence.
 */

@@ -35,9 +35,9 @@ void update_freq_counters_refs(void);
/* Enable topology flag updates */
#define arch_update_cpu_topology topology_update_cpu_topology

/* Replace task scheduler's default thermal pressure API */
#define arch_scale_thermal_pressure topology_get_thermal_pressure
#define arch_update_thermal_pressure topology_update_thermal_pressure
/* Replace task scheduler's default HW pressure API */
#define arch_scale_hw_pressure topology_get_hw_pressure
#define arch_update_hw_pressure topology_update_hw_pressure

#include <asm-generic/topology.h>

@@ -6,5 +6,4 @@ generic-y += agp.h
generic-y += kvm_types.h
generic-y += mcs_spinlock.h
generic-y += qrwlock.h
generic-y += vtime.h
generic-y += early_ioremap.h

@@ -32,23 +32,10 @@
#ifdef CONFIG_PPC64
#define get_accounting(tsk) (&get_paca()->accounting)
#define raw_get_accounting(tsk) (&local_paca->accounting)
static inline void arch_vtime_task_switch(struct task_struct *tsk) { }

#else
#define get_accounting(tsk) (&task_thread_info(tsk)->accounting)
#define raw_get_accounting(tsk) get_accounting(tsk)
/*
 * Called from the context switch with interrupts disabled, to charge all
 * accumulated times to the current process, and to prepare accounting on
 * the next process.
 */
static inline void arch_vtime_task_switch(struct task_struct *prev)
{
	struct cpu_accounting_data *acct = get_accounting(current);
	struct cpu_accounting_data *acct0 = get_accounting(prev);

	acct->starttime = acct0->starttime;
}
#endif

/*

@@ -354,6 +354,28 @@ void vtime_flush(struct task_struct *tsk)
	acct->hardirq_time = 0;
	acct->softirq_time = 0;
}

/*
 * Called from the context switch with interrupts disabled, to charge all
 * accumulated times to the current process, and to prepare accounting on
 * the next process.
 */
void vtime_task_switch(struct task_struct *prev)
{
	if (is_idle_task(prev))
		vtime_account_idle(prev);
	else
		vtime_account_kernel(prev);

	vtime_flush(prev);

	if (!IS_ENABLED(CONFIG_PPC64)) {
		struct cpu_accounting_data *acct = get_accounting(current);
		struct cpu_accounting_data *acct0 = get_accounting(prev);

		acct->starttime = acct0->starttime;
	}
}
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */

void __no_kcsan __delay(unsigned long loops)

@@ -2,8 +2,6 @@
#ifndef _S390_VTIME_H
#define _S390_VTIME_H

#define __ARCH_HAS_VTIME_TASK_SWITCH

static inline void update_timer_sys(void)
{
	S390_lowcore.system_timer += S390_lowcore.last_update_timer - S390_lowcore.exit_timer;

@@ -29,6 +29,7 @@
#include <asm/hw_irq.h>
#include <asm/stacktrace.h>
#include <asm/softirq_stack.h>
#include <asm/vtime.h>
#include "entry.h"

DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_stat, irq_stat);

@@ -31,6 +31,7 @@
#include <asm/crw.h>
#include <asm/asm-offsets.h>
#include <asm/pai.h>
#include <asm/vtime.h>

struct mcck_struct {
	unsigned int kill_task : 1;

@@ -22,7 +22,7 @@
#include <linux/units.h>

#define CREATE_TRACE_POINTS
#include <trace/events/thermal_pressure.h>
#include <trace/events/hw_pressure.h>

static DEFINE_PER_CPU(struct scale_freq_data __rcu *, sft_data);
static struct cpumask scale_freq_counters_mask;
@@ -160,26 +160,26 @@ void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity)
	per_cpu(cpu_scale, cpu) = capacity;
}

DEFINE_PER_CPU(unsigned long, thermal_pressure);
DEFINE_PER_CPU(unsigned long, hw_pressure);

/**
 * topology_update_thermal_pressure() - Update thermal pressure for CPUs
 * topology_update_hw_pressure() - Update HW pressure for CPUs
 * @cpus        : The related CPUs for which capacity has been reduced
 * @capped_freq : The maximum allowed frequency that CPUs can run at
 *
 * Update the value of thermal pressure for all @cpus in the mask. The
 * Update the value of HW pressure for all @cpus in the mask. The
 * cpumask should include all (online+offline) affected CPUs, to avoid
 * operating on stale data when hot-plug is used for some CPUs. The
 * @capped_freq reflects the currently allowed max CPUs frequency due to
 * thermal capping. It might be also a boost frequency value, which is bigger
 * HW capping. It might be also a boost frequency value, which is bigger
 * than the internal 'capacity_freq_ref' max frequency. In such case the
 * pressure value should simply be removed, since this is an indication that
 * there is no thermal throttling. The @capped_freq must be provided in kHz.
 * there is no HW throttling. The @capped_freq must be provided in kHz.
 */
void topology_update_thermal_pressure(const struct cpumask *cpus,
void topology_update_hw_pressure(const struct cpumask *cpus,
			unsigned long capped_freq)
{
	unsigned long max_capacity, capacity, th_pressure;
	unsigned long max_capacity, capacity, hw_pressure;
	u32 max_freq;
	int cpu;

@@ -189,21 +189,21 @@ void topology_update_thermal_pressure(const struct cpumask *cpus,
	/*
	 * Handle properly the boost frequencies, which should simply clean
	 * the thermal pressure value.
	 * the HW pressure value.
	 */
	if (max_freq <= capped_freq)
		capacity = max_capacity;
	else
		capacity = mult_frac(max_capacity, capped_freq, max_freq);

	th_pressure = max_capacity - capacity;
	hw_pressure = max_capacity - capacity;

	trace_thermal_pressure_update(cpu, th_pressure);
	trace_hw_pressure_update(cpu, hw_pressure);

	for_each_cpu(cpu, cpus)
		WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure);
		WRITE_ONCE(per_cpu(hw_pressure, cpu), hw_pressure);
}
EXPORT_SYMBOL_GPL(topology_update_thermal_pressure);
EXPORT_SYMBOL_GPL(topology_update_hw_pressure);

static ssize_t cpu_capacity_show(struct device *dev,
				 struct device_attribute *attr,
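The kerneldoc above reduces to one formula: the pressure is the capacity lost to the cap, i.e. max_capacity - max_capacity * capped_freq / max_freq, and it drops to zero when the cap sits at or above the reference frequency. A standalone sketch of just that arithmetic follows; mult_frac() is reimplemented here for illustration and the numbers are made up.

    #include <stdio.h>

    /* Same idea as the kernel's mult_frac(): x * n / d without early overflow. */
    static unsigned long mult_frac(unsigned long x, unsigned long n, unsigned long d)
    {
            unsigned long q = x / d;
            unsigned long r = x % d;

            return q * n + r * n / d;
    }

    static unsigned long hw_pressure(unsigned long max_capacity,
                                     unsigned long capped_khz,
                                     unsigned long max_khz)
    {
            /* A cap at or above the reference frequency means no pressure. */
            if (max_khz <= capped_khz)
                    return 0;

            return max_capacity - mult_frac(max_capacity, capped_khz, max_khz);
    }

    int main(void)
    {
            /* Example: a 1024-capacity CPU throttled from 3000 MHz to 1800 MHz. */
            printf("pressure = %lu\n", hw_pressure(1024, 1800000, 3000000));
            /* Boost case: capped "above" the reference frequency -> 0. */
            printf("pressure = %lu\n", hw_pressure(1024, 3200000, 3000000));
            return 0;
    }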
@@ -2582,6 +2582,40 @@ int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu)
}
EXPORT_SYMBOL(cpufreq_get_policy);

DEFINE_PER_CPU(unsigned long, cpufreq_pressure);

/**
 * cpufreq_update_pressure() - Update cpufreq pressure for CPUs
 * @policy: cpufreq policy of the CPUs.
 *
 * Update the value of cpufreq pressure for all @cpus in the policy.
 */
static void cpufreq_update_pressure(struct cpufreq_policy *policy)
{
	unsigned long max_capacity, capped_freq, pressure;
	u32 max_freq;
	int cpu;

	cpu = cpumask_first(policy->related_cpus);
	max_freq = arch_scale_freq_ref(cpu);
	capped_freq = policy->max;

	/*
	 * Handle properly the boost frequencies, which should simply clean
	 * the cpufreq pressure value.
	 */
	if (max_freq <= capped_freq) {
		pressure = 0;
	} else {
		max_capacity = arch_scale_cpu_capacity(cpu);
		pressure = max_capacity -
			   mult_frac(max_capacity, capped_freq, max_freq);
	}

	for_each_cpu(cpu, policy->related_cpus)
		WRITE_ONCE(per_cpu(cpufreq_pressure, cpu), pressure);
}

/**
 * cpufreq_set_policy - Modify cpufreq policy parameters.
 * @policy: Policy object to modify.

@@ -2637,6 +2671,8 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
	policy->max = __resolve_freq(policy, policy->max, CPUFREQ_RELATION_H);
	trace_cpu_frequency_limits(policy);

	cpufreq_update_pressure(policy);

	policy->cached_target_freq = UINT_MAX;

	pr_debug("new min and max freqs are %u - %u kHz\n",
@@ -347,8 +347,8 @@ static void qcom_lmh_dcvs_notify(struct qcom_cpufreq_data *data)

	throttled_freq = freq_hz / HZ_PER_KHZ;

	/* Update thermal pressure (the boost frequencies are accepted) */
	arch_update_thermal_pressure(policy->related_cpus, throttled_freq);
	/* Update HW pressure (the boost frequencies are accepted) */
	arch_update_hw_pressure(policy->related_cpus, throttled_freq);

	/*
	 * In the unlikely case policy is unregistered do not enable

@@ -477,7 +477,6 @@ static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long state)
{
	struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata;
	struct cpumask *cpus;
	unsigned int frequency;
	int ret;

@@ -494,8 +493,6 @@ static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev,
	ret = freq_qos_update_request(&cpufreq_cdev->qos_req, frequency);
	if (ret >= 0) {
		cpufreq_cdev->cpufreq_state = state;
		cpus = cpufreq_cdev->policy->related_cpus;
		arch_update_thermal_pressure(cpus, frequency);
		ret = 0;
	}

@@ -1 +0,0 @@
/* no content, but patch(1) dislikes empty files */
@@ -60,14 +60,14 @@ void topology_scale_freq_tick(void);
void topology_set_scale_freq_source(struct scale_freq_data *data, const struct cpumask *cpus);
void topology_clear_scale_freq_source(enum scale_freq_source source, const struct cpumask *cpus);

DECLARE_PER_CPU(unsigned long, thermal_pressure);
DECLARE_PER_CPU(unsigned long, hw_pressure);

static inline unsigned long topology_get_thermal_pressure(int cpu)
static inline unsigned long topology_get_hw_pressure(int cpu)
{
	return per_cpu(thermal_pressure, cpu);
	return per_cpu(hw_pressure, cpu);
}

void topology_update_thermal_pressure(const struct cpumask *cpus,
void topology_update_hw_pressure(const struct cpumask *cpus,
			unsigned long capped_freq);

struct cpu_topology {

@@ -241,6 +241,12 @@ struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy);
void cpufreq_enable_fast_switch(struct cpufreq_policy *policy);
void cpufreq_disable_fast_switch(struct cpufreq_policy *policy);
bool has_target_index(void);

DECLARE_PER_CPU(unsigned long, cpufreq_pressure);
static inline unsigned long cpufreq_get_pressure(int cpu)
{
	return READ_ONCE(per_cpu(cpufreq_pressure, cpu));
}
#else
static inline unsigned int cpufreq_get(unsigned int cpu)
{

@@ -264,6 +270,10 @@ static inline bool cpufreq_supports_freq_invariance(void)
}
static inline void disable_cpufreq(void) { }
static inline void cpufreq_update_limits(unsigned int cpu) { }
static inline unsigned long cpufreq_get_pressure(int cpu)
{
	return 0;
}
#endif

#ifdef CONFIG_CPU_FREQ_STAT
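With both per-CPU signals exported (hw_pressure from the topology code and cpufreq_pressure from cpufreq), a consumer conceptually subtracts them from the CPU's original capacity. How kernel/sched/fair.c actually folds the two together is not visible here (that file's diff was suppressed below), so the following is only a hedged userspace illustration of the idea; clamping at zero is an assumption of the sketch, not a statement about the kernel code.

    #include <stdio.h>

    /* Toy "effective capacity": original capacity minus both pressure signals. */
    static unsigned long effective_capacity(unsigned long orig_capacity,
                                            unsigned long hw_pressure,
                                            unsigned long cpufreq_pressure)
    {
            unsigned long pressure = hw_pressure + cpufreq_pressure;

            return pressure >= orig_capacity ? 0 : orig_capacity - pressure;
    }

    int main(void)
    {
            /* 1024-capacity CPU, thermally throttled and userspace-capped. */
            printf("%lu\n", effective_capacity(1024, 200, 150)); /* -> 674 */
            return 0;
    }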
@@ -301,7 +301,7 @@ enum {
	TASK_COMM_LEN = 16,
};

extern void scheduler_tick(void);
extern void sched_tick(void);

#define MAX_SCHEDULE_TIMEOUT	LONG_MAX

@@ -835,6 +835,7 @@ struct task_struct {
#endif

	unsigned int			policy;
	unsigned long			max_allowed_capacity;
	int				nr_cpus_allowed;
	const cpumask_t			*cpus_ptr;
	cpumask_t			*user_cpus_ptr;

@@ -5,8 +5,8 @@
#include <linux/sched.h>

enum cpu_idle_type {
	__CPU_NOT_IDLE = 0,
	CPU_IDLE,
	CPU_NOT_IDLE,
	CPU_NEWLY_IDLE,
	CPU_MAX_IDLE_TYPES
};

@@ -110,7 +110,7 @@ struct sched_domain {
	unsigned long last_decay_max_lb_cost;

#ifdef CONFIG_SCHEDSTATS
	/* load_balance() stats */
	/* sched_balance_rq() stats */
	unsigned int lb_count[CPU_MAX_IDLE_TYPES];
	unsigned int lb_failed[CPU_MAX_IDLE_TYPES];
	unsigned int lb_balanced[CPU_MAX_IDLE_TYPES];
@@ -270,17 +270,17 @@ unsigned long arch_scale_cpu_capacity(int cpu)
}
#endif

#ifndef arch_scale_thermal_pressure
#ifndef arch_scale_hw_pressure
static __always_inline
unsigned long arch_scale_thermal_pressure(int cpu)
unsigned long arch_scale_hw_pressure(int cpu)
{
	return 0;
}
#endif

#ifndef arch_update_thermal_pressure
#ifndef arch_update_hw_pressure
static __always_inline
void arch_update_thermal_pressure(const struct cpumask *cpus,
void arch_update_hw_pressure(const struct cpumask *cpus,
				  unsigned long capped_frequency)
{ }
#endif

@@ -5,10 +5,6 @@
#include <linux/context_tracking_state.h>
#include <linux/sched.h>

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
#include <asm/vtime.h>
#endif

/*
 * Common vtime APIs
 */

@@ -18,7 +14,6 @@ extern void vtime_account_idle(struct task_struct *tsk);
#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
extern void arch_vtime_task_switch(struct task_struct *tsk);
extern void vtime_user_enter(struct task_struct *tsk);
extern void vtime_user_exit(struct task_struct *tsk);
extern void vtime_guest_enter(struct task_struct *tsk);
@@ -1,27 +1,27 @@
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM thermal_pressure
#define TRACE_SYSTEM hw_pressure

#if !defined(_TRACE_THERMAL_PRESSURE_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_THERMAL_PRESSURE_H

#include <linux/tracepoint.h>

TRACE_EVENT(thermal_pressure_update,
	TP_PROTO(int cpu, unsigned long thermal_pressure),
	TP_ARGS(cpu, thermal_pressure),
TRACE_EVENT(hw_pressure_update,
	TP_PROTO(int cpu, unsigned long hw_pressure),
	TP_ARGS(cpu, hw_pressure),

	TP_STRUCT__entry(
		__field(unsigned long, thermal_pressure)
		__field(unsigned long, hw_pressure)
		__field(int, cpu)
	),

	TP_fast_assign(
		__entry->thermal_pressure = thermal_pressure;
		__entry->hw_pressure = hw_pressure;
		__entry->cpu = cpu;
	),

	TP_printk("cpu=%d thermal_pressure=%lu", __entry->cpu, __entry->thermal_pressure)
	TP_printk("cpu=%d hw_pressure=%lu", __entry->cpu, __entry->hw_pressure)
);
#endif /* _TRACE_THERMAL_PRESSURE_H */

@@ -787,7 +787,7 @@ DECLARE_TRACE(pelt_dl_tp,
	TP_PROTO(struct rq *rq),
	TP_ARGS(rq));

DECLARE_TRACE(pelt_thermal_tp,
DECLARE_TRACE(pelt_hw_tp,
	TP_PROTO(struct rq *rq),
	TP_ARGS(rq));
@@ -547,24 +547,24 @@ config HAVE_SCHED_AVG_IRQ
	depends on IRQ_TIME_ACCOUNTING || PARAVIRT_TIME_ACCOUNTING
	depends on SMP

config SCHED_THERMAL_PRESSURE
config SCHED_HW_PRESSURE
	bool
	default y if ARM && ARM_CPU_TOPOLOGY
	default y if ARM64
	depends on SMP
	depends on CPU_FREQ_THERMAL
	help
	  Select this option to enable thermal pressure accounting in the
	  scheduler. Thermal pressure is the value conveyed to the scheduler
	  Select this option to enable HW pressure accounting in the
	  scheduler. HW pressure is the value conveyed to the scheduler
	  that reflects the reduction in CPU compute capacity resulted from
	  thermal throttling. Thermal throttling occurs when the performance of
	  a CPU is capped due to high operating temperatures.
	  HW throttling. HW throttling occurs when the performance of
	  a CPU is capped due to high operating temperatures as an example.

	  If selected, the scheduler will be able to balance tasks accordingly,
	  i.e. put less load on throttled CPUs than on non/less throttled ones.

	  This requires the architecture to implement
	  arch_update_thermal_pressure() and arch_scale_thermal_pressure().
	  arch_update_hw_pressure() and arch_scale_thermal_pressure().

config BSD_PROCESS_ACCT
	bool "BSD Process Accounting"

@@ -77,6 +77,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
	.cpus_ptr	= &init_task.cpus_mask,
	.user_cpus_ptr	= NULL,
	.cpus_mask	= CPU_MASK_ALL,
	.max_allowed_capacity	= SCHED_CAPACITY_SCALE,
	.nr_cpus_allowed= NR_CPUS,
	.mm		= NULL,
	.active_mm	= &init_mm,
@@ -108,7 +108,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_hw_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);

@@ -5662,13 +5662,13 @@ static inline u64 cpu_resched_latency(struct rq *rq) { return 0; }
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 */
void scheduler_tick(void)
void sched_tick(void)
{
	int cpu = smp_processor_id();
	struct rq *rq = cpu_rq(cpu);
	struct task_struct *curr = rq->curr;
	struct rq_flags rf;
	unsigned long thermal_pressure;
	unsigned long hw_pressure;
	u64 resched_latency;

	if (housekeeping_cpu(cpu, HK_TYPE_TICK))

@@ -5679,8 +5679,8 @@ void scheduler_tick(void)
	rq_lock(rq, &rf);

	update_rq_clock(rq);
	thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
	update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
	hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
	update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
	curr->sched_class->task_tick(rq, curr, 0);
	if (sched_feat(LATENCY_WARN))
		resched_latency = cpu_resched_latency(rq);

@@ -5700,7 +5700,7 @@ void scheduler_tick(void)

#ifdef CONFIG_SMP
	rq->idle_balance = idle_cpu(cpu);
	trigger_load_balance(rq);
	sched_balance_trigger(rq);
#endif
}

@@ -6585,7 +6585,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 *      paths. For example, see arch/x86/entry_64.S.
 *
 *      To drive preemption between tasks, the scheduler sets the flag in timer
 *      interrupt handler scheduler_tick().
 *      interrupt handler sched_tick().
 *
 *   3. Wakeups don't really cause entry into schedule(). They add a
 *      task to the run-queue and that's it.
@@ -424,19 +424,6 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_
 */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE

# ifndef __ARCH_HAS_VTIME_TASK_SWITCH
void vtime_task_switch(struct task_struct *prev)
{
	if (is_idle_task(prev))
		vtime_account_idle(prev);
	else
		vtime_account_kernel(prev);

	vtime_flush(prev);
	arch_vtime_task_switch(prev);
}
# endif

void vtime_account_irq(struct task_struct *tsk, unsigned int offset)
{
	unsigned int pc = irq_count() - offset;
(The diff of one further file was suppressed by the viewer because it is too large.)
@@ -379,7 +379,7 @@ void calc_global_load(void)
}

/*
 * Called from scheduler_tick() to periodically update this CPU's
 * Called from sched_tick() to periodically update this CPU's
 * active count.
 */
void calc_global_load_tick(struct rq *this_rq)

@@ -208,8 +208,8 @@ ___update_load_sum(u64 now, struct sched_avg *sa,
	 * se has been already dequeued but cfs_rq->curr still points to it.
	 * This means that weight will be 0 but not running for a sched_entity
	 * but also for a cfs_rq if the latter becomes idle. As an example,
	 * this happens during idle_balance() which calls
	 * update_blocked_averages().
	 * this happens during sched_balance_newidle() which calls
	 * sched_balance_update_blocked_averages().
	 *
	 * Also see the comment in accumulate_sum().
	 */

@@ -384,30 +384,30 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
	return 0;
}

#ifdef CONFIG_SCHED_THERMAL_PRESSURE
#ifdef CONFIG_SCHED_HW_PRESSURE
/*
 * thermal:
 * hardware:
 *
 *   load_sum = \Sum se->avg.load_sum but se->avg.load_sum is not tracked
 *
 *   util_avg and runnable_load_avg are not supported and meaningless.
 *
 * Unlike rt/dl utilization tracking that track time spent by a cpu
 * running a rt/dl task through util_avg, the average thermal pressure is
 * tracked through load_avg. This is because thermal pressure signal is
 * running a rt/dl task through util_avg, the average HW pressure is
 * tracked through load_avg. This is because HW pressure signal is
 * time weighted "delta" capacity unlike util_avg which is binary.
 * "delta capacity" =  actual capacity  -
 *			capped capacity a cpu due to a thermal event.
 *			capped capacity a cpu due to a HW event.
 */

int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
{
	if (___update_load_sum(now, &rq->avg_thermal,
	if (___update_load_sum(now, &rq->avg_hw,
			       capacity,
			       capacity,
			       capacity)) {
		___update_load_avg(&rq->avg_thermal, 1);
		trace_pelt_thermal_tp(rq);
		___update_load_avg(&rq->avg_hw, 1);
		trace_pelt_hw_tp(rq);
		return 1;
	}
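The comment block above says HW pressure is not a binary "running or not" signal like rt/dl utilization, but a time-weighted average of "delta capacity" (capacity lost to the cap) tracked through load_avg. A compact userspace sketch of that idea follows, using a simple fixed-point exponentially decaying average per period instead of the kernel's real PELT segment math; the 30/32 decay factor is an illustrative assumption.

    #include <stdio.h>

    #define DECAY_NUM 30 /* keep ~30/32 of the old signal each period */
    #define DECAY_DEN 32

    /* Fold one period's "delta capacity" sample into a decaying average. */
    static unsigned long decay_avg(unsigned long avg, unsigned long sample)
    {
            return (avg * DECAY_NUM + sample * (DECAY_DEN - DECAY_NUM)) / DECAY_DEN;
    }

    int main(void)
    {
            unsigned long avg = 0;

            /* 10 periods throttled by 400 capacity units, then the cap lifts. */
            for (int i = 0; i < 10; i++)
                    avg = decay_avg(avg, 400);
            for (int i = 0; i < 10; i++)
                    avg = decay_avg(avg, 0);

            printf("decayed hw pressure estimate: %lu\n", avg);
            return 0;
    }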
@@ -7,21 +7,21 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq);
int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);

#ifdef CONFIG_SCHED_THERMAL_PRESSURE
int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity);
#ifdef CONFIG_SCHED_HW_PRESSURE
int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity);

static inline u64 thermal_load_avg(struct rq *rq)
static inline u64 hw_load_avg(struct rq *rq)
{
	return READ_ONCE(rq->avg_thermal.load_avg);
	return READ_ONCE(rq->avg_hw.load_avg);
}
#else
static inline int
update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
{
	return 0;
}

static inline u64 thermal_load_avg(struct rq *rq)
static inline u64 hw_load_avg(struct rq *rq)
{
	return 0;
}

@@ -202,12 +202,12 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
}

static inline int
update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
{
	return 0;
}

static inline u64 thermal_load_avg(struct rq *rq)
static inline u64 hw_load_avg(struct rq *rq)
{
	return 0;
}
@@ -111,6 +111,20 @@ extern int sysctl_sched_rt_period;
extern int sysctl_sched_rt_runtime;
extern int sched_rr_timeslice;

/*
 * Asymmetric CPU capacity bits
 */
struct asym_cap_data {
	struct list_head link;
	struct rcu_head rcu;
	unsigned long capacity;
	unsigned long cpus[];
};

extern struct list_head asym_cap_list;

#define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus)

/*
 * Helpers for converting nanosecond timing to jiffy resolution
 */

@@ -701,7 +715,7 @@ struct rt_rq {
	} highest_prio;
#endif
#ifdef CONFIG_SMP
	int			overloaded;
	bool			overloaded;
	struct plist_head	pushable_tasks;

#endif /* CONFIG_SMP */

@@ -745,7 +759,7 @@ struct dl_rq {
		u64		next;
	} earliest_dl;

	int			overloaded;
	bool			overloaded;

	/*
	 * Tasks on this rq that can be pushed away. They are kept in

@@ -838,10 +852,6 @@ struct perf_domain {
	struct rcu_head rcu;
};

/* Scheduling group status flags */
#define SG_OVERLOAD		0x1 /* More than one runnable task on a CPU. */
#define SG_OVERUTILIZED		0x2 /* One or more CPUs are over-utilized. */

/*
 * We add the notion of a root-domain which will be used to define per-domain
 * variables. Each exclusive cpuset essentially defines an island domain by

@@ -862,10 +872,10 @@ struct root_domain {
	 * - More than one runnable task
	 * - Running task is misfit
	 */
	int			overload;
	bool			overloaded;

	/* Indicate one or more cpus over-utilized (tipping point) */
	int			overutilized;
	bool			overutilized;

	/*
	 * The bit corresponding to a CPU gets set here if such CPU has more

@@ -905,8 +915,6 @@ struct root_domain {
	cpumask_var_t		rto_mask;
	struct cpupri		cpupri;

	unsigned long		max_cpu_capacity;

	/*
	 * NULL-terminated list of performance domains intersecting with the
	 * CPUs of the rd. Protected by RCU.

@@ -920,6 +928,17 @@ extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
extern void sched_get_rd(struct root_domain *rd);
extern void sched_put_rd(struct root_domain *rd);

static inline int get_rd_overloaded(struct root_domain *rd)
{
	return READ_ONCE(rd->overloaded);
}

static inline void set_rd_overloaded(struct root_domain *rd, int status)
{
	if (get_rd_overloaded(rd) != status)
		WRITE_ONCE(rd->overloaded, status);
}

#ifdef HAVE_RT_PUSH_IPI
extern void rto_push_irq_work_func(struct irq_work *work);
#endif

@@ -1091,8 +1110,8 @@ struct rq {
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
	struct sched_avg	avg_irq;
#endif
#ifdef CONFIG_SCHED_THERMAL_PRESSURE
	struct sched_avg	avg_thermal;
#ifdef CONFIG_SCHED_HW_PRESSURE
	struct sched_avg	avg_hw;
#endif
	u64			idle_stamp;
	u64			avg_idle;

@@ -1533,24 +1552,6 @@ static inline u64 rq_clock_task(struct rq *rq)
	return rq->clock_task;
}

/**
 * By default the decay is the default pelt decay period.
 * The decay shift can change the decay period in
 * multiples of 32.
 *  Decay shift		Decay period(ms)
 *	0			32
 *	1			64
 *	2			128
 *	3			256
 *	4			512
 */
extern int sched_thermal_decay_shift;

static inline u64 rq_clock_thermal(struct rq *rq)
{
	return rq_clock_task(rq) >> sched_thermal_decay_shift;
}

static inline void rq_clock_skip_update(struct rq *rq)
{
	lockdep_assert_rq_held(rq);

@@ -2399,7 +2400,7 @@ extern struct task_struct *pick_next_task_idle(struct rq *rq);

extern void update_group_capacity(struct sched_domain *sd, int cpu);

extern void trigger_load_balance(struct rq *rq);
extern void sched_balance_trigger(struct rq *rq);

extern void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx);

@@ -2519,10 +2520,8 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
}

#ifdef CONFIG_SMP
	if (prev_nr < 2 && rq->nr_running >= 2) {
		if (!READ_ONCE(rq->rd->overload))
			WRITE_ONCE(rq->rd->overload, 1);
	}
	if (prev_nr < 2 && rq->nr_running >= 2)
		set_rd_overloaded(rq->rd, 1);
#endif

	sched_update_tick_dependency(rq);

@@ -2906,7 +2905,7 @@ extern void cfs_bandwidth_usage_dec(void);
#define NOHZ_NEWILB_KICK_BIT	2
#define NOHZ_NEXT_KICK_BIT	3

/* Run rebalance_domains() */
/* Run sched_balance_domains() */
#define NOHZ_BALANCE_KICK	BIT(NOHZ_BALANCE_KICK_BIT)
/* Update blocked load */
#define NOHZ_STATS_KICK		BIT(NOHZ_STATS_KICK_BIT)
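The get_rd_overloaded()/set_rd_overloaded() helpers introduced above read the flag first and only write when the value actually changes, so a hot, shared root_domain field is not dirtied on every add_nr_running() call. The same check-before-store pattern can be shown in plain C; the C11 atomic type stands in for the kernel's READ_ONCE()/WRITE_ONCE(), which is an approximation rather than an equivalent.

    #include <stdatomic.h>
    #include <stdio.h>

    /* Shared flag written by many CPUs: skipping redundant stores keeps the
     * cache line shared instead of bouncing it between writers. */
    static atomic_int overloaded;

    static void set_overloaded(int status)
    {
            if (atomic_load_explicit(&overloaded, memory_order_relaxed) != status)
                    atomic_store_explicit(&overloaded, status, memory_order_relaxed);
    }

    int main(void)
    {
            set_overloaded(1);
            set_overloaded(1); /* second call is a read-only no-op */
            printf("overloaded=%d\n", atomic_load(&overloaded));
            return 0;
    }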
@@ -113,7 +113,7 @@ void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p,
 * Bump this up when changing the output format or the meaning of an existing
 * format, so that tools can adapt (or abort)
 */
#define SCHEDSTAT_VERSION 15
#define SCHEDSTAT_VERSION 16

static int show_schedstat(struct seq_file *seq, void *v)
{

@@ -150,8 +150,7 @@ static int show_schedstat(struct seq_file *seq, void *v)

			seq_printf(seq, "domain%d %*pb", dcount++,
				   cpumask_pr_args(sched_domain_span(sd)));
			for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
					itype++) {
			for (itype = 0; itype < CPU_MAX_IDLE_TYPES; itype++) {
				seq_printf(seq, " %u %u %u %u %u %u %u %u",
				    sd->lb_count[itype],
				    sd->lb_balanced[itype],
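The loop change above (starting at 0 instead of CPU_IDLE) is what forces the SCHEDSTAT_VERSION bump to 16: with __CPU_NOT_IDLE now 0 and CPU_IDLE now 1, iterating over the whole enum emits the per-idleness column groups in a different order than version 15 did. A tiny illustration of how enum ordering alone drives column order; the shortened enum names and group labels are only for the example.

    #include <stdio.h>

    /* Post-v16 style ordering: "not idle" (busy) is now value 0. */
    enum toy_idle_type { NOT_IDLE, IDLE, NEWLY_IDLE, MAX_IDLE_TYPES };

    static const char *name[MAX_IDLE_TYPES] = { "busy", "idle", "newidle" };

    int main(void)
    {
            /* Iterating from 0 prints the busy group first, then idle, then newidle. */
            for (int itype = 0; itype < MAX_IDLE_TYPES; itype++)
                    printf("column group %d: %s\n", itype, name[itype]);
            return 0;
    }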
@@ -1329,24 +1329,13 @@ next:
	update_group_capacity(sd, cpu);
}

/*
 * Asymmetric CPU capacity bits
 */
struct asym_cap_data {
	struct list_head link;
	unsigned long capacity;
	unsigned long cpus[];
};

/*
 * Set of available CPUs grouped by their corresponding capacities
 * Each list entry contains a CPU mask reflecting CPUs that share the same
 * capacity.
 * The lifespan of data is unlimited.
 */
static LIST_HEAD(asym_cap_list);

#define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus)
LIST_HEAD(asym_cap_list);

/*
 * Verify whether there is any CPU capacity asymmetry in a given sched domain.

@@ -1386,21 +1375,39 @@ asym_cpu_capacity_classify(const struct cpumask *sd_span,

}

static void free_asym_cap_entry(struct rcu_head *head)
{
	struct asym_cap_data *entry = container_of(head, struct asym_cap_data, rcu);
	kfree(entry);
}

static inline void asym_cpu_capacity_update_data(int cpu)
{
	unsigned long capacity = arch_scale_cpu_capacity(cpu);
	struct asym_cap_data *entry = NULL;
	struct asym_cap_data *insert_entry = NULL;
	struct asym_cap_data *entry;

	/*
	 * Search if capacity already exits. If not, track which the entry
	 * where we should insert to keep the list ordered descendingly.
	 */
	list_for_each_entry(entry, &asym_cap_list, link) {
		if (capacity == entry->capacity)
			goto done;
		else if (!insert_entry && capacity > entry->capacity)
			insert_entry = list_prev_entry(entry, link);
	}

	entry = kzalloc(sizeof(*entry) + cpumask_size(), GFP_KERNEL);
	if (WARN_ONCE(!entry, "Failed to allocate memory for asymmetry data\n"))
		return;
	entry->capacity = capacity;
	list_add(&entry->link, &asym_cap_list);

	/* If NULL then the new capacity is the smallest, add last. */
	if (!insert_entry)
		list_add_tail_rcu(&entry->link, &asym_cap_list);
	else
		list_add_rcu(&entry->link, &insert_entry->link);
done:
	__cpumask_set_cpu(cpu, cpu_capacity_span(entry));
}

@@ -1423,8 +1430,8 @@ static void asym_cpu_capacity_scan(void)

	list_for_each_entry_safe(entry, next, &asym_cap_list, link) {
		if (cpumask_empty(cpu_capacity_span(entry))) {
			list_del(&entry->link);
			kfree(entry);
			list_del_rcu(&entry->link);
			call_rcu(&entry->rcu, free_asym_cap_entry);
		}
	}

@@ -1434,8 +1441,8 @@ static void asym_cpu_capacity_scan(void)
	 */
	if (list_is_singular(&asym_cap_list)) {
		entry = list_first_entry(&asym_cap_list, typeof(*entry), link);
		list_del(&entry->link);
		kfree(entry);
		list_del_rcu(&entry->link);
		call_rcu(&entry->rcu, free_asym_cap_entry);
	}
}

@@ -2507,16 +2514,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
	/* Attach the domains */
	rcu_read_lock();
	for_each_cpu(i, cpu_map) {
		unsigned long capacity;

		rq = cpu_rq(i);
		sd = *per_cpu_ptr(d.sd, i);

		capacity = arch_scale_cpu_capacity(i);
		/* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
		if (capacity > READ_ONCE(d.rd->max_cpu_capacity))
			WRITE_ONCE(d.rd->max_cpu_capacity, capacity);

		cpu_attach_domain(sd, d.rd, i);

		if (lowest_flag_domain(i, SD_CLUSTER))

@@ -2530,10 +2530,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
	if (has_cluster)
		static_branch_inc_cpuslocked(&sched_cluster_active);

	if (rq && sched_debug_verbose) {
		pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
			cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
	}
	if (rq && sched_debug_verbose)
		pr_info("root domain span: %*pbl\n", cpumask_pr_args(cpu_map));

	ret = 0;
error:
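asym_cpu_capacity_update_data() above now keeps asym_cap_list sorted by descending capacity: it walks the list, reuses an entry with a matching capacity, otherwise remembers where the new entry belongs and inserts it there (with RCU protection in the kernel). The ordering logic alone, without the RCU and cpumask machinery, looks roughly like this userspace sketch:

    #include <stdio.h>
    #include <stdlib.h>

    struct cap_entry {
            unsigned long capacity;
            struct cap_entry *next;
    };

    /* Insert 'capacity' into a list kept sorted in descending order,
     * reusing an existing entry if the capacity is already present. */
    static void insert_capacity(struct cap_entry **head, unsigned long capacity)
    {
            struct cap_entry **pos = head;

            while (*pos && (*pos)->capacity > capacity)
                    pos = &(*pos)->next;

            if (*pos && (*pos)->capacity == capacity)
                    return; /* already tracked */

            struct cap_entry *e = malloc(sizeof(*e));
            if (!e)
                    return;
            e->capacity = capacity;
            e->next = *pos;
            *pos = e;
    }

    int main(void)
    {
            struct cap_entry *head = NULL;

            insert_capacity(&head, 512);
            insert_capacity(&head, 1024);
            insert_capacity(&head, 512);  /* duplicate, ignored */
            insert_capacity(&head, 768);

            for (struct cap_entry *e = head; e; e = e->next)
                    printf("%lu\n", e->capacity); /* 1024 768 512 */
            return 0;
    }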
@@ -2488,7 +2488,7 @@ void update_process_times(int user_tick)
	if (in_irq())
		irq_work_tick();
#endif
	scheduler_tick();
	sched_tick();
	if (IS_ENABLED(CONFIG_POSIX_TIMERS))
		run_posix_cpu_timers();
}

@@ -1468,7 +1468,7 @@ void wq_worker_sleeping(struct task_struct *task)
 * wq_worker_tick - a scheduler tick occurred while a kworker is running
 * @task: task currently running
 *
 * Called from scheduler_tick(). We're in the IRQ context and the current
 * Called from sched_tick(). We're in the IRQ context and the current
 * worker's fields which follow the 'K' locking rule can be accessed safely.
 */
void wq_worker_tick(struct task_struct *task)

@@ -1251,7 +1251,7 @@ config SCHED_INFO

config SCHEDSTATS
	bool "Collect scheduler statistics"
	depends on DEBUG_KERNEL && PROC_FS
	depends on PROC_FS
	select SCHED_INFO
	help
	  If you say Y here, additional code will be inserted into the

@@ -19,7 +19,7 @@ fail() { # mesg

FILTER=set_ftrace_filter
FUNC1="schedule"
FUNC2="scheduler_tick"
FUNC2="sched_tick"

ALL_FUNCS="#### all functions enabled ####"