psi: Fix cpu.pressure for cpu.max and competing cgroups
For simplicity, cpu pressure is defined as having more than one
runnable task on a given CPU. This works at the system level, but it
has limitations in a cgrouped reality: when cpu.max is in use, it
doesn't capture the time in which a task is not executing on the CPU
due to throttling. Likewise, it doesn't capture the time in which a
competing cgroup is occupying the CPU - meaning it only reflects
cgroup-internal competitive pressure, not outside pressure.

Enable tracking of currently executing tasks, and then change the
definition of cpu pressure in a cgroup from

	NR_RUNNING > 1

to

	NR_RUNNING > ON_CPU

which will capture the effects of cpu.max as well as competition from
outside the cgroup.

After this patch, a cgroup running `stress -c 1` with a cpu.max
setting of 5000 10000 shows ~50% continuous CPU pressure.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200316191333.115523-2-hannes@cmpxchg.org
commit b05e75d611
parent 46a87b3851
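To make the definitional change concrete, here is a minimal user-space sketch (not kernel code; the struct and function names are invented for illustration) of the old and new PSI_CPU_SOME predicates. For a cgroup whose single `stress` task is currently throttled by cpu.max, the task is runnable but not executing, so only the new rule registers pressure:

#include <stdbool.h>
#include <stdio.h>

/* Simplified per-cgroup, per-CPU task counts as psi tracks them. */
struct cpu_tasks {
	unsigned int nr_running;	/* runnable tasks in the cgroup           */
	unsigned int nr_oncpu;		/* of those, currently executing (0 or 1) */
};

/* Old rule: only cgroup-internal competition counts as pressure. */
static bool cpu_some_old(const struct cpu_tasks *t)
{
	return t->nr_running > 1;
}

/* New rule: throttling and outside competition count as well. */
static bool cpu_some_new(const struct cpu_tasks *t)
{
	return t->nr_running > t->nr_oncpu;
}

int main(void)
{
	/* `stress -c 1` under cpu.max 5000 10000, during a throttled window:
	 * one task is runnable, nothing from the cgroup is on the CPU. */
	struct cpu_tasks throttled = { .nr_running = 1, .nr_oncpu = 0 };

	printf("old rule: %d, new rule: %d\n",
	       cpu_some_old(&throttled), cpu_some_new(&throttled));
	/* old rule: 0, new rule: 1 -- averaged over the period, this is the
	 * ~50% continuous pressure quoted in the changelog. */
	return 0;
}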
@@ -14,13 +14,21 @@ enum psi_task_count {
 	NR_IOWAIT,
 	NR_MEMSTALL,
 	NR_RUNNING,
-	NR_PSI_TASK_COUNTS = 3,
+	/*
+	 * This can't have values other than 0 or 1 and could be
+	 * implemented as a bit flag. But for now we still have room
+	 * in the first cacheline of psi_group_cpu, and this way we
+	 * don't have to special case any state tracking for it.
+	 */
+	NR_ONCPU,
+	NR_PSI_TASK_COUNTS = 4,
 };
 
 /* Task state bitmasks */
 #define TSK_IOWAIT	(1 << NR_IOWAIT)
 #define TSK_MEMSTALL	(1 << NR_MEMSTALL)
 #define TSK_RUNNING	(1 << NR_RUNNING)
+#define TSK_ONCPU	(1 << NR_ONCPU)
 
 /* Resources that workloads could be stalled on */
 enum psi_res {
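The TSK_* bits above are what gets handed to psi_task_change() as clear/set masks and folded into the per-CPU tasks[] counters. A simplified, self-contained sketch of that per-bit bookkeeping follows (apply_change() is an invented stand-in for psi_group_change(), which performs this walk in the kernel):

#include <stdio.h>

/* Mirrors enum psi_task_count / the TSK_* bits after this patch. */
enum { NR_IOWAIT, NR_MEMSTALL, NR_RUNNING, NR_ONCPU, NR_PSI_TASK_COUNTS };
#define TSK_IOWAIT	(1 << NR_IOWAIT)
#define TSK_MEMSTALL	(1 << NR_MEMSTALL)
#define TSK_RUNNING	(1 << NR_RUNNING)
#define TSK_ONCPU	(1 << NR_ONCPU)

/* Invented stand-in for psi_group_change(): fold a clear/set pair of
 * TSK_* bits into the per-CPU counters, one bit per counter. */
static void apply_change(unsigned int tasks[NR_PSI_TASK_COUNTS],
			 unsigned int clear, unsigned int set)
{
	for (int t = 0; t < NR_PSI_TASK_COUNTS; t++) {
		if (clear & (1 << t))
			tasks[t]--;
		if (set & (1 << t))
			tasks[t]++;
	}
}

int main(void)
{
	unsigned int tasks[NR_PSI_TASK_COUNTS] = { 0 };

	apply_change(tasks, 0, TSK_RUNNING | TSK_ONCPU);	/* task starts executing          */
	apply_change(tasks, TSK_ONCPU, 0);			/* task gets throttled/preempted  */
	printf("running=%u oncpu=%u\n", tasks[NR_RUNNING], tasks[NR_ONCPU]);
	/* running=1 oncpu=0: under the new rule this CPU now reports pressure. */
	return 0;
}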
@@ -4091,6 +4091,8 @@ static void __sched notrace __schedule(bool preempt)
 		 */
 		++*switch_count;
 
+		psi_sched_switch(prev, next, !task_on_rq_queued(prev));
+
 		trace_sched_switch(preempt, prev, next);
 
 		/* Also unlocks the rq: */
@@ -225,7 +225,7 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
 	case PSI_MEM_FULL:
 		return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING];
 	case PSI_CPU_SOME:
-		return tasks[NR_RUNNING] > 1;
+		return tasks[NR_RUNNING] > tasks[NR_ONCPU];
 	case PSI_NONIDLE:
 		return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
 			tasks[NR_RUNNING];
@@ -695,10 +695,10 @@ static u32 psi_group_change(struct psi_group *group, int cpu,
 		if (!(m & (1 << t)))
 			continue;
 		if (groupc->tasks[t] == 0 && !psi_bug) {
-			printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u] clear=%x set=%x\n",
+			printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
 					cpu, t, groupc->tasks[0],
 					groupc->tasks[1], groupc->tasks[2],
-					clear, set);
+					groupc->tasks[3], clear, set);
 			psi_bug = 1;
 		}
 		groupc->tasks[t]--;
@@ -916,9 +916,11 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
 
 	rq = task_rq_lock(task, &rf);
 
-	if (task_on_rq_queued(task))
+	if (task_on_rq_queued(task)) {
 		task_flags = TSK_RUNNING;
-	else if (task->in_iowait)
+		if (task_current(rq, task))
+			task_flags |= TSK_ONCPU;
+	} else if (task->in_iowait)
 		task_flags = TSK_IOWAIT;
 
 	if (task->flags & PF_MEMSTALL)
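One way to see why the TSK_ONCPU bit has to move with a currently-executing task here: test_state() compares per-cgroup counters, so if the destination cgroup were credited with a runnable task but not with an on-CPU task, it would report CPU pressure against its own running task. A toy illustration of that effect, assuming the new predicate (user-space code, names invented):

#include <stdbool.h>
#include <stdio.h>

enum { NR_IOWAIT, NR_MEMSTALL, NR_RUNNING, NR_ONCPU, NR_PSI_TASK_COUNTS };

/* Per-cgroup, per-CPU task counts, as the new PSI_CPU_SOME test sees them. */
static bool cpu_some(const unsigned int tasks[NR_PSI_TASK_COUNTS])
{
	return tasks[NR_RUNNING] > tasks[NR_ONCPU];
}

int main(void)
{
	unsigned int dst_wrong[NR_PSI_TASK_COUNTS] = { 0 };
	unsigned int dst_right[NR_PSI_TASK_COUNTS] = { 0 };

	/* Move a task that is both queued and currently executing. */
	dst_wrong[NR_RUNNING]++;		/* TSK_RUNNING only     */
	dst_right[NR_RUNNING]++;		/* TSK_RUNNING ...      */
	dst_right[NR_ONCPU]++;			/* ... plus TSK_ONCPU   */

	printf("without ONCPU: pressure=%d, with ONCPU: pressure=%d\n",
	       cpu_some(dst_wrong), cpu_some(dst_right));
	/* without ONCPU: pressure=1 (spurious), with ONCPU: pressure=0 */
	return 0;
}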
@@ -93,6 +93,14 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)
 		if (p->flags & PF_MEMSTALL)
 			clear |= TSK_MEMSTALL;
 	} else {
+		/*
+		 * When a task sleeps, schedule() dequeues it before
+		 * switching to the next one. Merge the clearing of
+		 * TSK_RUNNING and TSK_ONCPU to save an unnecessary
+		 * psi_task_change() call in psi_sched_switch().
+		 */
+		clear |= TSK_ONCPU;
+
 		if (p->in_iowait)
 			set |= TSK_IOWAIT;
 	}
@@ -126,6 +134,23 @@ static inline void psi_ttwu_dequeue(struct task_struct *p)
 	}
 }
 
+static inline void psi_sched_switch(struct task_struct *prev,
+				    struct task_struct *next,
+				    bool sleep)
+{
+	if (static_branch_likely(&psi_disabled))
+		return;
+
+	/*
+	 * Clear the TSK_ONCPU state if the task was preempted. If
+	 * it's a voluntary sleep, dequeue will have taken care of it.
+	 */
+	if (!sleep)
+		psi_task_change(prev, TSK_ONCPU, 0);
+
+	psi_task_change(next, 0, TSK_ONCPU);
+}
+
 static inline void psi_task_tick(struct rq *rq)
 {
 	if (static_branch_likely(&psi_disabled))
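Taken together with the psi_dequeue() change above, a context switch updates PSI state along two paths: a voluntarily sleeping task has TSK_RUNNING and TSK_ONCPU cleared at dequeue time, while a preempted task only loses TSK_ONCPU in psi_sched_switch(); the incoming task gains TSK_ONCPU either way. A rough user-space sketch of those per-task flag transitions (the bit positions mirror the enum after this patch; change() is an invented helper):

#include <stdio.h>

#define TSK_RUNNING	(1 << 2)	/* mirrors NR_RUNNING */
#define TSK_ONCPU	(1 << 3)	/* mirrors NR_ONCPU   */

/* Invented helper: apply a clear/set pair to one task's PSI flags. */
static void change(unsigned int *flags, unsigned int clear, unsigned int set,
		   const char *what)
{
	*flags = (*flags & ~clear) | set;
	printf("%-26s flags=%#x\n", what, *flags);
}

int main(void)
{
	unsigned int sleeper = TSK_RUNNING | TSK_ONCPU;	/* executing, about to block */
	unsigned int victim  = TSK_RUNNING | TSK_ONCPU;	/* executing, gets preempted */
	unsigned int next    = TSK_RUNNING;		/* runnable, about to run    */

	/* Voluntary sleep: psi_dequeue() merges TSK_ONCPU into the clear mask,
	 * so psi_sched_switch(..., sleep=true) does not touch prev again. */
	change(&sleeper, TSK_RUNNING | TSK_ONCPU, 0, "prev sleeps (dequeue)");

	/* Preemption: prev stays runnable, psi_sched_switch(..., sleep=false)
	 * clears only TSK_ONCPU. */
	change(&victim, TSK_ONCPU, 0, "prev preempted (switch)");

	/* In both cases the incoming task becomes the one on the CPU. */
	change(&next, 0, TSK_ONCPU, "next starts (switch)");
	return 0;
}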
@@ -138,6 +163,9 @@ static inline void psi_task_tick(struct rq *rq)
 static inline void psi_enqueue(struct task_struct *p, bool wakeup) {}
 static inline void psi_dequeue(struct task_struct *p, bool sleep) {}
 static inline void psi_ttwu_dequeue(struct task_struct *p) {}
+static inline void psi_sched_switch(struct task_struct *prev,
+				    struct task_struct *next,
+				    bool sleep) {}
 static inline void psi_task_tick(struct rq *rq) {}
 #endif /* CONFIG_PSI */
 