sched/psi: Per-cgroup PSI accounting disable/re-enable interface

PSI accounts stalls for each cgroup separately and aggregates it
at each level of the hierarchy. This may cause non-negligible overhead
for some workloads when under deep level of the hierarchy.

commit 3958e2d0c3 ("cgroup: make per-cgroup pressure stall tracking configurable")
make PSI to skip per-cgroup stall accounting, only account system-wide
to avoid this each level overhead.

But for our use case, we also want leaf cgroup PSI stats accounted for
userspace adjustment on that cgroup, apart from only system-wide adjustment.

So this patch introduce a per-cgroup PSI accounting disable/re-enable
interface "cgroup.pressure", which is a read-write single value file that
allowed values are "0" and "1", the defaults is "1" so per-cgroup
PSI stats is enabled by default.

Implementation details:

It should be relatively straight-forward to disable and re-enable
state aggregation, time tracking, averaging on a per-cgroup level,
if we can live with losing history from while it was disabled.
I.e. the avgs will restart from 0, total= will have gaps.

But it's hard or complex to stop/restart groupc->tasks[] updates,
which is not implemented in this patch. So we always update
groupc->tasks[] and PSI_ONCPU bit in psi_group_change() even when
the cgroup PSI stats is disabled.

Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Link: https://lkml.kernel.org/r/20220907090332.2078-1-zhouchengming@bytedance.com
This commit is contained in:
Chengming Zhou 2022-09-07 17:03:32 +08:00 committed by Peter Zijlstra
parent dc86aba751
commit 34f26a1561
6 changed files with 152 additions and 13 deletions

View File

@ -976,6 +976,23 @@ All cgroup core files are prefixed with "cgroup."
killing cgroups is a process directed operation, i.e. it affects
the whole thread-group.
cgroup.pressure
A read-write single value file that allowed values are "0" and "1".
The default is "1".
Writing "0" to the file will disable the cgroup PSI accounting.
Writing "1" to the file will re-enable the cgroup PSI accounting.
This control attribute is not hierarchical, so disable or enable PSI
accounting in a cgroup does not affect PSI accounting in descendants
and doesn't need pass enablement via ancestors from root.
The reason this control attribute exists is that PSI accounts stalls for
each cgroup separately and aggregates it at each level of the hierarchy.
This may cause non-negligible overhead for some workloads when under
deep level of the hierarchy, in which case this control attribute can
be used to disable PSI accounting in the non-leaf cgroups.
irq.pressure
A read-write nested-keyed file.

View File

@ -428,6 +428,9 @@ struct cgroup {
struct cgroup_file procs_file; /* handle for "cgroup.procs" */
struct cgroup_file events_file; /* handle for "cgroup.events" */
/* handles for "{cpu,memory,io,irq}.pressure" */
struct cgroup_file psi_files[NR_PSI_RESOURCES];
/*
* The bitmask of subsystems enabled on the child cgroups.
* ->subtree_control is the one configured through

View File

@ -39,6 +39,7 @@ static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
int psi_cgroup_alloc(struct cgroup *cgrp);
void psi_cgroup_free(struct cgroup *cgrp);
void cgroup_move_task(struct task_struct *p, struct css_set *to);
void psi_cgroup_restart(struct psi_group *group);
#endif
#else /* CONFIG_PSI */
@ -60,6 +61,7 @@ static inline void cgroup_move_task(struct task_struct *p, struct css_set *to)
{
rcu_assign_pointer(p->cgroups, to);
}
static inline void psi_cgroup_restart(struct psi_group *group) {}
#endif
#endif /* CONFIG_PSI */

View File

@ -152,6 +152,7 @@ struct psi_trigger {
struct psi_group {
struct psi_group *parent;
bool enabled;
/* Protects data used by the aggregator */
struct mutex avgs_lock;
@ -194,6 +195,8 @@ struct psi_group {
#else /* CONFIG_PSI */
#define NR_PSI_RESOURCES 0
struct psi_group { };
#endif /* CONFIG_PSI */

View File

@ -3708,7 +3708,7 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
return psi_show(seq, psi, PSI_CPU);
}
static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, enum psi_res res)
{
struct cgroup_file_ctx *ctx = of->priv;
@ -3746,21 +3746,21 @@ static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
return pressure_write(of, buf, nbytes, PSI_IO);
}
static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
return pressure_write(of, buf, nbytes, PSI_MEM);
}
static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
return pressure_write(of, buf, nbytes, PSI_CPU);
}
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@ -3776,10 +3776,58 @@ static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
return cgroup_pressure_write(of, buf, nbytes, PSI_IRQ);
return pressure_write(of, buf, nbytes, PSI_IRQ);
}
#endif
static int cgroup_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
struct psi_group *psi = cgroup_psi(cgrp);
seq_printf(seq, "%d\n", psi->enabled);
return 0;
}
static ssize_t cgroup_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
ssize_t ret;
int enable;
struct cgroup *cgrp;
struct psi_group *psi;
ret = kstrtoint(strstrip(buf), 0, &enable);
if (ret)
return ret;
if (enable < 0 || enable > 1)
return -ERANGE;
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENOENT;
psi = cgroup_psi(cgrp);
if (psi->enabled != enable) {
int i;
/* show or hide {cpu,memory,io,irq}.pressure files */
for (i = 0; i < NR_PSI_RESOURCES; i++)
cgroup_file_show(&cgrp->psi_files[i], enable);
psi->enabled = enable;
if (enable)
psi_cgroup_restart(psi);
}
cgroup_kn_unlock(of->kn);
return nbytes;
}
static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
poll_table *pt)
{
@ -5175,6 +5223,7 @@ static struct cftype cgroup_base_files[] = {
{
.name = "io.pressure",
.flags = CFTYPE_PRESSURE,
.file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
.seq_show = cgroup_io_pressure_show,
.write = cgroup_io_pressure_write,
.poll = cgroup_pressure_poll,
@ -5183,6 +5232,7 @@ static struct cftype cgroup_base_files[] = {
{
.name = "memory.pressure",
.flags = CFTYPE_PRESSURE,
.file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
.seq_show = cgroup_memory_pressure_show,
.write = cgroup_memory_pressure_write,
.poll = cgroup_pressure_poll,
@ -5191,6 +5241,7 @@ static struct cftype cgroup_base_files[] = {
{
.name = "cpu.pressure",
.flags = CFTYPE_PRESSURE,
.file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
.seq_show = cgroup_cpu_pressure_show,
.write = cgroup_cpu_pressure_write,
.poll = cgroup_pressure_poll,
@ -5200,12 +5251,19 @@ static struct cftype cgroup_base_files[] = {
{
.name = "irq.pressure",
.flags = CFTYPE_PRESSURE,
.file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
.seq_show = cgroup_irq_pressure_show,
.write = cgroup_irq_pressure_write,
.poll = cgroup_pressure_poll,
.release = cgroup_pressure_release,
},
#endif
{
.name = "cgroup.pressure",
.flags = CFTYPE_PRESSURE,
.seq_show = cgroup_pressure_show,
.write = cgroup_pressure_write,
},
#endif /* CONFIG_PSI */
{ } /* terminate */
};

View File

@ -181,6 +181,7 @@ static void group_init(struct psi_group *group)
{
int cpu;
group->enabled = true;
for_each_possible_cpu(cpu)
seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
group->avg_last_update = sched_clock();
@ -696,17 +697,16 @@ static void psi_group_change(struct psi_group *group, int cpu,
groupc = per_cpu_ptr(group->pcpu, cpu);
/*
* First we assess the aggregate resource states this CPU's
* tasks have been in since the last change, and account any
* SOME and FULL time these may have resulted in.
*
* Then we update the task counts according to the state
* First we update the task counts according to the state
* change requested through the @clear and @set bits.
*
* Then if the cgroup PSI stats accounting enabled, we
* assess the aggregate resource states this CPU's tasks
* have been in since the last change, and account any
* SOME and FULL time these may have resulted in.
*/
write_seqcount_begin(&groupc->seq);
record_times(groupc, now);
/*
* Start with TSK_ONCPU, which doesn't have a corresponding
* task count - it's just a boolean flag directly encoded in
@ -745,6 +745,23 @@ static void psi_group_change(struct psi_group *group, int cpu,
if (set & (1 << t))
groupc->tasks[t]++;
if (!group->enabled) {
/*
* On the first group change after disabling PSI, conclude
* the current state and flush its time. This is unlikely
* to matter to the user, but aggregation (get_recent_times)
* may have already incorporated the live state into times_prev;
* avoid a delta sample underflow when PSI is later re-enabled.
*/
if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE)))
record_times(groupc, now);
groupc->state_mask = state_mask;
write_seqcount_end(&groupc->seq);
return;
}
for (s = 0; s < NR_PSI_STATES; s++) {
if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU))
state_mask |= (1 << s);
@ -761,6 +778,8 @@ static void psi_group_change(struct psi_group *group, int cpu,
if (unlikely((state_mask & PSI_ONCPU) && cpu_curr(cpu)->in_memstall))
state_mask |= (1 << PSI_MEM_FULL);
record_times(groupc, now);
groupc->state_mask = state_mask;
write_seqcount_end(&groupc->seq);
@ -907,6 +926,9 @@ void psi_account_irqtime(struct task_struct *task, u32 delta)
group = task_psi_group(task);
do {
if (!group->enabled)
continue;
groupc = per_cpu_ptr(group->pcpu, cpu);
write_seqcount_begin(&groupc->seq);
@ -1080,6 +1102,40 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
task_rq_unlock(rq, task, &rf);
}
void psi_cgroup_restart(struct psi_group *group)
{
int cpu;
/*
* After we disable psi_group->enabled, we don't actually
* stop percpu tasks accounting in each psi_group_cpu,
* instead only stop test_state() loop, record_times()
* and averaging worker, see psi_group_change() for details.
*
* When disable cgroup PSI, this function has nothing to sync
* since cgroup pressure files are hidden and percpu psi_group_cpu
* would see !psi_group->enabled and only do task accounting.
*
* When re-enable cgroup PSI, this function use psi_group_change()
* to get correct state mask from test_state() loop on tasks[],
* and restart groupc->state_start from now, use .clear = .set = 0
* here since no task status really changed.
*/
if (!group->enabled)
return;
for_each_possible_cpu(cpu) {
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;
u64 now;
rq_lock_irq(rq, &rf);
now = cpu_clock(cpu);
psi_group_change(group, cpu, 0, 0, now, true);
rq_unlock_irq(rq, &rf);
}
}
#endif /* CONFIG_CGROUPS */
int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)