linux/kernel/sched/stats.c
Yafang Shao 847fc0cd06 sched: Introduce task block time in schedstats
Currently in schedstats we have sum_sleep_runtime and iowait_sum, but
there's no metric to show how long the task is in D state.  Once a task is
in D state, it means the task is blocked in the kernel, for example the
task may be waiting for a mutex. The D state is more frequent than
iowait, and it is more critical than S state. So it is worth adding a
metric to measure it.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20210905143547.4668-5-laoar.shao@gmail.com
2021-10-05 15:51:48 +02:00

233 lines
5.5 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* /proc/schedstat implementation
*/
#include "sched.h"
/*
 * Stamp the beginning of a wait period into @stats->wait_start.
 *
 * The stamp is normally the current rq clock.  When @p is non-NULL and the
 * clock is ahead of the value already stored in wait_start, that stored
 * value is subtracted from the clock first.  __update_stats_wait_end()
 * leaves an accumulated wait delta in wait_start for a migrating task, so
 * this subtraction backdates the new stamp by the time already waited.
 */
void __update_stats_wait_start(struct rq *rq, struct task_struct *p,
			       struct sched_statistics *stats)
{
	u64 now = rq_clock(rq);
	u64 carried = schedstat_val(stats->wait_start);
	u64 stamp = now;

	if (p && likely(now > carried))
		stamp = now - carried;

	__schedstat_set(stats->wait_start, stamp);
}
/*
 * Close the wait period that was opened in @stats->wait_start.
 *
 * The elapsed wait is the current rq clock minus wait_start.  For a task
 * that is migrating between runqueues nothing is accounted yet; the elapsed
 * time is parked in wait_start instead.  In every other case (including
 * @p == NULL) the elapsed time is folded into wait_max, wait_count and
 * wait_sum, and wait_start is reset to 0.  The sched_stat_wait tracepoint
 * fires only when a task is supplied.
 */
void __update_stats_wait_end(struct rq *rq, struct task_struct *p,
			     struct sched_statistics *stats)
{
	u64 waited = rq_clock(rq) - schedstat_val(stats->wait_start);

	if (p && task_on_rq_migrating(p)) {
		/*
		 * Preserve migrating task's wait time so wait_start
		 * time stamp can be adjusted to accumulate wait time
		 * prior to migration.
		 */
		__schedstat_set(stats->wait_start, waited);
		return;
	}

	if (p)
		trace_sched_stat_wait(p, waited);

	__schedstat_inc(stats->wait_count);
	__schedstat_add(stats->wait_sum, waited);
	__schedstat_set(stats->wait_max,
			max(schedstat_val(stats->wait_max), waited));
	__schedstat_set(stats->wait_start, 0);
}
void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p,
struct sched_statistics *stats)
{
u64 sleep_start, block_start;
sleep_start = schedstat_val(stats->sleep_start);
block_start = schedstat_val(stats->block_start);
if (sleep_start) {
u64 delta = rq_clock(rq) - sleep_start;
if ((s64)delta < 0)
delta = 0;
if (unlikely(delta > schedstat_val(stats->sleep_max)))
__schedstat_set(stats->sleep_max, delta);
__schedstat_set(stats->sleep_start, 0);
__schedstat_add(stats->sum_sleep_runtime, delta);
if (p) {
account_scheduler_latency(p, delta >> 10, 1);
trace_sched_stat_sleep(p, delta);
}
}
if (block_start) {
u64 delta = rq_clock(rq) - block_start;
if ((s64)delta < 0)
delta = 0;
if (unlikely(delta > schedstat_val(stats->block_max)))
__schedstat_set(stats->block_max, delta);
__schedstat_set(stats->block_start, 0);
__schedstat_add(stats->sum_sleep_runtime, delta);
__schedstat_add(stats->sum_block_runtime, delta);
if (p) {
if (p->in_iowait) {
__schedstat_add(stats->iowait_sum, delta);
__schedstat_inc(stats->iowait_count);
trace_sched_stat_iowait(p, delta);
}
trace_sched_stat_blocked(p, delta);
/*
* Blocking time is in units of nanosecs, so shift by
* 20 to get a milliseconds-range estimation of the
* amount of time that the task spent sleeping:
*/
if (unlikely(prof_on == SLEEP_PROFILING)) {
profile_hits(SLEEP_PROFILING,
(void *)get_wchan(p),
delta >> 20);
}
account_scheduler_latency(p, delta >> 10, 0);
}
}
}
/*
* Current schedstat API version, printed on the "version" line of
* /proc/schedstat.
*
* Bump this up when changing the output format or the meaning of an existing
* format, so that tools can adapt (or abort).
*/
#define SCHEDSTAT_VERSION 15
/*
 * seq_file ->show() callback for /proc/schedstat.
 *
 * @v is the iterator token: the value 1 selects the header (version and
 * timestamp lines); any other value is a CPU number offset by 2.  For a CPU
 * token one "cpuN" line with the runqueue counters is emitted and, on
 * CONFIG_SMP builds, one "domainN" line per scheduling domain of that CPU,
 * walked under rcu_read_lock().
 *
 * Always returns 0.
 */
static int show_schedstat(struct seq_file *seq, void *v)
{
	struct rq *rq;
	int cpu;
#ifdef CONFIG_SMP
	struct sched_domain *sd;
	int domain_idx = 0;
#endif

	/* Header token: no per-CPU data to print. */
	if (v == (void *)1) {
		seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
		seq_printf(seq, "timestamp %lu\n", jiffies);
		return 0;
	}

	cpu = (unsigned long)(v - 2);
	rq = cpu_rq(cpu);

	/* runqueue-specific stats */
	seq_printf(seq,
		   "cpu%d %u 0 %u %u %u %u %llu %llu %lu",
		   cpu,
		   rq->yld_count,
		   rq->sched_count,
		   rq->sched_goidle,
		   rq->ttwu_count,
		   rq->ttwu_local,
		   rq->rq_cpu_time,
		   rq->rq_sched_info.run_delay,
		   rq->rq_sched_info.pcount);
	seq_printf(seq, "\n");

#ifdef CONFIG_SMP
	/* domain-specific stats */
	rcu_read_lock();
	for_each_domain(cpu, sd) {
		enum cpu_idle_type idle;

		seq_printf(seq, "domain%d %*pb", domain_idx,
			   cpumask_pr_args(sched_domain_span(sd)));
		domain_idx++;

		/* One group of eight load-balance counters per idle type. */
		for (idle = CPU_IDLE; idle < CPU_MAX_IDLE_TYPES; idle++) {
			seq_printf(seq, " %u %u %u %u %u %u %u %u",
				   sd->lb_count[idle],
				   sd->lb_balanced[idle],
				   sd->lb_failed[idle],
				   sd->lb_imbalance[idle],
				   sd->lb_gained[idle],
				   sd->lb_hot_gained[idle],
				   sd->lb_nobusyq[idle],
				   sd->lb_nobusyg[idle]);
		}

		seq_printf(seq,
			   " %u %u %u %u %u %u %u %u %u %u %u %u\n",
			   sd->alb_count, sd->alb_failed, sd->alb_pushed,
			   sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
			   sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
			   sd->ttwu_wake_remote, sd->ttwu_move_affine,
			   sd->ttwu_move_balance);
	}
	rcu_read_unlock();
#endif

	return 0;
}
/*
 * This iterator needs some explanation.
 *
 * Position 0 yields the token 1, which stands for the header.  A CPU
 * numbered c is handed out as the token c + 2, so token 2 is cpu 0.
 *
 * In a hotplugged system some CPUs, including cpu 0, may be missing, so the
 * online mask is walked with the cpumask_* helpers.  *offset is rewritten
 * to the found CPU number plus one, which keeps the position in step with
 * the CPU actually reported.
 *
 * Returns NULL once no online CPU is left.
 */
static void *schedstat_start(struct seq_file *file, loff_t *offset)
{
	unsigned long pos = *offset;
	unsigned long cpu;

	if (!pos)
		return (void *) 1;

	/* Find the first online CPU numbered pos - 1 or higher. */
	if (pos > 1)
		cpu = cpumask_next(pos - 2, cpu_online_mask);
	else
		cpu = cpumask_first(cpu_online_mask);

	*offset = cpu + 1;

	if (cpu >= nr_cpu_ids)
		return NULL;

	return (void *)(unsigned long)(cpu + 2);
}
/*
 * seq_file ->next() callback: step the position forward by one and let
 * schedstat_start() turn it into the next token (or NULL at the end).
 */
static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset)
{
	*offset += 1;

	return schedstat_start(file, offset);
}
/*
* seq_file ->stop() callback: intentionally empty. schedstat_start() takes
* no lock or other resource that would have to be released here.
*/
static void schedstat_stop(struct seq_file *file, void *data)
{
}
/* seq_file iterator callbacks backing the "schedstat" proc entry. */
static const struct seq_operations schedstat_sops = {
.start = schedstat_start,
.next = schedstat_next,
.stop = schedstat_stop,
.show = show_schedstat,
};
/*
* Create the "schedstat" proc entry, served through schedstat_sops.
* 0 is passed for the mode and NULL for the parent directory.
*
* The result of proc_create_seq() is not checked; the function always
* returns 0. It is registered to run via subsys_initcall().
*/
static int __init proc_schedstat_init(void)
{
proc_create_seq("schedstat", 0, NULL, &schedstat_sops);
return 0;
}
subsys_initcall(proc_schedstat_init);