perf evsel: Refactor tool events

Tool events unnecessarily open a dummy perf event which is useless
even with `perf record` which will still open a dummy event. Change
the behavior of tool events so:

 - duration_time - call `rdclock` on open and then report the count as
   a delta since the start in evsel__read_counter. This moves code out
   of builtin-stat making it more general purpose.

 - user_time/system_time - open the fd as either `/proc/pid/stat` or
   `/proc/stat` for cases like system wide. evsel__read_counter will
   read the appropriate field out of the procfs file. These values
   were previously supplied by wait4, if the procfs read fails then
   the wait4 values are used, assuming the process/thread terminated.
   By reading user_time and system_time this way, interval mode, per
   PID and per CPU can be supported although there are restrictions
   given what the files provide (e.g. per PID can't be combined with
   per CPU).

Opening any of the tool events for `perf record` is changed to return
invalid.

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Weilin Wang <weilin.wang@intel.com>
Cc: Ravi Bangoria <ravi.bangoria@amd.com>
Cc: James Clark <james.clark@arm.com>
Cc: Dmitrii Dolgov <9erthalion6@gmail.com>
Cc: Ze Gao <zegao2021@gmail.com>
Cc: Song Liu <song@kernel.org>
Cc: Leo Yan <leo.yan@linux.dev>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/r/20240503232849.17752-1-irogers@google.com
This commit is contained in:
Ian Rogers 2024-05-03 16:28:49 -07:00 committed by Namhyung Kim
parent 658a8805cb
commit 6828d6929b
4 changed files with 276 additions and 44 deletions

View File

@ -284,45 +284,38 @@ static int evsel__write_stat_event(struct evsel *counter, int cpu_map_idx, u32 t
process_synthesized_event, NULL);
}
static int read_single_counter(struct evsel *counter, int cpu_map_idx,
int thread, struct timespec *rs)
static int read_single_counter(struct evsel *counter, int cpu_map_idx, int thread)
{
switch(counter->tool_event) {
case PERF_TOOL_DURATION_TIME: {
u64 val = rs->tv_nsec + rs->tv_sec*1000000000ULL;
struct perf_counts_values *count =
perf_counts(counter->counts, cpu_map_idx, thread);
count->ena = count->run = val;
count->val = val;
return 0;
}
case PERF_TOOL_USER_TIME:
case PERF_TOOL_SYSTEM_TIME: {
u64 val;
struct perf_counts_values *count =
perf_counts(counter->counts, cpu_map_idx, thread);
if (counter->tool_event == PERF_TOOL_USER_TIME)
val = ru_stats.ru_utime_usec_stat.mean;
else
val = ru_stats.ru_stime_usec_stat.mean;
count->ena = count->run = val;
count->val = val;
return 0;
}
default:
case PERF_TOOL_NONE:
return evsel__read_counter(counter, cpu_map_idx, thread);
case PERF_TOOL_MAX:
/* This should never be reached */
return 0;
int err = evsel__read_counter(counter, cpu_map_idx, thread);
/*
* Reading user and system time will fail when the process
* terminates. Use the wait4 values in that case.
*/
if (err && cpu_map_idx == 0 &&
(counter->tool_event == PERF_TOOL_USER_TIME ||
counter->tool_event == PERF_TOOL_SYSTEM_TIME)) {
u64 val, *start_time;
struct perf_counts_values *count =
perf_counts(counter->counts, cpu_map_idx, thread);
start_time = xyarray__entry(counter->start_times, cpu_map_idx, thread);
if (counter->tool_event == PERF_TOOL_USER_TIME)
val = ru_stats.ru_utime_usec_stat.mean;
else
val = ru_stats.ru_stime_usec_stat.mean;
count->ena = count->run = *start_time + val;
count->val = val;
return 0;
}
return err;
}
/*
* Read out the results of a single counter:
* do not aggregate counts across CPUs in system-wide mode
*/
static int read_counter_cpu(struct evsel *counter, struct timespec *rs, int cpu_map_idx)
static int read_counter_cpu(struct evsel *counter, int cpu_map_idx)
{
int nthreads = perf_thread_map__nr(evsel_list->core.threads);
int thread;
@ -340,7 +333,7 @@ static int read_counter_cpu(struct evsel *counter, struct timespec *rs, int cpu_
* (via evsel__read_counter()) and sets their count->loaded.
*/
if (!perf_counts__is_loaded(counter->counts, cpu_map_idx, thread) &&
read_single_counter(counter, cpu_map_idx, thread, rs)) {
read_single_counter(counter, cpu_map_idx, thread)) {
counter->counts->scaled = -1;
perf_counts(counter->counts, cpu_map_idx, thread)->ena = 0;
perf_counts(counter->counts, cpu_map_idx, thread)->run = 0;
@ -369,7 +362,7 @@ static int read_counter_cpu(struct evsel *counter, struct timespec *rs, int cpu_
return 0;
}
static int read_affinity_counters(struct timespec *rs)
static int read_affinity_counters(void)
{
struct evlist_cpu_iterator evlist_cpu_itr;
struct affinity saved_affinity, *affinity;
@ -390,10 +383,8 @@ static int read_affinity_counters(struct timespec *rs)
if (evsel__is_bpf(counter))
continue;
if (!counter->err) {
counter->err = read_counter_cpu(counter, rs,
evlist_cpu_itr.cpu_map_idx);
}
if (!counter->err)
counter->err = read_counter_cpu(counter, evlist_cpu_itr.cpu_map_idx);
}
if (affinity)
affinity__cleanup(&saved_affinity);
@ -417,11 +408,11 @@ static int read_bpf_map_counters(void)
return 0;
}
static int read_counters(struct timespec *rs)
static int read_counters(void)
{
if (!stat_config.stop_read_counter) {
if (read_bpf_map_counters() ||
read_affinity_counters(rs))
read_affinity_counters())
return -1;
}
return 0;
@ -452,7 +443,7 @@ static void process_interval(void)
evlist__reset_aggr_stats(evsel_list);
if (read_counters(&rs) == 0)
if (read_counters() == 0)
process_counters();
if (STAT_RECORD) {
@ -940,7 +931,7 @@ try_again_reset:
* avoid arbitrary skew, we must read all counters before closing any
* group leaders.
*/
if (read_counters(&(struct timespec) { .tv_nsec = t1-t0 }) == 0)
if (read_counters() == 0)
process_counters();
/*

View File

@ -10,6 +10,7 @@
#include <errno.h>
#include <inttypes.h>
#include <linux/bitops.h>
#include <api/io.h>
#include <api/fs/fs.h>
#include <api/fs/tracing_path.h>
#include <linux/hw_breakpoint.h>
@ -30,6 +31,7 @@
#include "counts.h"
#include "event.h"
#include "evsel.h"
#include "time-utils.h"
#include "util/env.h"
#include "util/evsel_config.h"
#include "util/evsel_fprintf.h"
@ -1493,6 +1495,9 @@ void evsel__exit(struct evsel *evsel)
evsel->per_pkg_mask = NULL;
zfree(&evsel->metric_events);
perf_evsel__object.fini(evsel);
if (evsel->tool_event == PERF_TOOL_SYSTEM_TIME ||
evsel->tool_event == PERF_TOOL_USER_TIME)
xyarray__delete(evsel->start_times);
}
void evsel__delete(struct evsel *evsel)
@ -1606,11 +1611,173 @@ static int evsel__read_group(struct evsel *leader, int cpu_map_idx, int thread)
return evsel__process_group_data(leader, cpu_map_idx, thread, data);
}
static bool read_until_char(struct io *io, char e)
{
char c;
do {
c = io__get_char(io);
if (c == -1)
return false;
} while (c != e);
return true;
}
static int read_stat_field(int fd, struct perf_cpu cpu, int field, __u64 *val)
{
char buf[256];
struct io io;
int i;
io__init(&io, fd, buf, sizeof(buf));
/* Skip lines to relevant CPU. */
for (i = -1; i < cpu.cpu; i++) {
if (!read_until_char(&io, '\n'))
return -EINVAL;
}
/* Skip to "cpu". */
if (io__get_char(&io) != 'c') return -EINVAL;
if (io__get_char(&io) != 'p') return -EINVAL;
if (io__get_char(&io) != 'u') return -EINVAL;
/* Skip N of cpuN. */
if (!read_until_char(&io, ' '))
return -EINVAL;
i = 1;
while (true) {
if (io__get_dec(&io, val) != ' ')
break;
if (field == i)
return 0;
i++;
}
return -EINVAL;
}
static int read_pid_stat_field(int fd, int field, __u64 *val)
{
char buf[256];
struct io io;
int c, i;
io__init(&io, fd, buf, sizeof(buf));
if (io__get_dec(&io, val) != ' ')
return -EINVAL;
if (field == 1)
return 0;
/* Skip comm. */
if (io__get_char(&io) != '(' || !read_until_char(&io, ')'))
return -EINVAL;
if (field == 2)
return -EINVAL; /* String can't be returned. */
/* Skip state */
if (io__get_char(&io) != ' ' || io__get_char(&io) == -1)
return -EINVAL;
if (field == 3)
return -EINVAL; /* String can't be returned. */
/* Loop over numeric fields*/
if (io__get_char(&io) != ' ')
return -EINVAL;
i = 4;
while (true) {
c = io__get_dec(&io, val);
if (c == -1)
return -EINVAL;
if (c == -2) {
/* Assume a -ve was read */
c = io__get_dec(&io, val);
*val *= -1;
}
if (c != ' ')
return -EINVAL;
if (field == i)
return 0;
i++;
}
return -EINVAL;
}
static int evsel__read_tool(struct evsel *evsel, int cpu_map_idx, int thread)
{
__u64 *start_time, cur_time, delta_start;
int fd, err = 0;
struct perf_counts_values *count;
bool adjust = false;
count = perf_counts(evsel->counts, cpu_map_idx, thread);
switch (evsel->tool_event) {
case PERF_TOOL_DURATION_TIME:
/*
* Pretend duration_time is only on the first CPU and thread, or
* else aggregation will scale duration_time by the number of
* CPUs/threads.
*/
start_time = &evsel->start_time;
if (cpu_map_idx == 0 && thread == 0)
cur_time = rdclock();
else
cur_time = *start_time;
break;
case PERF_TOOL_USER_TIME:
case PERF_TOOL_SYSTEM_TIME: {
bool system = evsel->tool_event == PERF_TOOL_SYSTEM_TIME;
start_time = xyarray__entry(evsel->start_times, cpu_map_idx, thread);
fd = FD(evsel, cpu_map_idx, thread);
lseek(fd, SEEK_SET, 0);
if (evsel->pid_stat) {
/* The event exists solely on 1 CPU. */
if (cpu_map_idx == 0)
err = read_pid_stat_field(fd, system ? 15 : 14, &cur_time);
else
cur_time = 0;
} else {
/* The event is for all threads. */
if (thread == 0) {
struct perf_cpu cpu = perf_cpu_map__cpu(evsel->core.cpus,
cpu_map_idx);
err = read_stat_field(fd, cpu, system ? 3 : 1, &cur_time);
} else {
cur_time = 0;
}
}
adjust = true;
break;
}
case PERF_TOOL_NONE:
case PERF_TOOL_MAX:
default:
err = -EINVAL;
}
if (err)
return err;
delta_start = cur_time - *start_time;
if (adjust) {
__u64 ticks_per_sec = sysconf(_SC_CLK_TCK);
delta_start *= 1000000000 / ticks_per_sec;
}
count->val = delta_start;
count->ena = count->run = delta_start;
count->lost = 0;
return 0;
}
int evsel__read_counter(struct evsel *evsel, int cpu_map_idx, int thread)
{
u64 read_format = evsel->core.attr.read_format;
if (evsel__is_tool(evsel))
return evsel__read_tool(evsel, cpu_map_idx, thread);
if (read_format & PERF_FORMAT_GROUP)
if (evsel->core.attr.read_format & PERF_FORMAT_GROUP)
return evsel__read_group(evsel, cpu_map_idx, thread);
return evsel__read_one(evsel, cpu_map_idx, thread);
@ -1829,6 +1996,14 @@ static int __evsel__prepare_open(struct evsel *evsel, struct perf_cpu_map *cpus,
perf_evsel__alloc_fd(&evsel->core, perf_cpu_map__nr(cpus), nthreads) < 0)
return -ENOMEM;
if ((evsel->tool_event == PERF_TOOL_SYSTEM_TIME ||
evsel->tool_event == PERF_TOOL_USER_TIME) &&
!evsel->start_times) {
evsel->start_times = xyarray__new(perf_cpu_map__nr(cpus), nthreads, sizeof(__u64));
if (!evsel->start_times)
return -ENOMEM;
}
evsel->open_flags = PERF_FLAG_FD_CLOEXEC;
if (evsel->cgrp)
evsel->open_flags |= PERF_FLAG_PID_CGROUP;
@ -2011,6 +2186,13 @@ static int evsel__open_cpu(struct evsel *evsel, struct perf_cpu_map *cpus,
int pid = -1, err, old_errno;
enum rlimit_action set_rlimit = NO_CHANGE;
if (evsel->tool_event == PERF_TOOL_DURATION_TIME) {
if (evsel->core.attr.sample_period) /* no sampling */
return -EINVAL;
evsel->start_time = rdclock();
return 0;
}
err = __evsel__prepare_open(evsel, cpus, threads);
if (err)
return err;
@ -2043,6 +2225,46 @@ retry_open:
if (!evsel->cgrp && !evsel->core.system_wide)
pid = perf_thread_map__pid(threads, thread);
if (evsel->tool_event == PERF_TOOL_USER_TIME ||
evsel->tool_event == PERF_TOOL_SYSTEM_TIME) {
bool system = evsel->tool_event == PERF_TOOL_SYSTEM_TIME;
__u64 *start_time = NULL;
if (evsel->core.attr.sample_period) {
/* no sampling */
err = -EINVAL;
goto out_close;
}
if (pid > -1) {
char buf[64];
snprintf(buf, sizeof(buf), "/proc/%d/stat", pid);
fd = open(buf, O_RDONLY);
evsel->pid_stat = true;
} else {
fd = open("/proc/stat", O_RDONLY);
}
FD(evsel, idx, thread) = fd;
if (fd < 0) {
err = -errno;
goto out_close;
}
start_time = xyarray__entry(evsel->start_times, idx, thread);
if (pid > -1) {
err = read_pid_stat_field(fd, system ? 15 : 14,
start_time);
} else {
struct perf_cpu cpu;
cpu = perf_cpu_map__cpu(evsel->core.cpus, idx);
err = read_stat_field(fd, cpu, system ? 3 : 1,
start_time);
}
if (err)
goto out_close;
continue;
}
group_fd = get_group_fd(evsel, idx, thread);
if (group_fd == -2) {

View File

@ -170,6 +170,20 @@ struct evsel {
/* for missing_features */
struct perf_pmu *pmu;
/* For tool events */
/* Beginning time subtracted when the counter is read. */
union {
/* duration_time is a single global time. */
__u64 start_time;
/*
* user_time and system_time read an initial value potentially
* per-CPU or per-pid.
*/
struct xyarray *start_times;
};
/* Is the tool's fd for /proc/pid/stat or /proc/stat. */
bool pid_stat;
};
struct perf_missing_features {

View File

@ -305,11 +305,16 @@ static int add_event_tool(struct list_head *list, int *idx,
.type = PERF_TYPE_SOFTWARE,
.config = PERF_COUNT_SW_DUMMY,
};
const char *cpu_list = NULL;
if (tool_event == PERF_TOOL_DURATION_TIME) {
/* Duration time is gathered globally, pretend it is only on CPU0. */
cpu_list = "0";
}
evsel = __add_event(list, idx, &attr, /*init_attr=*/true, /*name=*/NULL,
/*metric_id=*/NULL, /*pmu=*/NULL,
/*config_terms=*/NULL, /*auto_merge_stats=*/false,
/*cpu_list=*/"0");
cpu_list);
if (!evsel)
return -ENOMEM;
evsel->tool_event = tool_event;