linux/tools/perf/builtin-ftrace.c
Alexey Budankov 6b3e0e2e04 perf tools: Support CAP_PERFMON capability
Extend error messages to mention CAP_PERFMON capability as an option to
substitute CAP_SYS_ADMIN capability for secure system performance
monitoring and observability operations. Make
perf_event_paranoid_check() and __cmd_ftrace() aware of the
CAP_PERFMON capability.

CAP_PERFMON implements the principle of least privilege for performance
monitoring and observability operations (POSIX IEEE 1003.1e 2.2.2.39
principle of least privilege: A security design principle that states
that a process or program be granted only those privileges (e.g.,
capabilities) necessary to accomplish its legitimate function, and only
for the time that such privileges are actually required).

For backward compatibility reasons access to perf_events subsystem remains
open for CAP_SYS_ADMIN privileged processes but CAP_SYS_ADMIN usage for
secure perf_events monitoring is discouraged with respect to CAP_PERFMON
capability.

Committer testing:

Using a libcap with this patch:

  diff --git a/libcap/include/uapi/linux/capability.h b/libcap/include/uapi/linux/capability.h
  index 78b2fd4c8a95..89b5b0279b60 100644
  --- a/libcap/include/uapi/linux/capability.h
  +++ b/libcap/include/uapi/linux/capability.h
  @@ -366,8 +366,9 @@ struct vfs_ns_cap_data {

   #define CAP_AUDIT_READ       37

  +#define CAP_PERFMON	     38

  -#define CAP_LAST_CAP         CAP_AUDIT_READ
  +#define CAP_LAST_CAP         CAP_PERFMON

   #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP)

Note that using '38' in place of 'cap_perfmon' works to some degree with
an old libcap; it's only when cap_get_flag() is called, and libcap
performs an error check based on the maximum capability value it knows
about, that it will fail.

This makes determining the default of perf_event_attr.exclude_kernel
fail, as perf can't determine whether CAP_PERFMON is in place.

Using 'perf top -e cycles' avoids the default check and sets
perf_event_attr.exclude_kernel to 1.

As root, with a libcap supporting CAP_PERFMON:

  # groupadd perf_users
  # adduser perf -g perf_users
  # mkdir ~perf/bin
  # cp ~acme/bin/perf ~perf/bin/
  # chgrp perf_users ~perf/bin/perf
  # setcap "cap_perfmon,cap_sys_ptrace,cap_syslog=ep" ~perf/bin/perf
  # getcap ~perf/bin/perf
  /home/perf/bin/perf = cap_sys_ptrace,cap_syslog,cap_perfmon+ep
  # ls -la ~perf/bin/perf
  -rwxr-xr-x. 1 root perf_users 16968552 Apr  9 13:10 /home/perf/bin/perf

As the 'perf' user in the 'perf_users' group:

  $ perf top -a --stdio
  Error:
  Failed to mmap with 1 (Operation not permitted)
  $

Either add the cap_ipc_lock capability to the perf binary or reduce the
ring buffer size to some smaller value:

  $ perf top -m10 -a --stdio
  rounding mmap pages size to 64K (16 pages)
  Error:
  Failed to mmap with 1 (Operation not permitted)
  $ perf top -m4 -a --stdio
  Error:
  Failed to mmap with 1 (Operation not permitted)
  $ perf top -m2 -a --stdio
   PerfTop: 762 irqs/sec  kernel:49.7%  exact: 100.0% lost: 0/0 drop: 0/0 [4000Hz cycles], (all, 4 CPUs)
  ------------------------------------------------------------------------------------------------------

     9.83%  perf                [.] __symbols__insert
     8.58%  perf                [.] rb_next
     5.91%  [kernel]            [k] module_get_kallsym
     5.66%  [kernel]            [k] kallsyms_expand_symbol.constprop.0
     3.98%  libc-2.29.so        [.] __GI_____strtoull_l_internal
     3.66%  perf                [.] rb_insert_color
     2.34%  [kernel]            [k] vsnprintf
     2.30%  [kernel]            [k] string_nocheck
     2.16%  libc-2.29.so        [.] _IO_getdelim
     2.15%  [kernel]            [k] number
     2.13%  [kernel]            [k] format_decode
     1.58%  libc-2.29.so        [.] _IO_feof
     1.52%  libc-2.29.so        [.] __strcmp_avx2
     1.50%  perf                [.] rb_set_parent_color
     1.47%  libc-2.29.so        [.] __libc_calloc
     1.24%  [kernel]            [k] do_syscall_64
     1.17%  [kernel]            [k] __x86_indirect_thunk_rax

  $ perf record -a sleep 1
  [ perf record: Woken up 1 times to write data ]
  [ perf record: Captured and wrote 0.552 MB perf.data (74 samples) ]
  $ perf evlist
  cycles
  $ perf evlist -v
  cycles: size: 120, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CPU|PERIOD, read_format: ID, disabled: 1, inherit: 1, mmap: 1, comm: 1, freq: 1, task: 1, precise_ip: 3, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1
  $ perf report | head -20
  # To display the perf.data header info, please use --header/--header-only options.
  #
  #
  # Total Lost Samples: 0
  #
  # Samples: 74  of event 'cycles'
  # Event count (approx.): 15694834
  #
  # Overhead  Command          Shared Object               Symbol
  # ........  ...............  ..........................  ......................................
  #
      19.62%  perf             [kernel.vmlinux]            [k] strnlen_user
      13.88%  swapper          [kernel.vmlinux]            [k] intel_idle
      13.83%  ksoftirqd/0      [kernel.vmlinux]            [k] pfifo_fast_dequeue
      13.51%  swapper          [kernel.vmlinux]            [k] kmem_cache_free
       6.31%  gnome-shell      [kernel.vmlinux]            [k] kmem_cache_free
       5.66%  kworker/u8:3+ix  [kernel.vmlinux]            [k] delay_tsc
       4.42%  perf             [kernel.vmlinux]            [k] __set_cpus_allowed_ptr
       3.45%  kworker/2:1-eve  [kernel.vmlinux]            [k] shmem_truncate_range
       2.29%  gnome-shell      libgobject-2.0.so.0.6000.7  [.] g_closure_ref
  $

Signed-off-by: Alexey Budankov <alexey.budankov@linux.intel.com>
Reviewed-by: James Morris <jamorris@linux.microsoft.com>
Acked-by: Jiri Olsa <jolsa@redhat.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Igor Lubashev <ilubashe@akamai.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Serge Hallyn <serge@hallyn.com>
Cc: Song Liu <songliubraving@fb.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: intel-gfx@lists.freedesktop.org
Cc: linux-doc@vger.kernel.org
Cc: linux-man@vger.kernel.org
Cc: linux-security-module@vger.kernel.org
Cc: selinux@vger.kernel.org
Link: http://lore.kernel.org/lkml/a66d5648-2b8e-577e-e1f2-1d56c017ab5e@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2020-04-16 12:19:08 -03:00

531 lines
12 KiB
C

// SPDX-License-Identifier: GPL-2.0-only
/*
* builtin-ftrace.c
*
* Copyright (c) 2013 LG Electronics, Namhyung Kim <namhyung@kernel.org>
*/
#include "builtin.h"
#include <errno.h>
#include <unistd.h>
#include <signal.h>
#include <stdlib.h>
#include <fcntl.h>
#include <poll.h>
#include <linux/capability.h>
#include <linux/string.h>
#include "debug.h"
#include <subcmd/pager.h>
#include <subcmd/parse-options.h>
#include <api/fs/tracing_path.h>
#include "evlist.h"
#include "target.h"
#include "cpumap.h"
#include "thread_map.h"
#include "util/cap.h"
#include "util/config.h"
#define DEFAULT_TRACER "function_graph"
/* State of one 'perf ftrace' invocation; set up in cmd_ftrace(). */
struct perf_ftrace {
struct evlist *evlist; /* created in cmd_ftrace(); its thread and cpu maps select what is traced */
struct target target; /* pid / cpu / system-wide selection from the command line */
const char *tracer; /* name written to current_tracer; starts as DEFAULT_TRACER */
struct list_head filters; /* -T entries, written to set_ftrace_filter */
struct list_head notrace; /* -N entries, written to set_ftrace_notrace */
struct list_head graph_funcs; /* -G entries, written to set_graph_function */
struct list_head nograph_funcs; /* -g entries, written to set_graph_notrace */
int graph_depth; /* -D value for max_graph_depth; 0 means it is not written */
};
/* One function name given to -T/-N/-G/-g, linked into a perf_ftrace list. */
struct filter_entry {
struct list_head list; /* node in one of the perf_ftrace filter lists */
char name[]; /* NUL-terminated function name, allocated together with the entry */
};
/*
 * Set by the signal handlers to ask the trace loop in __cmd_ftrace() to
 * stop. Because it is written in signal context and read by the main flow,
 * it must be a volatile sig_atomic_t for the access to be well defined.
 */
static volatile sig_atomic_t done;

/* Handler installed for SIGINT/SIGUSR1/SIGCHLD/SIGPIPE: request termination. */
static void sig_handler(int sig __maybe_unused)
{
	done = true;
}
/*
 * Passed to perf_evlist__prepare_workload() as its exec_error callback:
 * that function sends us a SIGUSR1 if the fork fails, and for now all we do
 * in response is ask the trace loop to stop.
 *
 * XXX This needs better handling, e.g. emitting an error. The errno is
 * available in info->si_value.sival_int but is not recorded yet.
 */
static void
ftrace__workload_exec_failed_signal(int signo __maybe_unused,
				    siginfo_t *info __maybe_unused,
				    void *ucontext __maybe_unused)
{
	done = true;
}
/*
 * Write @val, followed by a newline, to the tracing file called @name.
 *
 * @append selects O_APPEND (add to the current contents) instead of
 * O_TRUNC (replace them). Returns 0 only when the complete string was
 * written and -1 otherwise; failures are reported through pr_debug() only.
 */
static int __write_tracing_file(const char *name, const char *val, bool append)
{
	const int flags = O_WRONLY | (append ? O_APPEND : O_TRUNC);
	ssize_t len = strlen(val);
	char errbuf[512];
	char *tfile;
	char *payload;
	int ret = -1;
	int fd;

	tfile = get_tracing_file(name);
	if (!tfile) {
		pr_debug("cannot get tracing file: %s\n", name);
		return -1;
	}

	fd = open(tfile, flags);
	if (fd < 0) {
		pr_debug("cannot open tracing file: %s: %s\n",
			 name, str_error_r(errno, errbuf, sizeof(errbuf)));
		goto out;
	}

	/*
	 * The kernel can hide possible errors unless the value ends in a
	 * '\n', so work on a copy whose terminating NUL is replaced by one.
	 */
	payload = strdup(val);
	if (!payload)
		goto out_close;
	payload[len] = '\n';

	if (write(fd, payload, len + 1) != len + 1)
		pr_debug("write '%s' to tracing/%s failed: %s\n",
			 val, name, str_error_r(errno, errbuf, sizeof(errbuf)));
	else
		ret = 0;

	free(payload);
out_close:
	close(fd);
out:
	put_tracing_file(tfile);
	return ret;
}
/* Replace the contents of tracing file @name with @val; 0 on success, -1 on error. */
static int write_tracing_file(const char *name, const char *val)
{
	const bool append = false;

	return __write_tracing_file(name, val, append);
}
/* Append @val to tracing file @name, keeping what is there; 0 on success, -1 on error. */
static int append_tracing_file(const char *name, const char *val)
{
	const bool append = true;

	return __write_tracing_file(name, val, append);
}
static int reset_tracing_cpu(void);
static void reset_tracing_filters(void);
/*
 * Put the tracing files touched by this command back to a neutral state:
 * tracing off, "nop" tracer, no pid filter, default cpumask, graph depth 0
 * and empty function filters.
 *
 * The steps run in that order and the first failing one aborts with -1.
 * The function filters are reset last and their result is not checked.
 * Returns 0 otherwise.
 */
static int reset_tracing_files(struct perf_ftrace *ftrace __maybe_unused)
{
	if (write_tracing_file("tracing_on", "0") < 0 ||
	    write_tracing_file("current_tracer", "nop") < 0 ||
	    write_tracing_file("set_ftrace_pid", " ") < 0 ||
	    reset_tracing_cpu() < 0 ||
	    write_tracing_file("max_graph_depth", "0") < 0)
		return -1;

	reset_tracing_filters();
	return 0;
}
/*
 * Restrict tracing to the threads of the evlist's thread map by appending
 * each pid to set_ftrace_pid. Nothing is written when the target is CPU
 * based. Returns 0 on success and -1 as soon as one write fails.
 */
static int set_tracing_pid(struct perf_ftrace *ftrace)
{
	int i;
	char buf[16];

	if (target__has_cpu(&ftrace->target))
		return 0;

	for (i = 0; i < perf_thread_map__nr(ftrace->evlist->core.threads); i++) {
		/*
		 * Go through the libperf accessor instead of handing a raw
		 * map[] element to "%d": the element type is a libperf
		 * implementation detail and is not guaranteed to be an int,
		 * whereas the accessor always yields the pid.
		 */
		scnprintf(buf, sizeof(buf), "%d",
			  perf_thread_map__pid(ftrace->evlist->core.threads, i));
		if (append_tracing_file("set_ftrace_pid", buf) < 0)
			return -1;
	}

	return 0;
}
/*
 * Format @cpumap as a mask string and write it to tracing_cpumask.
 *
 * The buffer is sized from the cpu number stored in the last map entry.
 * Returns the result of the write, or -1 if the buffer cannot be allocated.
 */
static int set_tracing_cpumask(struct perf_cpu_map *cpumap)
{
	int last = cpu_map__cpu(cpumap, cpumap->nr - 1);
	size_t buf_len;
	char *mask_str;
	int ret;

	/* last / 4 + 1 mask characters, and one more byte for EOS */
	buf_len = last / 4 + 2;
	/* a ',' is needed for every 32 cpus */
	buf_len += last / 32;

	mask_str = malloc(buf_len);
	if (!mask_str) {
		pr_debug("failed to allocate cpu mask\n");
		return -1;
	}

	cpu_map__snprint_mask(cpumap, mask_str, buf_len);
	ret = write_tracing_file("tracing_cpumask", mask_str);

	free(mask_str);
	return ret;
}
/*
 * Apply the evlist's cpu map to tracing_cpumask, but only when the target
 * selects CPUs; otherwise do nothing and return 0.
 */
static int set_tracing_cpu(struct perf_ftrace *ftrace)
{
	if (target__has_cpu(&ftrace->target))
		return set_tracing_cpumask(ftrace->evlist->core.cpus);

	return 0;
}
/*
 * Restore tracing_cpumask to the map returned by perf_cpu_map__new(NULL)
 * (presumably the online CPUs; confirm against libperf).
 *
 * Returns the result of set_tracing_cpumask(), or -1 when the map cannot
 * be created. set_tracing_cpumask() dereferences the map unconditionally,
 * so a NULL map must be caught here.
 */
static int reset_tracing_cpu(void)
{
	struct perf_cpu_map *cpumap = perf_cpu_map__new(NULL);
	int ret;

	if (cpumap == NULL) {
		pr_debug("failed to allocate cpu map\n");
		return -1;
	}

	ret = set_tracing_cpumask(cpumap);
	perf_cpu_map__put(cpumap);
	return ret;
}
/*
 * Append every function name on @funcs to the tracing file @filter_file.
 * Stops at the first failed write and returns -1; returns 0 when all
 * entries were written (or the list is empty).
 */
static int __set_tracing_filter(const char *filter_file, struct list_head *funcs)
{
	struct filter_entry *entry;
	int err = 0;

	list_for_each_entry(entry, funcs, list) {
		if (append_tracing_file(filter_file, entry->name) < 0) {
			err = -1;
			break;
		}
	}

	return err;
}
/*
 * Install the four function filter lists of @ftrace.
 *
 * set_ftrace_filter, set_ftrace_notrace and set_graph_function are
 * mandatory: the first one that fails ends the function with its error.
 * set_graph_notrace is written best-effort because old kernels lack it.
 */
static int set_tracing_filters(struct perf_ftrace *ftrace)
{
	int err;

	err = __set_tracing_filter("set_ftrace_filter", &ftrace->filters);
	if (err >= 0)
		err = __set_tracing_filter("set_ftrace_notrace", &ftrace->notrace);
	if (err >= 0)
		err = __set_tracing_filter("set_graph_function", &ftrace->graph_funcs);
	if (err < 0)
		return err;

	/* old kernels do not have this filter, so its result is ignored */
	__set_tracing_filter("set_graph_notrace", &ftrace->nograph_funcs);

	return err;
}
/*
 * Clear all four function filter files by writing a single blank to each.
 * Failures are deliberately ignored.
 */
static void reset_tracing_filters(void)
{
	static const char * const filter_files[] = {
		"set_ftrace_filter",
		"set_ftrace_notrace",
		"set_graph_function",
		"set_graph_notrace",
	};
	size_t i;

	for (i = 0; i < sizeof(filter_files) / sizeof(filter_files[0]); i++)
		write_tracing_file(filter_files[i], " ");
}
/*
 * Write the requested graph depth to max_graph_depth.
 *
 * A depth of 0 means "not requested" and writes nothing. A negative depth
 * is rejected with an error message. Returns 0 on success, -1 on error.
 */
static int set_tracing_depth(struct perf_ftrace *ftrace)
{
	const int depth = ftrace->graph_depth;
	char buf[16];

	if (depth == 0)
		return 0;

	if (depth < 0) {
		pr_err("invalid graph depth: %d\n", depth);
		return -1;
	}

	snprintf(buf, sizeof(buf), "%d", depth);

	return write_tracing_file("max_graph_depth", buf) < 0 ? -1 : 0;
}
/*
 * Run one tracing session with the settings collected in @ftrace.
 *
 * @argc/@argv describe an optional workload command; when @argc is zero no
 * workload is prepared.
 *
 * Sequence: check capabilities, install signal handlers, reset the tracing
 * files, prepare the workload, program pid/cpu/filter/depth/tracer, open
 * trace_pipe, enable tracing, start the workload and copy trace_pipe to
 * stdout until a signal handler sets 'done'. Tracing is then disabled, the
 * rest of the buffer is drained and the tracing files are reset again.
 *
 * Returns -1 if the capability check fails. On every other path the result
 * depends only on 'done': 0 if it is set when the function returns, -1 if
 * not.
 */
static int __cmd_ftrace(struct perf_ftrace *ftrace, int argc, const char **argv)
{
char *trace_file;
int trace_fd;
char buf[4096];
struct pollfd pollfd = {
.events = POLLIN,
};
/* either CAP_PERFMON or CAP_SYS_ADMIN is accepted */
if (!(perf_cap__capable(CAP_PERFMON) ||
perf_cap__capable(CAP_SYS_ADMIN))) {
pr_err("ftrace only works for %s!\n",
#ifdef HAVE_LIBCAP_SUPPORT
"users with the CAP_PERFMON or CAP_SYS_ADMIN capability"
#else
"root"
#endif
);
return -1;
}
/* each of these signals just sets 'done', which ends the read loop below */
signal(SIGINT, sig_handler);
signal(SIGUSR1, sig_handler);
signal(SIGCHLD, sig_handler);
signal(SIGPIPE, sig_handler);
if (reset_tracing_files(ftrace) < 0) {
pr_err("failed to reset ftrace\n");
goto out;
}
/* reset ftrace buffer */
if (write_tracing_file("trace", "0") < 0)
goto out;
/* a workload is only prepared when a command was given (argc != 0) */
if (argc && perf_evlist__prepare_workload(ftrace->evlist,
&ftrace->target, argv, false,
ftrace__workload_exec_failed_signal) < 0) {
goto out;
}
/* from here on tracing files get modified, so failures go through out_reset */
if (set_tracing_pid(ftrace) < 0) {
pr_err("failed to set ftrace pid\n");
goto out_reset;
}
if (set_tracing_cpu(ftrace) < 0) {
pr_err("failed to set tracing cpumask\n");
goto out_reset;
}
if (set_tracing_filters(ftrace) < 0) {
pr_err("failed to set tracing filters\n");
goto out_reset;
}
if (set_tracing_depth(ftrace) < 0) {
pr_err("failed to set graph depth\n");
goto out_reset;
}
if (write_tracing_file("current_tracer", ftrace->tracer) < 0) {
pr_err("failed to set current_tracer to %s\n", ftrace->tracer);
goto out_reset;
}
setup_pager();
trace_file = get_tracing_file("trace_pipe");
if (!trace_file) {
pr_err("failed to open trace_pipe\n");
goto out_reset;
}
trace_fd = open(trace_file, O_RDONLY);
/* the path string is only needed for open() */
put_tracing_file(trace_file);
if (trace_fd < 0) {
pr_err("failed to open trace_pipe\n");
goto out_reset;
}
/* non-blocking, so the drain loop after tracing is turned off cannot block */
fcntl(trace_fd, F_SETFL, O_NONBLOCK);
pollfd.fd = trace_fd;
if (write_tracing_file("tracing_on", "1") < 0) {
pr_err("can't enable tracing\n");
goto out_close_fd;
}
/* tracing is enabled before the workload is started */
perf_evlist__start_workload(ftrace->evlist);
/* copy trace_pipe to stdout until a signal handler sets 'done' or an I/O call fails */
while (!done) {
if (poll(&pollfd, 1, -1) < 0)
break;
if (pollfd.revents & POLLIN) {
int n = read(trace_fd, buf, sizeof(buf));
if (n < 0)
break;
/* the chunk is written as one item of n bytes, so anything but 1 is a failure */
if (fwrite(buf, n, 1, stdout) != 1)
break;
}
}
write_tracing_file("tracing_on", "0");
/* read remaining buffer contents */
while (true) {
int n = read(trace_fd, buf, sizeof(buf));
if (n <= 0)
break;
if (fwrite(buf, n, 1, stdout) != 1)
break;
}
out_close_fd:
close(trace_fd);
out_reset:
reset_tracing_files(ftrace);
out:
/* the result depends only on whether a signal handler has set 'done' */
return done ? 0 : -1;
}
/*
 * perf_config() callback for the "ftrace." section.
 *
 * @cb is the struct perf_ftrace being configured. Variables outside the
 * section are ignored (return 0). Inside it only "ftrace.tracer" is
 * accepted, and only with the value "function_graph" or "function"; any
 * other variable or value returns -1.
 */
static int perf_ftrace_config(const char *var, const char *value, void *cb)
{
	struct perf_ftrace *ftrace = cb;

	if (!strstarts(var, "ftrace."))
		return 0;

	if (strcmp(var, "ftrace.tracer") != 0)
		return -1;

	if (strcmp(value, "function_graph") != 0 &&
	    strcmp(value, "function") != 0) {
		pr_err("Please select \"function_graph\" (default) or \"function\"\n");
		return -1;
	}

	ftrace->tracer = value;
	return 0;
}
/*
 * Option callback shared by -T/-N/-G/-g: copy the function name @str into
 * a new filter_entry and queue it at the tail of the list that opt->value
 * points to. Returns 0, or -ENOMEM if the entry cannot be allocated.
 */
static int parse_filter_func(const struct option *opt, const char *str,
			     int unset __maybe_unused)
{
	struct list_head *head = opt->value;
	size_t name_len = strlen(str) + 1;	/* includes the terminating NUL */
	struct filter_entry *entry;

	entry = malloc(sizeof(*entry) + name_len);
	if (!entry)
		return -ENOMEM;

	memcpy(entry->name, str, name_len);
	list_add_tail(&entry->list, head);

	return 0;
}
/* Unlink and free every filter_entry on @head, leaving the list empty. */
static void delete_filter_func(struct list_head *head)
{
	struct filter_entry *entry, *n;

	/* the _safe variant is needed because entries are freed while walking */
	list_for_each_entry_safe(entry, n, head, list) {
		list_del_init(&entry->list);
		free(entry);
	}
}
/*
 * Entry point of 'perf ftrace'.
 *
 * Applies the "ftrace." config section, parses the command line up to the
 * first non-option argument (the rest is the workload command), validates
 * the target, builds the evlist with its thread/cpu maps and then runs
 * __cmd_ftrace().
 *
 * Returns -1 if reading the config fails; otherwise the error of the step
 * that failed, or the result of __cmd_ftrace(). The filter lists and the
 * evlist are released before returning.
 */
int cmd_ftrace(int argc, const char **argv)
{
	int ret;
	struct perf_ftrace ftrace = {
		.tracer = DEFAULT_TRACER,
		.target = { .uid = UINT_MAX, },
	};
	const char * const usage_str[] = {
		"perf ftrace [<options>] [<command>]",
		"perf ftrace [<options>] -- <command> [<options>]",
		NULL
	};
	const struct option opts[] = {
	OPT_STRING('t', "tracer", &ftrace.tracer, "tracer",
		   "tracer to use: function_graph(default) or function"),
	OPT_STRING('p', "pid", &ftrace.target.pid, "pid",
		   "trace on existing process id"),
	OPT_INCR('v', "verbose", &verbose,
		 "be more verbose"),
	OPT_BOOLEAN('a', "all-cpus", &ftrace.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &ftrace.target.cpu_list, "cpu",
		   "list of cpus to monitor"),
	OPT_CALLBACK('T', "trace-funcs", &ftrace.filters, "func",
		     "trace given functions only", parse_filter_func),
	OPT_CALLBACK('N', "notrace-funcs", &ftrace.notrace, "func",
		     "do not trace given functions", parse_filter_func),
	OPT_CALLBACK('G', "graph-funcs", &ftrace.graph_funcs, "func",
		     "Set graph filter on given functions", parse_filter_func),
	OPT_CALLBACK('g', "nograph-funcs", &ftrace.nograph_funcs, "func",
		     "Set nograph filter on given functions", parse_filter_func),
	OPT_INTEGER('D', "graph-depth", &ftrace.graph_depth,
		    "Max depth for function graph tracer"),
	OPT_END()
	};

	/* the lists must be valid before option parsing can append to them */
	INIT_LIST_HEAD(&ftrace.filters);
	INIT_LIST_HEAD(&ftrace.notrace);
	INIT_LIST_HEAD(&ftrace.graph_funcs);
	INIT_LIST_HEAD(&ftrace.nograph_funcs);

	if (perf_config(perf_ftrace_config, &ftrace) < 0)
		return -1;

	/* stop at the first non-option so the workload keeps its own options */
	argc = parse_options(argc, argv, opts, usage_str,
			     PARSE_OPT_STOP_AT_NON_OPTION);

	/* neither a workload command nor a target was given */
	if (!argc && target__none(&ftrace.target))
		usage_with_options(usage_str, opts);

	ret = target__validate(&ftrace.target);
	if (ret != 0) {
		char errbuf[512];

		target__strerror(&ftrace.target, ret, errbuf, sizeof(errbuf));
		pr_err("%s\n", errbuf);
		goto out_delete_filters;
	}

	ftrace.evlist = evlist__new();
	if (!ftrace.evlist) {
		ret = -ENOMEM;
		goto out_delete_filters;
	}

	ret = perf_evlist__create_maps(ftrace.evlist, &ftrace.target);
	if (ret < 0)
		goto out_delete_evlist;

	ret = __cmd_ftrace(&ftrace, argc, argv);

out_delete_evlist:
	evlist__delete(ftrace.evlist);

out_delete_filters:
	delete_filter_func(&ftrace.filters);
	delete_filter_func(&ftrace.notrace);
	delete_filter_func(&ftrace.graph_funcs);
	delete_filter_func(&ftrace.nograph_funcs);

	return ret;
}