linux/samples/bpf/bpf_load.c
Jesper Dangaard Brouer 156450d9d9 samples/bpf: make bpf_load.c code compatible with ELF maps section changes
This patch does proper parsing of the ELF "maps" section, in-order to
be both backwards and forwards compatible with changes to the map
definition struct bpf_map_def, which gets compiled into the ELF file.

The assumption is that new features with value zero, means that they
are not in-use.  For backward compatibility where loading an ELF file
with a smaller struct bpf_map_def, only copy objects ELF size, leaving
rest of loaders struct zero.  For forward compatibility where ELF file
have a larger struct bpf_map_def, only copy loaders own struct size
and verify that rest of the larger struct is zero, assuming this means
the newer feature was not activated, thus it should be safe for this
older loader to load this newer ELF file.

Fixes: fb30d4b712 ("bpf: Add tests for map-in-map")
Fixes: 409526bea3c3 ("samples/bpf: bpf_load.c detect and abort if ELF maps section size is wrong")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-05-03 09:30:24 -04:00

777 lines
18 KiB
C

#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <libelf.h>
#include <gelf.h>
#include <errno.h>
#include <unistd.h>
#include <string.h>
#include <stdbool.h>
#include <stdlib.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/perf_event.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/types.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <poll.h>
#include <ctype.h>
#include <assert.h>
#include "libbpf.h"
#include "bpf_load.h"
#include "perf-sys.h"
#define DEBUGFS "/sys/kernel/debug/tracing/"
static char license[128];
static int kern_version;
static bool processed_sec[128];
char bpf_log_buf[BPF_LOG_BUF_SIZE];
int map_fd[MAX_MAPS];
int prog_fd[MAX_PROGS];
int event_fd[MAX_PROGS];
int prog_cnt;
int prog_array_fd = -1;
/* Keeping relevant info on maps */
struct bpf_map_data {
int fd;
char *name;
size_t elf_offset;
struct bpf_map_def def;
};
struct bpf_map_data map_data[MAX_MAPS];
int map_data_count = 0;
static int populate_prog_array(const char *event, int prog_fd)
{
int ind = atoi(event), err;
err = bpf_map_update_elem(prog_array_fd, &ind, &prog_fd, BPF_ANY);
if (err < 0) {
printf("failed to store prog_fd in prog_array\n");
return -1;
}
return 0;
}
static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
{
bool is_socket = strncmp(event, "socket", 6) == 0;
bool is_kprobe = strncmp(event, "kprobe/", 7) == 0;
bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0;
bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0;
bool is_xdp = strncmp(event, "xdp", 3) == 0;
bool is_perf_event = strncmp(event, "perf_event", 10) == 0;
bool is_cgroup_skb = strncmp(event, "cgroup/skb", 10) == 0;
bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0;
size_t insns_cnt = size / sizeof(struct bpf_insn);
enum bpf_prog_type prog_type;
char buf[256];
int fd, efd, err, id;
struct perf_event_attr attr = {};
attr.type = PERF_TYPE_TRACEPOINT;
attr.sample_type = PERF_SAMPLE_RAW;
attr.sample_period = 1;
attr.wakeup_events = 1;
if (is_socket) {
prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
} else if (is_kprobe || is_kretprobe) {
prog_type = BPF_PROG_TYPE_KPROBE;
} else if (is_tracepoint) {
prog_type = BPF_PROG_TYPE_TRACEPOINT;
} else if (is_xdp) {
prog_type = BPF_PROG_TYPE_XDP;
} else if (is_perf_event) {
prog_type = BPF_PROG_TYPE_PERF_EVENT;
} else if (is_cgroup_skb) {
prog_type = BPF_PROG_TYPE_CGROUP_SKB;
} else if (is_cgroup_sk) {
prog_type = BPF_PROG_TYPE_CGROUP_SOCK;
} else {
printf("Unknown event '%s'\n", event);
return -1;
}
fd = bpf_load_program(prog_type, prog, insns_cnt, license, kern_version,
bpf_log_buf, BPF_LOG_BUF_SIZE);
if (fd < 0) {
printf("bpf_load_program() err=%d\n%s", errno, bpf_log_buf);
return -1;
}
prog_fd[prog_cnt++] = fd;
if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk)
return 0;
if (is_socket) {
event += 6;
if (*event != '/')
return 0;
event++;
if (!isdigit(*event)) {
printf("invalid prog number\n");
return -1;
}
return populate_prog_array(event, fd);
}
if (is_kprobe || is_kretprobe) {
if (is_kprobe)
event += 7;
else
event += 10;
if (*event == 0) {
printf("event name cannot be empty\n");
return -1;
}
if (isdigit(*event))
return populate_prog_array(event, fd);
snprintf(buf, sizeof(buf),
"echo '%c:%s %s' >> /sys/kernel/debug/tracing/kprobe_events",
is_kprobe ? 'p' : 'r', event, event);
err = system(buf);
if (err < 0) {
printf("failed to create kprobe '%s' error '%s'\n",
event, strerror(errno));
return -1;
}
strcpy(buf, DEBUGFS);
strcat(buf, "events/kprobes/");
strcat(buf, event);
strcat(buf, "/id");
} else if (is_tracepoint) {
event += 11;
if (*event == 0) {
printf("event name cannot be empty\n");
return -1;
}
strcpy(buf, DEBUGFS);
strcat(buf, "events/");
strcat(buf, event);
strcat(buf, "/id");
}
efd = open(buf, O_RDONLY, 0);
if (efd < 0) {
printf("failed to open event %s\n", event);
return -1;
}
err = read(efd, buf, sizeof(buf));
if (err < 0 || err >= sizeof(buf)) {
printf("read from '%s' failed '%s'\n", event, strerror(errno));
return -1;
}
close(efd);
buf[err] = 0;
id = atoi(buf);
attr.config = id;
efd = sys_perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0);
if (efd < 0) {
printf("event %d fd %d err %s\n", id, efd, strerror(errno));
return -1;
}
event_fd[prog_cnt - 1] = efd;
ioctl(efd, PERF_EVENT_IOC_ENABLE, 0);
ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd);
return 0;
}
static int load_maps(struct bpf_map_data *maps, int nr_maps,
fixup_map_cb fixup_map)
{
int i;
for (i = 0; i < nr_maps; i++) {
if (fixup_map)
fixup_map(&maps[i].def, maps[i].name, i);
if (maps[i].def.type == BPF_MAP_TYPE_ARRAY_OF_MAPS ||
maps[i].def.type == BPF_MAP_TYPE_HASH_OF_MAPS) {
int inner_map_fd = map_fd[maps[i].def.inner_map_idx];
map_fd[i] = bpf_create_map_in_map(maps[i].def.type,
maps[i].def.key_size,
inner_map_fd,
maps[i].def.max_entries,
maps[i].def.map_flags);
} else {
map_fd[i] = bpf_create_map(maps[i].def.type,
maps[i].def.key_size,
maps[i].def.value_size,
maps[i].def.max_entries,
maps[i].def.map_flags);
}
if (map_fd[i] < 0) {
printf("failed to create a map: %d %s\n",
errno, strerror(errno));
return 1;
}
maps[i].fd = map_fd[i];
if (maps[i].def.type == BPF_MAP_TYPE_PROG_ARRAY)
prog_array_fd = map_fd[i];
}
return 0;
}
static int get_sec(Elf *elf, int i, GElf_Ehdr *ehdr, char **shname,
GElf_Shdr *shdr, Elf_Data **data)
{
Elf_Scn *scn;
scn = elf_getscn(elf, i);
if (!scn)
return 1;
if (gelf_getshdr(scn, shdr) != shdr)
return 2;
*shname = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name);
if (!*shname || !shdr->sh_size)
return 3;
*data = elf_getdata(scn, 0);
if (!*data || elf_getdata(scn, *data) != NULL)
return 4;
return 0;
}
static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols,
GElf_Shdr *shdr, struct bpf_insn *insn,
struct bpf_map_data *maps, int nr_maps)
{
int i, nrels;
nrels = shdr->sh_size / shdr->sh_entsize;
for (i = 0; i < nrels; i++) {
GElf_Sym sym;
GElf_Rel rel;
unsigned int insn_idx;
bool match = false;
int j, map_idx;
gelf_getrel(data, i, &rel);
insn_idx = rel.r_offset / sizeof(struct bpf_insn);
gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &sym);
if (insn[insn_idx].code != (BPF_LD | BPF_IMM | BPF_DW)) {
printf("invalid relo for insn[%d].code 0x%x\n",
insn_idx, insn[insn_idx].code);
return 1;
}
insn[insn_idx].src_reg = BPF_PSEUDO_MAP_FD;
/* Match FD relocation against recorded map_data[] offset */
for (map_idx = 0; map_idx < nr_maps; map_idx++) {
if (maps[map_idx].elf_offset == sym.st_value) {
match = true;
break;
}
}
if (match) {
insn[insn_idx].imm = maps[map_idx].fd;
} else {
printf("invalid relo for insn[%d] no map_data match\n",
insn_idx);
return 1;
}
}
return 0;
}
static int cmp_symbols(const void *l, const void *r)
{
const GElf_Sym *lsym = (const GElf_Sym *)l;
const GElf_Sym *rsym = (const GElf_Sym *)r;
if (lsym->st_value < rsym->st_value)
return -1;
else if (lsym->st_value > rsym->st_value)
return 1;
else
return 0;
}
static int load_elf_maps_section(struct bpf_map_data *maps, int maps_shndx,
Elf *elf, Elf_Data *symbols, int strtabidx)
{
int map_sz_elf, map_sz_copy;
bool validate_zero = false;
Elf_Data *data_maps;
int i, nr_maps;
GElf_Sym *sym;
Elf_Scn *scn;
int copy_sz;
if (maps_shndx < 0)
return -EINVAL;
if (!symbols)
return -EINVAL;
/* Get data for maps section via elf index */
scn = elf_getscn(elf, maps_shndx);
if (scn)
data_maps = elf_getdata(scn, NULL);
if (!scn || !data_maps) {
printf("Failed to get Elf_Data from maps section %d\n",
maps_shndx);
return -EINVAL;
}
/* For each map get corrosponding symbol table entry */
sym = calloc(MAX_MAPS+1, sizeof(GElf_Sym));
for (i = 0, nr_maps = 0; i < symbols->d_size / sizeof(GElf_Sym); i++) {
assert(nr_maps < MAX_MAPS+1);
if (!gelf_getsym(symbols, i, &sym[nr_maps]))
continue;
if (sym[nr_maps].st_shndx != maps_shndx)
continue;
/* Only increment iif maps section */
nr_maps++;
}
/* Align to map_fd[] order, via sort on offset in sym.st_value */
qsort(sym, nr_maps, sizeof(GElf_Sym), cmp_symbols);
/* Keeping compatible with ELF maps section changes
* ------------------------------------------------
* The program size of struct bpf_map_def is known by loader
* code, but struct stored in ELF file can be different.
*
* Unfortunately sym[i].st_size is zero. To calculate the
* struct size stored in the ELF file, assume all struct have
* the same size, and simply divide with number of map
* symbols.
*/
map_sz_elf = data_maps->d_size / nr_maps;
map_sz_copy = sizeof(struct bpf_map_def);
if (map_sz_elf < map_sz_copy) {
/*
* Backward compat, loading older ELF file with
* smaller struct, keeping remaining bytes zero.
*/
map_sz_copy = map_sz_elf;
} else if (map_sz_elf > map_sz_copy) {
/*
* Forward compat, loading newer ELF file with larger
* struct with unknown features. Assume zero means
* feature not used. Thus, validate rest of struct
* data is zero.
*/
validate_zero = true;
}
/* Memcpy relevant part of ELF maps data to loader maps */
for (i = 0; i < nr_maps; i++) {
unsigned char *addr, *end;
struct bpf_map_def *def;
const char *map_name;
size_t offset;
map_name = elf_strptr(elf, strtabidx, sym[i].st_name);
maps[i].name = strdup(map_name);
if (!maps[i].name) {
printf("strdup(%s): %s(%d)\n", map_name,
strerror(errno), errno);
free(sym);
return -errno;
}
/* Symbol value is offset into ELF maps section data area */
offset = sym[i].st_value;
def = (struct bpf_map_def *)(data_maps->d_buf + offset);
maps[i].elf_offset = offset;
memset(&maps[i].def, 0, sizeof(struct bpf_map_def));
memcpy(&maps[i].def, def, map_sz_copy);
/* Verify no newer features were requested */
if (validate_zero) {
addr = (unsigned char*) def + map_sz_copy;
end = (unsigned char*) def + map_sz_elf;
for (; addr < end; addr++) {
if (*addr != 0) {
free(sym);
return -EFBIG;
}
}
}
}
free(sym);
return nr_maps;
}
static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map)
{
int fd, i, ret, maps_shndx = -1, strtabidx = -1;
Elf *elf;
GElf_Ehdr ehdr;
GElf_Shdr shdr, shdr_prog;
Elf_Data *data, *data_prog, *data_maps = NULL, *symbols = NULL;
char *shname, *shname_prog;
int nr_maps = 0;
/* reset global variables */
kern_version = 0;
memset(license, 0, sizeof(license));
memset(processed_sec, 0, sizeof(processed_sec));
if (elf_version(EV_CURRENT) == EV_NONE)
return 1;
fd = open(path, O_RDONLY, 0);
if (fd < 0)
return 1;
elf = elf_begin(fd, ELF_C_READ, NULL);
if (!elf)
return 1;
if (gelf_getehdr(elf, &ehdr) != &ehdr)
return 1;
/* clear all kprobes */
i = system("echo \"\" > /sys/kernel/debug/tracing/kprobe_events");
/* scan over all elf sections to get license and map info */
for (i = 1; i < ehdr.e_shnum; i++) {
if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
continue;
if (0) /* helpful for llvm debugging */
printf("section %d:%s data %p size %zd link %d flags %d\n",
i, shname, data->d_buf, data->d_size,
shdr.sh_link, (int) shdr.sh_flags);
if (strcmp(shname, "license") == 0) {
processed_sec[i] = true;
memcpy(license, data->d_buf, data->d_size);
} else if (strcmp(shname, "version") == 0) {
processed_sec[i] = true;
if (data->d_size != sizeof(int)) {
printf("invalid size of version section %zd\n",
data->d_size);
return 1;
}
memcpy(&kern_version, data->d_buf, sizeof(int));
} else if (strcmp(shname, "maps") == 0) {
int j;
maps_shndx = i;
data_maps = data;
for (j = 0; j < MAX_MAPS; j++)
map_data[j].fd = -1;
} else if (shdr.sh_type == SHT_SYMTAB) {
strtabidx = shdr.sh_link;
symbols = data;
}
}
ret = 1;
if (!symbols) {
printf("missing SHT_SYMTAB section\n");
goto done;
}
if (data_maps) {
nr_maps = load_elf_maps_section(map_data, maps_shndx,
elf, symbols, strtabidx);
if (nr_maps < 0) {
printf("Error: Failed loading ELF maps (errno:%d):%s\n",
nr_maps, strerror(-nr_maps));
ret = 1;
goto done;
}
if (load_maps(map_data, nr_maps, fixup_map))
goto done;
map_data_count = nr_maps;
processed_sec[maps_shndx] = true;
}
/* load programs that need map fixup (relocations) */
for (i = 1; i < ehdr.e_shnum; i++) {
if (processed_sec[i])
continue;
if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
continue;
if (shdr.sh_type == SHT_REL) {
struct bpf_insn *insns;
if (get_sec(elf, shdr.sh_info, &ehdr, &shname_prog,
&shdr_prog, &data_prog))
continue;
if (shdr_prog.sh_type != SHT_PROGBITS ||
!(shdr_prog.sh_flags & SHF_EXECINSTR))
continue;
insns = (struct bpf_insn *) data_prog->d_buf;
processed_sec[shdr.sh_info] = true;
processed_sec[i] = true;
if (parse_relo_and_apply(data, symbols, &shdr, insns,
map_data, nr_maps))
continue;
if (memcmp(shname_prog, "kprobe/", 7) == 0 ||
memcmp(shname_prog, "kretprobe/", 10) == 0 ||
memcmp(shname_prog, "tracepoint/", 11) == 0 ||
memcmp(shname_prog, "xdp", 3) == 0 ||
memcmp(shname_prog, "perf_event", 10) == 0 ||
memcmp(shname_prog, "socket", 6) == 0 ||
memcmp(shname_prog, "cgroup/", 7) == 0)
load_and_attach(shname_prog, insns, data_prog->d_size);
}
}
/* load programs that don't use maps */
for (i = 1; i < ehdr.e_shnum; i++) {
if (processed_sec[i])
continue;
if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
continue;
if (memcmp(shname, "kprobe/", 7) == 0 ||
memcmp(shname, "kretprobe/", 10) == 0 ||
memcmp(shname, "tracepoint/", 11) == 0 ||
memcmp(shname, "xdp", 3) == 0 ||
memcmp(shname, "perf_event", 10) == 0 ||
memcmp(shname, "socket", 6) == 0 ||
memcmp(shname, "cgroup/", 7) == 0)
load_and_attach(shname, data->d_buf, data->d_size);
}
ret = 0;
done:
close(fd);
return ret;
}
int load_bpf_file(char *path)
{
return do_load_bpf_file(path, NULL);
}
int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map)
{
return do_load_bpf_file(path, fixup_map);
}
void read_trace_pipe(void)
{
int trace_fd;
trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0);
if (trace_fd < 0)
return;
while (1) {
static char buf[4096];
ssize_t sz;
sz = read(trace_fd, buf, sizeof(buf));
if (sz > 0) {
buf[sz] = 0;
puts(buf);
}
}
}
#define MAX_SYMS 300000
static struct ksym syms[MAX_SYMS];
static int sym_cnt;
static int ksym_cmp(const void *p1, const void *p2)
{
return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr;
}
int load_kallsyms(void)
{
FILE *f = fopen("/proc/kallsyms", "r");
char func[256], buf[256];
char symbol;
void *addr;
int i = 0;
if (!f)
return -ENOENT;
while (!feof(f)) {
if (!fgets(buf, sizeof(buf), f))
break;
if (sscanf(buf, "%p %c %s", &addr, &symbol, func) != 3)
break;
if (!addr)
continue;
syms[i].addr = (long) addr;
syms[i].name = strdup(func);
i++;
}
sym_cnt = i;
qsort(syms, sym_cnt, sizeof(struct ksym), ksym_cmp);
return 0;
}
struct ksym *ksym_search(long key)
{
int start = 0, end = sym_cnt;
int result;
while (start < end) {
size_t mid = start + (end - start) / 2;
result = key - syms[mid].addr;
if (result < 0)
end = mid;
else if (result > 0)
start = mid + 1;
else
return &syms[mid];
}
if (start >= 1 && syms[start - 1].addr < key &&
key < syms[start].addr)
/* valid ksym */
return &syms[start - 1];
/* out of range. return _stext */
return &syms[0];
}
int set_link_xdp_fd(int ifindex, int fd, __u32 flags)
{
struct sockaddr_nl sa;
int sock, seq = 0, len, ret = -1;
char buf[4096];
struct nlattr *nla, *nla_xdp;
struct {
struct nlmsghdr nh;
struct ifinfomsg ifinfo;
char attrbuf[64];
} req;
struct nlmsghdr *nh;
struct nlmsgerr *err;
memset(&sa, 0, sizeof(sa));
sa.nl_family = AF_NETLINK;
sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
if (sock < 0) {
printf("open netlink socket: %s\n", strerror(errno));
return -1;
}
if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
printf("bind to netlink: %s\n", strerror(errno));
goto cleanup;
}
memset(&req, 0, sizeof(req));
req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
req.nh.nlmsg_type = RTM_SETLINK;
req.nh.nlmsg_pid = 0;
req.nh.nlmsg_seq = ++seq;
req.ifinfo.ifi_family = AF_UNSPEC;
req.ifinfo.ifi_index = ifindex;
/* started nested attribute for XDP */
nla = (struct nlattr *)(((char *)&req)
+ NLMSG_ALIGN(req.nh.nlmsg_len));
nla->nla_type = NLA_F_NESTED | 43/*IFLA_XDP*/;
nla->nla_len = NLA_HDRLEN;
/* add XDP fd */
nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
nla_xdp->nla_type = 1/*IFLA_XDP_FD*/;
nla_xdp->nla_len = NLA_HDRLEN + sizeof(int);
memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd));
nla->nla_len += nla_xdp->nla_len;
/* if user passed in any flags, add those too */
if (flags) {
nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
nla_xdp->nla_type = 3/*IFLA_XDP_FLAGS*/;
nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags);
memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags));
nla->nla_len += nla_xdp->nla_len;
}
req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len);
if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
printf("send to netlink: %s\n", strerror(errno));
goto cleanup;
}
len = recv(sock, buf, sizeof(buf), 0);
if (len < 0) {
printf("recv from netlink: %s\n", strerror(errno));
goto cleanup;
}
for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len);
nh = NLMSG_NEXT(nh, len)) {
if (nh->nlmsg_pid != getpid()) {
printf("Wrong pid %d, expected %d\n",
nh->nlmsg_pid, getpid());
goto cleanup;
}
if (nh->nlmsg_seq != seq) {
printf("Wrong seq %d, expected %d\n",
nh->nlmsg_seq, seq);
goto cleanup;
}
switch (nh->nlmsg_type) {
case NLMSG_ERROR:
err = (struct nlmsgerr *)NLMSG_DATA(nh);
if (!err->error)
continue;
printf("nlmsg error %s\n", strerror(-err->error));
goto cleanup;
case NLMSG_DONE:
break;
}
}
ret = 0;
cleanup:
close(sock);
return ret;
}