Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Alexei Starovoitov says:

====================
pull-request: bpf-next 2020-05-14

The following pull-request contains BPF updates for your *net-next* tree.

The main changes are:

1) Merged tag 'perf-for-bpf-2020-05-06' from the tip tree, which introduces CAP_PERFMON.

2) Support for narrow loads in bpf_sock_addr programs and additional
   helpers in cg-skb programs, from Andrey.

3) BPF benchmark runner, from Andrii.

4) ARM and RISC-V JIT optimizations, from Luke.

5) BPF iterator infrastructure, from Yonghong.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
Committed by David S. Miller on 2020-05-14 20:31:21 -07:00, commit d00f26b623.
139 changed files with 5253 additions and 544 deletions.
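For orientation, here is a minimal sketch of how the new iterator infrastructure is consumed from the BPF side. This example is not part of the diff; it assumes a vmlinux.h generated from a kernel with this series applied, libbpf's bpf_helpers.h, and the "iter/" section naming convention added alongside this series.

/* Hypothetical task iterator program: prints one line per task via the
 * new bpf_seq_printf() helper.  ctx->task is PTR_TO_BTF_ID_OR_NULL, so
 * it must be NULL-checked (it is NULL on the final stop() invocation).
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

SEC("iter/task")
int dump_task(struct bpf_iter__task *ctx)
{
	struct seq_file *seq = ctx->meta->seq;
	struct task_struct *task = ctx->task;
	char fmt[] = "pid=%d\n";
	__u64 pid;

	if (task == NULL)
		return 0;

	pid = task->pid;
	bpf_seq_printf(seq, fmt, sizeof(fmt), &pid, sizeof(pid));
	return 0;
}

char _license[] SEC("license") = "GPL";

Reading the fd returned by the new BPF_ITER_CREATE command (or cat on a pinned iterator) then streams one such line per task.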


@ -795,6 +795,9 @@ static inline void emit_a32_alu_i(const s8 dst, const u32 val,
case BPF_RSH:
emit(ARM_LSR_I(rd, rd, val), ctx);
break;
case BPF_ARSH:
emit(ARM_ASR_I(rd, rd, val), ctx);
break;
case BPF_NEG:
emit(ARM_RSB_I(rd, rd, val), ctx);
break;
@ -860,8 +863,8 @@ static inline void emit_a32_arsh_r64(const s8 dst[], const s8 src[],
emit(ARM_SUBS_I(tmp2[0], rt, 32), ctx);
emit(ARM_MOV_SR(ARM_LR, rd[1], SRTYPE_LSR, rt), ctx);
emit(ARM_ORR_SR(ARM_LR, ARM_LR, rd[0], SRTYPE_ASL, ARM_IP), ctx);
_emit(ARM_COND_MI, ARM_B(0), ctx);
emit(ARM_ORR_SR(ARM_LR, ARM_LR, rd[0], SRTYPE_ASR, tmp2[0]), ctx);
_emit(ARM_COND_PL,
ARM_ORR_SR(ARM_LR, ARM_LR, rd[0], SRTYPE_ASR, tmp2[0]), ctx);
emit(ARM_MOV_SR(ARM_IP, rd[0], SRTYPE_ASR, rt), ctx);
arm_bpf_put_reg32(dst_lo, ARM_LR, ctx);
@ -1408,7 +1411,6 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
case BPF_ALU | BPF_MUL | BPF_X:
case BPF_ALU | BPF_LSH | BPF_X:
case BPF_ALU | BPF_RSH | BPF_X:
case BPF_ALU | BPF_ARSH | BPF_K:
case BPF_ALU | BPF_ARSH | BPF_X:
case BPF_ALU64 | BPF_ADD | BPF_K:
case BPF_ALU64 | BPF_ADD | BPF_X:
@ -1465,10 +1467,12 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
case BPF_ALU64 | BPF_MOD | BPF_K:
case BPF_ALU64 | BPF_MOD | BPF_X:
goto notyet;
/* dst = dst >> imm */
/* dst = dst << imm */
case BPF_ALU | BPF_RSH | BPF_K:
/* dst = dst >> imm */
/* dst = dst >> imm (signed) */
case BPF_ALU | BPF_LSH | BPF_K:
case BPF_ALU | BPF_RSH | BPF_K:
case BPF_ALU | BPF_ARSH | BPF_K:
if (unlikely(imm > 31))
return -EINVAL;
if (imm)
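For reference, the 32-bit BPF_ARSH-by-immediate support wired up in these hunks is an arithmetic (sign-propagating) right shift on the low 32 bits; a minimal sketch of the semantics the emitted ARM_ASR instruction has to reproduce:

/* Sketch only: semantics of BPF_ALU | BPF_ARSH | BPF_K on a 32-bit
 * subregister.  The result is implicitly zero-extended to 64 bits,
 * like the other 32-bit ALU operations.
 */
static inline u32 alu32_arsh_k(u32 dst, u32 imm)
{
	return (u32)((s32)dst >> imm);
}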


@ -94,6 +94,9 @@
#define ARM_INST_LSR_I 0x01a00020
#define ARM_INST_LSR_R 0x01a00030
#define ARM_INST_ASR_I 0x01a00040
#define ARM_INST_ASR_R 0x01a00050
#define ARM_INST_MOV_R 0x01a00000
#define ARM_INST_MOVS_R 0x01b00000
#define ARM_INST_MOV_I 0x03a00000


@ -515,7 +515,7 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
case BPF_ALU | BPF_LSH | BPF_X:
case BPF_ALU64 | BPF_LSH | BPF_X:
emit(is64 ? rv_sll(rd, rd, rs) : rv_sllw(rd, rd, rs), ctx);
if (!is64)
if (!is64 && !aux->verifier_zext)
emit_zext_32(rd, ctx);
break;
case BPF_ALU | BPF_RSH | BPF_X:
@ -542,13 +542,21 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
/* dst = BSWAP##imm(dst) */
case BPF_ALU | BPF_END | BPF_FROM_LE:
{
int shift = 64 - imm;
emit(rv_slli(rd, rd, shift), ctx);
emit(rv_srli(rd, rd, shift), ctx);
switch (imm) {
case 16:
emit(rv_slli(rd, rd, 48), ctx);
emit(rv_srli(rd, rd, 48), ctx);
break;
case 32:
if (!aux->verifier_zext)
emit_zext_32(rd, ctx);
break;
case 64:
/* Do nothing */
break;
}
break;
}
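	/* Note (not part of this diff): on little-endian RV64, BPF_FROM_LE is
	 * simply a truncation to the requested width, so the switch above
	 * reduces to:
	 *   imm == 16: keep the low 16 bits (slli/srli by 48)
	 *   imm == 32: zero-extend the low 32 bits, skipped when the verifier
	 *              already inserted the zero extension (verifier_zext)
	 *   imm == 64: no-op
	 * This replaces the old generic shift pair, which emitted redundant
	 * instructions for the 32- and 64-bit cases.
	 */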
case BPF_ALU | BPF_END | BPF_FROM_BE:
emit(rv_addi(RV_REG_T2, RV_REG_ZERO, 0), ctx);
@ -692,19 +700,19 @@ out_be:
case BPF_ALU | BPF_LSH | BPF_K:
case BPF_ALU64 | BPF_LSH | BPF_K:
emit(is64 ? rv_slli(rd, rd, imm) : rv_slliw(rd, rd, imm), ctx);
if (!is64)
if (!is64 && !aux->verifier_zext)
emit_zext_32(rd, ctx);
break;
case BPF_ALU | BPF_RSH | BPF_K:
case BPF_ALU64 | BPF_RSH | BPF_K:
emit(is64 ? rv_srli(rd, rd, imm) : rv_srliw(rd, rd, imm), ctx);
if (!is64)
if (!is64 && !aux->verifier_zext)
emit_zext_32(rd, ctx);
break;
case BPF_ALU | BPF_ARSH | BPF_K:
case BPF_ALU64 | BPF_ARSH | BPF_K:
emit(is64 ? rv_srai(rd, rd, imm) : rv_sraiw(rd, rd, imm), ctx);
if (!is64)
if (!is64 && !aux->verifier_zext)
emit_zext_32(rd, ctx);
break;
@ -784,11 +792,15 @@ out_be:
case BPF_JMP32 | BPF_JSGE | BPF_K:
case BPF_JMP | BPF_JSLE | BPF_K:
case BPF_JMP32 | BPF_JSLE | BPF_K:
case BPF_JMP | BPF_JSET | BPF_K:
case BPF_JMP32 | BPF_JSET | BPF_K:
rvoff = rv_offset(i, off, ctx);
s = ctx->ninsns;
emit_imm(RV_REG_T1, imm, ctx);
if (imm) {
emit_imm(RV_REG_T1, imm, ctx);
rs = RV_REG_T1;
} else {
/* If imm is 0, simply use zero register. */
rs = RV_REG_ZERO;
}
if (!is64) {
if (is_signed_bpf_cond(BPF_OP(code)))
emit_sext_32_rd(&rd, ctx);
@ -799,16 +811,28 @@ out_be:
/* Adjust for extra insns */
rvoff -= (e - s) << 2;
emit_branch(BPF_OP(code), rd, rs, rvoff, ctx);
break;
if (BPF_OP(code) == BPF_JSET) {
/* Adjust for and */
rvoff -= 4;
emit(rv_and(RV_REG_T1, rd, RV_REG_T1), ctx);
emit_branch(BPF_JNE, RV_REG_T1, RV_REG_ZERO, rvoff,
ctx);
case BPF_JMP | BPF_JSET | BPF_K:
case BPF_JMP32 | BPF_JSET | BPF_K:
rvoff = rv_offset(i, off, ctx);
s = ctx->ninsns;
if (is_12b_int(imm)) {
emit(rv_andi(RV_REG_T1, rd, imm), ctx);
} else {
emit_branch(BPF_OP(code), rd, RV_REG_T1, rvoff, ctx);
emit_imm(RV_REG_T1, imm, ctx);
emit(rv_and(RV_REG_T1, rd, RV_REG_T1), ctx);
}
/* For jset32, we should clear the upper 32 bits of t1, but
* sign-extension is sufficient here and saves one instruction,
* as t1 is used only in comparison against zero.
*/
if (!is64 && imm < 0)
emit(rv_addiw(RV_REG_T1, RV_REG_T1, 0), ctx);
e = ctx->ninsns;
rvoff -= (e - s) << 2;
emit_branch(BPF_JNE, RV_REG_T1, RV_REG_ZERO, rvoff, ctx);
break;
/* function call */


@ -1475,8 +1475,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
for (i = 0; i < insn_cnt; i++, insn++) {
const s32 imm32 = insn->imm;
const bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
const bool dstk = insn->dst_reg == BPF_REG_AX ? false : true;
const bool sstk = insn->src_reg == BPF_REG_AX ? false : true;
const bool dstk = insn->dst_reg != BPF_REG_AX;
const bool sstk = insn->src_reg != BPF_REG_AX;
const u8 code = insn->code;
const u8 *dst = bpf2ia32[insn->dst_reg];
const u8 *src = bpf2ia32[insn->src_reg];


@ -98,6 +98,25 @@ static const struct proc_ops proc_net_seq_ops = {
.proc_release = seq_release_net,
};
int bpf_iter_init_seq_net(void *priv_data)
{
#ifdef CONFIG_NET_NS
struct seq_net_private *p = priv_data;
p->net = get_net(current->nsproxy->net_ns);
#endif
return 0;
}
void bpf_iter_fini_seq_net(void *priv_data)
{
#ifdef CONFIG_NET_NS
struct seq_net_private *p = priv_data;
put_net(p->net);
#endif
}
struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode,
struct proc_dir_entry *parent, const struct seq_operations *ops,
unsigned int state_size, void *data)
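The two helpers above let network iterator targets capture the opening task's network namespace. A hypothetical registration for a netns-aware target would wire them in roughly as follows; this is a sketch only, the seq_ops and the private struct are assumed, and the private struct must start with struct seq_net_private so the helpers can locate the netns pointer.

/* Hypothetical netns-aware iterator registration.  ipv6_route_seq_ops
 * and struct ipv6_route_iter are assumed to exist; what matters is that
 * the private data begins with struct seq_net_private so that
 * bpf_iter_init_seq_net()/bpf_iter_fini_seq_net() can reach the netns.
 */
static const struct bpf_iter_reg ipv6_route_reg_info = {
	.target			= "ipv6_route",
	.seq_ops		= &ipv6_route_seq_ops,
	.init_seq_private	= bpf_iter_init_seq_net,
	.fini_seq_private	= bpf_iter_fini_seq_net,
	.seq_priv_size		= sizeof(struct ipv6_route_iter),
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__ipv6_route, rt),
		  PTR_TO_BTF_ID_OR_NULL },
	},
};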


@ -31,6 +31,7 @@ struct seq_file;
struct btf;
struct btf_type;
struct exception_table_entry;
struct seq_operations;
extern struct idr btf_idr;
extern spinlock_t btf_idr_lock;
@ -319,6 +320,7 @@ enum bpf_reg_type {
PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */
PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */
PTR_TO_BTF_ID, /* reg points to kernel struct */
PTR_TO_BTF_ID_OR_NULL, /* reg points to kernel struct or NULL */
};
/* The information passed from prog-specific *_is_valid_access
@ -641,6 +643,12 @@ struct bpf_jit_poke_descriptor {
u16 reason;
};
/* reg_type info for ctx arguments */
struct bpf_ctx_arg_aux {
u32 offset;
enum bpf_reg_type reg_type;
};
struct bpf_prog_aux {
atomic64_t refcnt;
u32 used_map_cnt;
@ -652,6 +660,8 @@ struct bpf_prog_aux {
u32 func_cnt; /* used by non-func prog as the number of func progs */
u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */
u32 attach_btf_id; /* in-kernel BTF type id to attach to */
u32 ctx_arg_info_size;
const struct bpf_ctx_arg_aux *ctx_arg_info;
struct bpf_prog *linked_prog;
bool verifier_zext; /* Zero extensions has been inserted by verifier. */
bool offload_requested;
@ -1021,6 +1031,7 @@ static inline void bpf_enable_instrumentation(void)
extern const struct file_operations bpf_map_fops;
extern const struct file_operations bpf_prog_fops;
extern const struct file_operations bpf_iter_fops;
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
extern const struct bpf_prog_ops _name ## _prog_ops; \
@ -1080,6 +1091,7 @@ int generic_map_update_batch(struct bpf_map *map,
int generic_map_delete_batch(struct bpf_map *map,
const union bpf_attr *attr,
union bpf_attr __user *uattr);
struct bpf_map *bpf_map_get_curr_or_next(u32 *id);
extern int sysctl_unprivileged_bpf_disabled;
@ -1126,6 +1138,40 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd);
int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
int bpf_obj_get_user(const char __user *pathname, int flags);
#define BPF_ITER_FUNC_PREFIX "bpf_iter_"
#define DEFINE_BPF_ITER_FUNC(target, args...) \
extern int bpf_iter_ ## target(args); \
int __init bpf_iter_ ## target(args) { return 0; }
typedef int (*bpf_iter_init_seq_priv_t)(void *private_data);
typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data);
#define BPF_ITER_CTX_ARG_MAX 2
struct bpf_iter_reg {
const char *target;
const struct seq_operations *seq_ops;
bpf_iter_init_seq_priv_t init_seq_private;
bpf_iter_fini_seq_priv_t fini_seq_private;
u32 seq_priv_size;
u32 ctx_arg_info_size;
struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX];
};
struct bpf_iter_meta {
__bpf_md_ptr(struct seq_file *, seq);
u64 session_id;
u64 seq_num;
};
int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info);
void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info);
bool bpf_iter_prog_supported(struct bpf_prog *prog);
int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
int bpf_iter_new_fd(struct bpf_link *link);
bool bpf_link_is_iter(struct bpf_link *link);
struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop);
int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx);
int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,


@ -124,3 +124,4 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing)
#ifdef CONFIG_CGROUP_BPF
BPF_LINK_TYPE(BPF_LINK_TYPE_CGROUP, cgroup)
#endif
BPF_LINK_TYPE(BPF_LINK_TYPE_ITER, iter)


@ -251,6 +251,10 @@ extern bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct
extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap);
extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap);
extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns);
static inline bool perfmon_capable(void)
{
return capable(CAP_PERFMON) || capable(CAP_SYS_ADMIN);
}
/* audit system wants to get cap info from files as well */
extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps);
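Call sites that previously demanded CAP_SYS_ADMIN for performance-monitoring operations can now be relaxed to the new helper; an illustrative conversion (not from this diff):

/* Illustrative only: gate a perf-style privileged operation on the new
 * capability.  CAP_SYS_ADMIN still passes, since perfmon_capable()
 * falls back to it.
 */
static int check_perf_access(void)
{
	if (!perfmon_capable())
		return -EPERM;
	return 0;
}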


@ -545,10 +545,8 @@ struct bpf_prog {
unsigned int (*bpf_func)(const void *ctx,
const struct bpf_insn *insn);
/* Instructions for interpreter */
union {
struct sock_filter insns[0];
struct bpf_insn insnsi[0];
};
struct sock_filter insns[0];
struct bpf_insn insnsi[];
};
struct sk_filter {


@ -105,6 +105,9 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo
void *data);
extern struct pid *tgid_pidfd_to_pid(const struct file *file);
extern int bpf_iter_init_seq_net(void *priv_data);
extern void bpf_iter_fini_seq_net(void *priv_data);
#ifdef CONFIG_PROC_PID_ARCH_STATUS
/*
* The architecture which selects CONFIG_PROC_PID_ARCH_STATUS must


@ -35,8 +35,14 @@ int inet_shutdown(struct socket *sock, int how);
int inet_listen(struct socket *sock, int backlog);
void inet_sock_destruct(struct sock *sk);
int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
/* Don't allocate port at this moment, defer to connect. */
#define BIND_FORCE_ADDRESS_NO_PORT (1 << 0)
/* Grab and release socket lock. */
#define BIND_WITH_LOCK (1 << 1)
/* Called from BPF program. */
#define BIND_FROM_BPF (1 << 2)
int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
bool force_bind_address_no_port, bool with_lock);
u32 flags);
int inet_getname(struct socket *sock, struct sockaddr *uaddr,
int peer);
int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);


@ -544,6 +544,13 @@ static inline bool fib6_metric_locked(struct fib6_info *f6i, int metric)
return !!(f6i->fib6_metrics->metrics[RTAX_LOCK - 1] & (1 << metric));
}
#if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL)
struct bpf_iter__ipv6_route {
__bpf_md_ptr(struct bpf_iter_meta *, meta);
__bpf_md_ptr(struct fib6_info *, rt);
};
#endif
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
static inline bool fib6_has_custom_rules(const struct net *net)
{


@ -63,7 +63,7 @@ extern const struct ipv6_stub *ipv6_stub __read_mostly;
/* A stub used by bpf helpers. Similarly ugly as ipv6_stub */
struct ipv6_bpf_stub {
int (*inet6_bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len,
bool force_bind_address_no_port, bool with_lock);
u32 flags);
struct sock *(*udp6_lib_lookup)(struct net *net,
const struct in6_addr *saddr, __be16 sport,
const struct in6_addr *daddr, __be16 dport,


@ -50,7 +50,6 @@ struct xdp_umem {
u32 headroom;
u32 chunk_size_nohr;
struct user_struct *user;
unsigned long address;
refcount_t users;
struct work_struct work;
struct page **pgs;
@ -62,8 +61,8 @@ struct xdp_umem {
struct net_device *dev;
struct xdp_umem_fq_reuse *fq_reuse;
bool zc;
spinlock_t xsk_list_lock;
struct list_head xsk_list;
spinlock_t xsk_tx_list_lock;
struct list_head xsk_tx_list;
};
/* Nodes are linked in the struct xdp_sock map_list field, and used to


@ -116,6 +116,7 @@ enum bpf_cmd {
BPF_LINK_GET_FD_BY_ID,
BPF_LINK_GET_NEXT_ID,
BPF_ENABLE_STATS,
BPF_ITER_CREATE,
};
enum bpf_map_type {
@ -218,6 +219,7 @@ enum bpf_attach_type {
BPF_TRACE_FEXIT,
BPF_MODIFY_RETURN,
BPF_LSM_MAC,
BPF_TRACE_ITER,
__MAX_BPF_ATTACH_TYPE
};
@ -228,6 +230,7 @@ enum bpf_link_type {
BPF_LINK_TYPE_RAW_TRACEPOINT = 1,
BPF_LINK_TYPE_TRACING = 2,
BPF_LINK_TYPE_CGROUP = 3,
BPF_LINK_TYPE_ITER = 4,
MAX_BPF_LINK_TYPE,
};
@ -612,6 +615,11 @@ union bpf_attr {
__u32 type;
} enable_stats;
struct { /* struct used by BPF_ITER_CREATE command */
__u32 link_fd;
__u32 flags;
} iter_create;
} __attribute__((aligned(8)));
/* The description below is an attempt at providing documentation to eBPF
@ -667,8 +675,8 @@ union bpf_attr {
* For tracing programs, safely attempt to read *size* bytes from
* kernel space address *unsafe_ptr* and store the data in *dst*.
*
* Generally, use bpf_probe_read_user() or bpf_probe_read_kernel()
* instead.
* Generally, use **bpf_probe_read_user**\ () or
* **bpf_probe_read_kernel**\ () instead.
* Return
* 0 on success, or a negative error in case of failure.
*
@ -676,7 +684,7 @@ union bpf_attr {
* Description
* Return the time elapsed since system boot, in nanoseconds.
* Does not include time the system was suspended.
* See: clock_gettime(CLOCK_MONOTONIC)
* See: **clock_gettime**\ (**CLOCK_MONOTONIC**)
* Return
* Current *ktime*.
*
@ -1535,11 +1543,11 @@ union bpf_attr {
* int bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr)
* Description
* Copy a NUL terminated string from an unsafe kernel address
* *unsafe_ptr* to *dst*. See bpf_probe_read_kernel_str() for
* *unsafe_ptr* to *dst*. See **bpf_probe_read_kernel_str**\ () for
* more details.
*
* Generally, use bpf_probe_read_user_str() or bpf_probe_read_kernel_str()
* instead.
* Generally, use **bpf_probe_read_user_str**\ () or
* **bpf_probe_read_kernel_str**\ () instead.
* Return
* On success, the strictly positive length of the string,
* including the trailing NUL character. On error, a negative
@ -1567,7 +1575,7 @@ union bpf_attr {
*
* u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx)
* Description
* Equivalent to bpf_get_socket_cookie() helper that accepts
* Equivalent to **bpf_get_socket_cookie**\ () helper that accepts
* *skb*, but gets socket from **struct bpf_sock_ops** context.
* Return
* A 8-byte long non-decreasing number.
@ -1596,6 +1604,7 @@ union bpf_attr {
* The option value of length *optlen* is pointed by *optval*.
*
* *bpf_socket* should be one of the following:
*
* * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
* * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
* and **BPF_CGROUP_INET6_CONNECT**.
@ -1664,12 +1673,12 @@ union bpf_attr {
*
* The lower two bits of *flags* are used as the return code if
* the map lookup fails. This is so that the return value can be
* one of the XDP program return codes up to XDP_TX, as chosen by
* the caller. Any higher bits in the *flags* argument must be
* one of the XDP program return codes up to **XDP_TX**, as chosen
* by the caller. Any higher bits in the *flags* argument must be
* unset.
*
* See also bpf_redirect(), which only supports redirecting to an
* ifindex, but doesn't require a map to do so.
* See also **bpf_redirect**\ (), which only supports redirecting
* to an ifindex, but doesn't require a map to do so.
* Return
* **XDP_REDIRECT** on success, or the value of the two lower bits
* of the *flags* argument on error.
@ -1777,7 +1786,7 @@ union bpf_attr {
* the time running for event since last normalization. The
* enabled and running times are accumulated since the perf event
* open. To achieve scaling factor between two invocations of an
* eBPF program, users can can use CPU id as the key (which is
* eBPF program, users can use CPU id as the key (which is
* typical for perf array usage model) to remember the previous
* value and do the calculation inside the eBPF program.
* Return
@ -1804,6 +1813,7 @@ union bpf_attr {
* *opval* and of length *optlen*.
*
* *bpf_socket* should be one of the following:
*
* * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
* * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
* and **BPF_CGROUP_INET6_CONNECT**.
@ -1825,7 +1835,7 @@ union bpf_attr {
* The first argument is the context *regs* on which the kprobe
* works.
*
* This helper works by setting setting the PC (program counter)
* This helper works by setting the PC (program counter)
* to an override function which is run in place of the original
* probed function. This means the probed function is not run at
* all. The replacement function just returns with the required
@ -1994,10 +2004,11 @@ union bpf_attr {
*
* This helper works for IPv4 and IPv6, TCP and UDP sockets. The
* domain (*addr*\ **->sa_family**) must be **AF_INET** (or
* **AF_INET6**). Looking for a free port to bind to can be
* expensive, therefore binding to port is not permitted by the
* helper: *addr*\ **->sin_port** (or **sin6_port**, respectively)
* must be set to zero.
* **AF_INET6**). It's advised to pass zero port (**sin_port**
* or **sin6_port**) which triggers IP_BIND_ADDRESS_NO_PORT-like
* behavior and lets the kernel efficiently pick up an unused
* port as long as 4-tuple is unique. Passing non-zero port might
* lead to degraded performance.
* Return
* 0 on success, or a negative error in case of failure.
*
@ -2291,7 +2302,7 @@ union bpf_attr {
* **bpf_rc_keydown**\ () again with the same values, or calling
* **bpf_rc_repeat**\ ().
*
* Some protocols include a toggle bit, in case the button was
* Some protocols include a toggle bit, in case the button was
* released and pressed again between consecutive scancodes.
*
* The *ctx* should point to the lirc sample as passed into
@ -2637,7 +2648,6 @@ union bpf_attr {
*
* *th* points to the start of the TCP header, while *th_len*
* contains **sizeof**\ (**struct tcphdr**).
*
* Return
* 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative
* error otherwise.
@ -2820,7 +2830,6 @@ union bpf_attr {
*
* *th* points to the start of the TCP header, while *th_len*
* contains the length of the TCP header.
*
* Return
* On success, lower 32 bits hold the generated SYN cookie in
* followed by 16 bits which hold the MSS value for that cookie,
@ -2903,7 +2912,7 @@ union bpf_attr {
* // size, after checking its boundaries.
* }
*
* In comparison, using **bpf_probe_read_user()** helper here
* In comparison, using **bpf_probe_read_user**\ () helper here
* instead to read the string would require to estimate the length
* at compile time, and would often result in copying more memory
* than necessary.
@ -2921,14 +2930,14 @@ union bpf_attr {
* int bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr)
* Description
* Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr*
* to *dst*. Same semantics as with bpf_probe_read_user_str() apply.
* to *dst*. Same semantics as with **bpf_probe_read_user_str**\ () apply.
* Return
* On success, the strictly positive length of the string, including
* On success, the strictly positive length of the string, including
* the trailing NUL character. On error, a negative value.
*
* int bpf_tcp_send_ack(void *tp, u32 rcv_nxt)
* Description
* Send out a tcp-ack. *tp* is the in-kernel struct tcp_sock.
* Send out a tcp-ack. *tp* is the in-kernel struct **tcp_sock**.
* *rcv_nxt* is the ack_seq to be sent out.
* Return
* 0 on success, or a negative error in case of failure.
@ -2956,19 +2965,19 @@ union bpf_attr {
* int bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags)
* Description
* For an eBPF program attached to a perf event, retrieve the
* branch records (struct perf_branch_entry) associated to *ctx*
* and store it in the buffer pointed by *buf* up to size
* branch records (**struct perf_branch_entry**) associated to *ctx*
* and store it in the buffer pointed by *buf* up to size
* *size* bytes.
* Return
* On success, number of bytes written to *buf*. On error, a
* negative value.
*
* The *flags* can be set to **BPF_F_GET_BRANCH_RECORDS_SIZE** to
* instead return the number of bytes required to store all the
* instead return the number of bytes required to store all the
* branch entries. If this flag is set, *buf* may be NULL.
*
* **-EINVAL** if arguments invalid or **size** not a multiple
* of sizeof(struct perf_branch_entry).
* of **sizeof**\ (**struct perf_branch_entry**\ ).
*
* **-ENOENT** if architecture does not support branch records.
*
@ -2976,8 +2985,8 @@ union bpf_attr {
* Description
* Returns 0 on success, values for *pid* and *tgid* as seen from the current
* *namespace* will be returned in *nsdata*.
*
* On failure, the returned value is one of the following:
* Return
* 0 on success, or one of the following in case of failure:
*
* **-EINVAL** if dev and inum supplied don't match dev_t and inode number
* with nsfs of current task, or if dev conversion to dev_t lost high bits.
@ -3016,8 +3025,8 @@ union bpf_attr {
* a global identifier that can be assumed unique. If *ctx* is
* NULL, then the helper returns the cookie for the initial
* network namespace. The cookie itself is very similar to that
* of bpf_get_socket_cookie() helper, but for network namespaces
* instead of sockets.
* of **bpf_get_socket_cookie**\ () helper, but for network
* namespaces instead of sockets.
* Return
* A 8-byte long opaque number.
*
@ -3052,22 +3061,98 @@ union bpf_attr {
*
* The *flags* argument must be zero.
* Return
* 0 on success, or a negative errno in case of failure.
* 0 on success, or a negative error in case of failure:
*
* * **-EINVAL** Unsupported flags specified.
* * **-ENOENT** Socket is unavailable for assignment.
* * **-ENETUNREACH** Socket is unreachable (wrong netns).
* * **-EOPNOTSUPP** Unsupported operation, for example a
* call from outside of TC ingress.
* * **-ESOCKTNOSUPPORT** Socket type not supported (reuseport).
* **-EINVAL** if specified *flags* are not supported.
*
* **-ENOENT** if the socket is unavailable for assignment.
*
* **-ENETUNREACH** if the socket is unreachable (wrong netns).
*
* **-EOPNOTSUPP** if the operation is not supported, for example
* a call from outside of TC ingress.
*
* **-ESOCKTNOSUPPORT** if the socket type is not supported
* (reuseport).
*
* u64 bpf_ktime_get_boot_ns(void)
* Description
* Return the time elapsed since system boot, in nanoseconds.
* Does include the time the system was suspended.
* See: clock_gettime(CLOCK_BOOTTIME)
* See: **clock_gettime**\ (**CLOCK_BOOTTIME**)
* Return
* Current *ktime*.
*
* int bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len)
* Description
* **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print
* out the format string.
* The *m* represents the seq_file. The *fmt* and *fmt_size* are for
* the format string itself. The *data* and *data_len* are format string
* arguments. The *data* are a **u64** array and corresponding format string
* values are stored in the array. For strings and pointers where pointees
* are accessed, only the pointer values are stored in the *data* array.
* The *data_len* is the size of *data* in bytes.
*
* Formats **%s**, **%p{i,I}{4,6}** requires to read kernel memory.
* Reading kernel memory may fail due to either invalid address or
* valid address but requiring a major memory fault. If reading kernel memory
* fails, the string for **%s** will be an empty string, and the ip
* address for **%p{i,I}{4,6}** will be 0. Not returning error to
* bpf program is consistent with what **bpf_trace_printk**\ () does for now.
* Return
* 0 on success, or a negative error in case of failure:
*
* **-EBUSY** if per-CPU memory copy buffer is busy, can try again
* by returning 1 from bpf program.
*
* **-EINVAL** if arguments are invalid, or if *fmt* is invalid/unsupported.
*
* **-E2BIG** if *fmt* contains too many format specifiers.
*
* **-EOVERFLOW** if an overflow happened: The same object will be tried again.
*
* int bpf_seq_write(struct seq_file *m, const void *data, u32 len)
* Description
* **bpf_seq_write**\ () uses seq_file **seq_write**\ () to write the data.
* The *m* represents the seq_file. The *data* and *len* represent the
* data to write in bytes.
* Return
* 0 on success, or a negative error in case of failure:
*
* **-EOVERFLOW** if an overflow happened: The same object will be tried again.
*
* u64 bpf_sk_cgroup_id(struct bpf_sock *sk)
* Description
* Return the cgroup v2 id of the socket *sk*.
*
* *sk* must be a non-**NULL** pointer to a full socket, e.g. one
* returned from **bpf_sk_lookup_xxx**\ (),
* **bpf_sk_fullsock**\ (), etc. The format of returned id is
* same as in **bpf_skb_cgroup_id**\ ().
*
* This helper is available only if the kernel was compiled with
* the **CONFIG_SOCK_CGROUP_DATA** configuration option.
* Return
* The id is returned or 0 in case the id could not be retrieved.
*
* u64 bpf_sk_ancestor_cgroup_id(struct bpf_sock *sk, int ancestor_level)
* Description
* Return id of cgroup v2 that is ancestor of cgroup associated
* with the *sk* at the *ancestor_level*. The root cgroup is at
* *ancestor_level* zero and each step down the hierarchy
* increments the level. If *ancestor_level* == level of cgroup
* associated with *sk*, then return value will be same as that
* of **bpf_sk_cgroup_id**\ ().
*
* The helper is useful to implement policies based on cgroups
* that are upper in hierarchy than immediate cgroup associated
* with *sk*.
*
* The format of returned id and helper limitations are same as in
* **bpf_sk_cgroup_id**\ ().
* Return
* The id is returned or 0 in case the id could not be retrieved.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@ -3195,7 +3280,11 @@ union bpf_attr {
FN(get_netns_cookie), \
FN(get_current_ancestor_cgroup_id), \
FN(sk_assign), \
FN(ktime_get_boot_ns),
FN(ktime_get_boot_ns), \
FN(seq_printf), \
FN(seq_write), \
FN(sk_cgroup_id), \
FN(sk_ancestor_cgroup_id),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
@ -3673,7 +3762,7 @@ struct bpf_sock_addr {
__u32 user_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write.
* Stored in network byte order.
*/
__u32 user_port; /* Allows 4-byte read and write.
__u32 user_port; /* Allows 1,2,4-byte read and 4-byte write.
* Stored in network byte order
*/
__u32 family; /* Allows 4-byte read, but no write */
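Allowing 1- and 2-byte reads of user_port matters because clang may compile a 16-bit truncation of the field into a narrow context load, which the verifier used to reject. A hedged sketch of a program that now verifies, using the usual libbpf headers and section naming:

/* Hypothetical BPF_PROG_TYPE_CGROUP_SOCK_ADDR program.  The (__u16)
 * truncation of user_port may be emitted as a 2-byte context load,
 * which this series teaches the verifier to accept; the value remains
 * in network byte order.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("cgroup/connect4")
int connect_v4(struct bpf_sock_addr *ctx)
{
	if ((__u16)ctx->user_port == bpf_htons(25))
		return 0;	/* reject SMTP connects */
	return 1;		/* allow everything else */
}

char _license[] SEC("license") = "GPL";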


@ -367,8 +367,14 @@ struct vfs_ns_cap_data {
#define CAP_AUDIT_READ 37
/*
* Allow system performance and observability privileged operations
* using perf_events, i915_perf and other kernel subsystems
*/
#define CAP_LAST_CAP CAP_AUDIT_READ
#define CAP_PERFMON 38
#define CAP_LAST_CAP CAP_PERFMON
#define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP)


@ -2,7 +2,7 @@
obj-y := core.o
CFLAGS_core.o += $(call cc-disable-warning, override-init)
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o

kernel/bpf/bpf_iter.c (new file, 539 lines)

@ -0,0 +1,539 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2020 Facebook */
#include <linux/fs.h>
#include <linux/anon_inodes.h>
#include <linux/filter.h>
#include <linux/bpf.h>
struct bpf_iter_target_info {
struct list_head list;
const struct bpf_iter_reg *reg_info;
u32 btf_id; /* cached value */
};
struct bpf_iter_link {
struct bpf_link link;
struct bpf_iter_target_info *tinfo;
};
struct bpf_iter_priv_data {
struct bpf_iter_target_info *tinfo;
struct bpf_prog *prog;
u64 session_id;
u64 seq_num;
bool done_stop;
u8 target_private[] __aligned(8);
};
static struct list_head targets = LIST_HEAD_INIT(targets);
static DEFINE_MUTEX(targets_mutex);
/* protect bpf_iter_link changes */
static DEFINE_MUTEX(link_mutex);
/* incremented on every opened seq_file */
static atomic64_t session_id;
static int prepare_seq_file(struct file *file, struct bpf_iter_link *link);
static void bpf_iter_inc_seq_num(struct seq_file *seq)
{
struct bpf_iter_priv_data *iter_priv;
iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
target_private);
iter_priv->seq_num++;
}
static void bpf_iter_dec_seq_num(struct seq_file *seq)
{
struct bpf_iter_priv_data *iter_priv;
iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
target_private);
iter_priv->seq_num--;
}
static void bpf_iter_done_stop(struct seq_file *seq)
{
struct bpf_iter_priv_data *iter_priv;
iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
target_private);
iter_priv->done_stop = true;
}
/* bpf_seq_read, a customized and simpler version for bpf iterator.
* no_llseek is assumed for this file.
* The following are differences from seq_read():
* . fixed buffer size (PAGE_SIZE)
* . assuming no_llseek
* . stop() may call bpf program, handling potential overflow there
*/
static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
loff_t *ppos)
{
struct seq_file *seq = file->private_data;
size_t n, offs, copied = 0;
int err = 0;
void *p;
mutex_lock(&seq->lock);
if (!seq->buf) {
seq->size = PAGE_SIZE;
seq->buf = kmalloc(seq->size, GFP_KERNEL);
if (!seq->buf) {
err = -ENOMEM;
goto done;
}
}
if (seq->count) {
n = min(seq->count, size);
err = copy_to_user(buf, seq->buf + seq->from, n);
if (err) {
err = -EFAULT;
goto done;
}
seq->count -= n;
seq->from += n;
copied = n;
goto done;
}
seq->from = 0;
p = seq->op->start(seq, &seq->index);
if (!p)
goto stop;
if (IS_ERR(p)) {
err = PTR_ERR(p);
seq->op->stop(seq, p);
seq->count = 0;
goto done;
}
err = seq->op->show(seq, p);
if (err > 0) {
/* object is skipped, decrease seq_num, so next
* valid object can reuse the same seq_num.
*/
bpf_iter_dec_seq_num(seq);
seq->count = 0;
} else if (err < 0 || seq_has_overflowed(seq)) {
if (!err)
err = -E2BIG;
seq->op->stop(seq, p);
seq->count = 0;
goto done;
}
while (1) {
loff_t pos = seq->index;
offs = seq->count;
p = seq->op->next(seq, p, &seq->index);
if (pos == seq->index) {
pr_info_ratelimited("buggy seq_file .next function %ps "
"did not updated position index\n",
seq->op->next);
seq->index++;
}
if (IS_ERR_OR_NULL(p))
break;
/* got a valid next object, increase seq_num */
bpf_iter_inc_seq_num(seq);
if (seq->count >= size)
break;
err = seq->op->show(seq, p);
if (err > 0) {
bpf_iter_dec_seq_num(seq);
seq->count = offs;
} else if (err < 0 || seq_has_overflowed(seq)) {
seq->count = offs;
if (offs == 0) {
if (!err)
err = -E2BIG;
seq->op->stop(seq, p);
goto done;
}
break;
}
}
stop:
offs = seq->count;
/* bpf program called if !p */
seq->op->stop(seq, p);
if (!p) {
if (!seq_has_overflowed(seq)) {
bpf_iter_done_stop(seq);
} else {
seq->count = offs;
if (offs == 0) {
err = -E2BIG;
goto done;
}
}
}
n = min(seq->count, size);
err = copy_to_user(buf, seq->buf, n);
if (err) {
err = -EFAULT;
goto done;
}
copied = n;
seq->count -= n;
seq->from = n;
done:
if (!copied)
copied = err;
else
*ppos += copied;
mutex_unlock(&seq->lock);
return copied;
}
static int iter_open(struct inode *inode, struct file *file)
{
struct bpf_iter_link *link = inode->i_private;
return prepare_seq_file(file, link);
}
static int iter_release(struct inode *inode, struct file *file)
{
struct bpf_iter_priv_data *iter_priv;
struct seq_file *seq;
seq = file->private_data;
if (!seq)
return 0;
iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
target_private);
if (iter_priv->tinfo->reg_info->fini_seq_private)
iter_priv->tinfo->reg_info->fini_seq_private(seq->private);
bpf_prog_put(iter_priv->prog);
seq->private = iter_priv;
return seq_release_private(inode, file);
}
const struct file_operations bpf_iter_fops = {
.open = iter_open,
.llseek = no_llseek,
.read = bpf_seq_read,
.release = iter_release,
};
/* The argument reg_info will be cached in bpf_iter_target_info.
* The common practice is to declare target reg_info as
* a const static variable and passed as an argument to
* bpf_iter_reg_target().
*/
int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info)
{
struct bpf_iter_target_info *tinfo;
tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
if (!tinfo)
return -ENOMEM;
tinfo->reg_info = reg_info;
INIT_LIST_HEAD(&tinfo->list);
mutex_lock(&targets_mutex);
list_add(&tinfo->list, &targets);
mutex_unlock(&targets_mutex);
return 0;
}
void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info)
{
struct bpf_iter_target_info *tinfo;
bool found = false;
mutex_lock(&targets_mutex);
list_for_each_entry(tinfo, &targets, list) {
if (reg_info == tinfo->reg_info) {
list_del(&tinfo->list);
kfree(tinfo);
found = true;
break;
}
}
mutex_unlock(&targets_mutex);
WARN_ON(found == false);
}
static void cache_btf_id(struct bpf_iter_target_info *tinfo,
struct bpf_prog *prog)
{
tinfo->btf_id = prog->aux->attach_btf_id;
}
bool bpf_iter_prog_supported(struct bpf_prog *prog)
{
const char *attach_fname = prog->aux->attach_func_name;
u32 prog_btf_id = prog->aux->attach_btf_id;
const char *prefix = BPF_ITER_FUNC_PREFIX;
struct bpf_iter_target_info *tinfo;
int prefix_len = strlen(prefix);
bool supported = false;
if (strncmp(attach_fname, prefix, prefix_len))
return false;
mutex_lock(&targets_mutex);
list_for_each_entry(tinfo, &targets, list) {
if (tinfo->btf_id && tinfo->btf_id == prog_btf_id) {
supported = true;
break;
}
if (!strcmp(attach_fname + prefix_len, tinfo->reg_info->target)) {
cache_btf_id(tinfo, prog);
supported = true;
break;
}
}
mutex_unlock(&targets_mutex);
if (supported) {
prog->aux->ctx_arg_info_size = tinfo->reg_info->ctx_arg_info_size;
prog->aux->ctx_arg_info = tinfo->reg_info->ctx_arg_info;
}
return supported;
}
static void bpf_iter_link_release(struct bpf_link *link)
{
}
static void bpf_iter_link_dealloc(struct bpf_link *link)
{
struct bpf_iter_link *iter_link =
container_of(link, struct bpf_iter_link, link);
kfree(iter_link);
}
static int bpf_iter_link_replace(struct bpf_link *link,
struct bpf_prog *new_prog,
struct bpf_prog *old_prog)
{
int ret = 0;
mutex_lock(&link_mutex);
if (old_prog && link->prog != old_prog) {
ret = -EPERM;
goto out_unlock;
}
if (link->prog->type != new_prog->type ||
link->prog->expected_attach_type != new_prog->expected_attach_type ||
link->prog->aux->attach_btf_id != new_prog->aux->attach_btf_id) {
ret = -EINVAL;
goto out_unlock;
}
old_prog = xchg(&link->prog, new_prog);
bpf_prog_put(old_prog);
out_unlock:
mutex_unlock(&link_mutex);
return ret;
}
static const struct bpf_link_ops bpf_iter_link_lops = {
.release = bpf_iter_link_release,
.dealloc = bpf_iter_link_dealloc,
.update_prog = bpf_iter_link_replace,
};
bool bpf_link_is_iter(struct bpf_link *link)
{
return link->ops == &bpf_iter_link_lops;
}
int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
struct bpf_link_primer link_primer;
struct bpf_iter_target_info *tinfo;
struct bpf_iter_link *link;
bool existed = false;
u32 prog_btf_id;
int err;
if (attr->link_create.target_fd || attr->link_create.flags)
return -EINVAL;
prog_btf_id = prog->aux->attach_btf_id;
mutex_lock(&targets_mutex);
list_for_each_entry(tinfo, &targets, list) {
if (tinfo->btf_id == prog_btf_id) {
existed = true;
break;
}
}
mutex_unlock(&targets_mutex);
if (!existed)
return -ENOENT;
link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN);
if (!link)
return -ENOMEM;
bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog);
link->tinfo = tinfo;
err = bpf_link_prime(&link->link, &link_primer);
if (err) {
kfree(link);
return err;
}
return bpf_link_settle(&link_primer);
}
static void init_seq_meta(struct bpf_iter_priv_data *priv_data,
struct bpf_iter_target_info *tinfo,
struct bpf_prog *prog)
{
priv_data->tinfo = tinfo;
priv_data->prog = prog;
priv_data->session_id = atomic64_inc_return(&session_id);
priv_data->seq_num = 0;
priv_data->done_stop = false;
}
static int prepare_seq_file(struct file *file, struct bpf_iter_link *link)
{
struct bpf_iter_priv_data *priv_data;
struct bpf_iter_target_info *tinfo;
struct bpf_prog *prog;
u32 total_priv_dsize;
struct seq_file *seq;
int err = 0;
mutex_lock(&link_mutex);
prog = link->link.prog;
bpf_prog_inc(prog);
mutex_unlock(&link_mutex);
tinfo = link->tinfo;
total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) +
tinfo->reg_info->seq_priv_size;
priv_data = __seq_open_private(file, tinfo->reg_info->seq_ops,
total_priv_dsize);
if (!priv_data) {
err = -ENOMEM;
goto release_prog;
}
if (tinfo->reg_info->init_seq_private) {
err = tinfo->reg_info->init_seq_private(priv_data->target_private);
if (err)
goto release_seq_file;
}
init_seq_meta(priv_data, tinfo, prog);
seq = file->private_data;
seq->private = priv_data->target_private;
return 0;
release_seq_file:
seq_release_private(file->f_inode, file);
file->private_data = NULL;
release_prog:
bpf_prog_put(prog);
return err;
}
int bpf_iter_new_fd(struct bpf_link *link)
{
struct file *file;
unsigned int flags;
int err, fd;
if (link->ops != &bpf_iter_link_lops)
return -EINVAL;
flags = O_RDONLY | O_CLOEXEC;
fd = get_unused_fd_flags(flags);
if (fd < 0)
return fd;
file = anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags);
if (IS_ERR(file)) {
err = PTR_ERR(file);
goto free_fd;
}
err = prepare_seq_file(file,
container_of(link, struct bpf_iter_link, link));
if (err)
goto free_file;
fd_install(fd, file);
return fd;
free_file:
fput(file);
free_fd:
put_unused_fd(fd);
return err;
}
struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop)
{
struct bpf_iter_priv_data *iter_priv;
struct seq_file *seq;
void *seq_priv;
seq = meta->seq;
if (seq->file->f_op != &bpf_iter_fops)
return NULL;
seq_priv = seq->private;
iter_priv = container_of(seq_priv, struct bpf_iter_priv_data,
target_private);
if (in_stop && iter_priv->done_stop)
return NULL;
meta->session_id = iter_priv->session_id;
meta->seq_num = iter_priv->seq_num;
return iter_priv->prog;
}
int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
{
int ret;
rcu_read_lock();
migrate_disable();
ret = BPF_PROG_RUN(prog, ctx);
migrate_enable();
rcu_read_unlock();
/* bpf program can only return 0 or 1:
* 0 : okay
* 1 : retry the same object
* The bpf_iter_run_prog() return value
* will be seq_ops->show() return value.
*/
return ret == 0 ? 0 : -EAGAIN;
}


@ -3694,7 +3694,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
struct bpf_verifier_log *log = info->log;
const struct btf_param *args;
u32 nr_args, arg;
int ret;
int i, ret;
if (off % 8) {
bpf_log(log, "func '%s' offset %d is not multiple of 8\n",
@ -3791,6 +3791,14 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
/* this is a pointer to another type */
info->reg_type = PTR_TO_BTF_ID;
for (i = 0; i < prog->aux->ctx_arg_info_size; i++) {
const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i];
if (ctx_arg_info->offset == off) {
info->reg_type = ctx_arg_info->reg_type;
break;
}
}
if (tgt_prog) {
ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg);
@ -3830,6 +3838,7 @@ int btf_struct_access(struct bpf_verifier_log *log,
const struct btf_type *mtype, *elem_type = NULL;
const struct btf_member *member;
const char *tname, *mname;
u32 vlen;
again:
tname = __btf_name_by_offset(btf_vmlinux, t->name_off);
@ -3838,7 +3847,43 @@ again:
return -EINVAL;
}
vlen = btf_type_vlen(t);
if (off + size > t->size) {
/* If the last element is a variable size array, we may
* need to relax the rule.
*/
struct btf_array *array_elem;
if (vlen == 0)
goto error;
member = btf_type_member(t) + vlen - 1;
mtype = btf_type_skip_modifiers(btf_vmlinux, member->type,
NULL);
if (!btf_type_is_array(mtype))
goto error;
array_elem = (struct btf_array *)(mtype + 1);
if (array_elem->nelems != 0)
goto error;
moff = btf_member_bit_offset(t, member) / 8;
if (off < moff)
goto error;
/* Only allow structure for now, can be relaxed for
* other types later.
*/
elem_type = btf_type_skip_modifiers(btf_vmlinux,
array_elem->type, NULL);
if (!btf_type_is_struct(elem_type))
goto error;
off = (off - moff) % elem_type->size;
return btf_struct_access(log, elem_type, off, size, atype,
next_btf_id);
error:
bpf_log(log, "access beyond struct %s at off %u size %u\n",
tname, off, size);
return -EACCES;


@ -358,8 +358,11 @@ static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg)
static int bpf_mklink(struct dentry *dentry, umode_t mode, void *arg)
{
struct bpf_link *link = arg;
return bpf_mkobj_ops(dentry, mode, arg, &bpf_link_iops,
&bpffs_obj_fops);
bpf_link_is_iter(link) ?
&bpf_iter_fops : &bpffs_obj_fops);
}
static struct dentry *

kernel/bpf/map_iter.c (new file, 102 lines)

@ -0,0 +1,102 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2020 Facebook */
#include <linux/bpf.h>
#include <linux/fs.h>
#include <linux/filter.h>
#include <linux/kernel.h>
struct bpf_iter_seq_map_info {
u32 mid;
};
static void *bpf_map_seq_start(struct seq_file *seq, loff_t *pos)
{
struct bpf_iter_seq_map_info *info = seq->private;
struct bpf_map *map;
map = bpf_map_get_curr_or_next(&info->mid);
if (!map)
return NULL;
++*pos;
return map;
}
static void *bpf_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct bpf_iter_seq_map_info *info = seq->private;
struct bpf_map *map;
++*pos;
++info->mid;
bpf_map_put((struct bpf_map *)v);
map = bpf_map_get_curr_or_next(&info->mid);
if (!map)
return NULL;
return map;
}
struct bpf_iter__bpf_map {
__bpf_md_ptr(struct bpf_iter_meta *, meta);
__bpf_md_ptr(struct bpf_map *, map);
};
DEFINE_BPF_ITER_FUNC(bpf_map, struct bpf_iter_meta *meta, struct bpf_map *map)
static int __bpf_map_seq_show(struct seq_file *seq, void *v, bool in_stop)
{
struct bpf_iter__bpf_map ctx;
struct bpf_iter_meta meta;
struct bpf_prog *prog;
int ret = 0;
ctx.meta = &meta;
ctx.map = v;
meta.seq = seq;
prog = bpf_iter_get_info(&meta, in_stop);
if (prog)
ret = bpf_iter_run_prog(prog, &ctx);
return ret;
}
static int bpf_map_seq_show(struct seq_file *seq, void *v)
{
return __bpf_map_seq_show(seq, v, false);
}
static void bpf_map_seq_stop(struct seq_file *seq, void *v)
{
if (!v)
(void)__bpf_map_seq_show(seq, v, true);
else
bpf_map_put((struct bpf_map *)v);
}
static const struct seq_operations bpf_map_seq_ops = {
.start = bpf_map_seq_start,
.next = bpf_map_seq_next,
.stop = bpf_map_seq_stop,
.show = bpf_map_seq_show,
};
static const struct bpf_iter_reg bpf_map_reg_info = {
.target = "bpf_map",
.seq_ops = &bpf_map_seq_ops,
.init_seq_private = NULL,
.fini_seq_private = NULL,
.seq_priv_size = sizeof(struct bpf_iter_seq_map_info),
.ctx_arg_info_size = 1,
.ctx_arg_info = {
{ offsetof(struct bpf_iter__bpf_map, map),
PTR_TO_BTF_ID_OR_NULL },
},
};
static int __init bpf_map_iter_init(void)
{
return bpf_iter_reg_target(&bpf_map_reg_info);
}
late_initcall(bpf_map_iter_init);


@ -19,7 +19,7 @@ struct bpf_queue_stack {
u32 head, tail;
u32 size; /* max_entries + 1 */
char elements[0] __aligned(8);
char elements[] __aligned(8);
};
static struct bpf_queue_stack *bpf_queue_stack(struct bpf_map *map)


@ -2729,6 +2729,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
case BPF_CGROUP_GETSOCKOPT:
case BPF_CGROUP_SETSOCKOPT:
return BPF_PROG_TYPE_CGROUP_SOCKOPT;
case BPF_TRACE_ITER:
return BPF_PROG_TYPE_TRACING;
default:
return BPF_PROG_TYPE_UNSPEC;
}
@ -2932,6 +2934,25 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr,
return err;
}
struct bpf_map *bpf_map_get_curr_or_next(u32 *id)
{
struct bpf_map *map;
spin_lock_bh(&map_idr_lock);
again:
map = idr_get_next(&map_idr, id);
if (map) {
map = __bpf_map_inc_not_zero(map, false);
if (IS_ERR(map)) {
(*id)++;
goto again;
}
}
spin_unlock_bh(&map_idr_lock);
return map;
}
#define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
struct bpf_prog *bpf_prog_by_id(u32 id)
@ -3729,6 +3750,15 @@ err_put:
return err;
}
static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
if (attr->link_create.attach_type == BPF_TRACE_ITER &&
prog->expected_attach_type == BPF_TRACE_ITER)
return bpf_iter_link_attach(attr, prog);
return -EINVAL;
}
#define BPF_LINK_CREATE_LAST_FIELD link_create.flags
static int link_create(union bpf_attr *attr)
{
@ -3765,6 +3795,9 @@ static int link_create(union bpf_attr *attr)
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
ret = cgroup_bpf_link_attach(attr, prog);
break;
case BPF_PROG_TYPE_TRACING:
ret = tracing_bpf_link_attach(attr, prog);
break;
default:
ret = -EINVAL;
}
@ -3927,6 +3960,29 @@ static int bpf_enable_stats(union bpf_attr *attr)
return -EINVAL;
}
#define BPF_ITER_CREATE_LAST_FIELD iter_create.flags
static int bpf_iter_create(union bpf_attr *attr)
{
struct bpf_link *link;
int err;
if (CHECK_ATTR(BPF_ITER_CREATE))
return -EINVAL;
if (attr->iter_create.flags)
return -EINVAL;
link = bpf_link_get_from_fd(attr->iter_create.link_fd);
if (IS_ERR(link))
return PTR_ERR(link);
err = bpf_iter_new_fd(link);
bpf_link_put(link);
return err;
}
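From userspace the flow mirrors the plumbing above: BPF_LINK_CREATE attaches a BPF_TRACE_ITER program to its target, and the new BPF_ITER_CREATE turns that link into a file descriptor whose read() drives the seq_file machinery. A hedged sketch using raw bpf(2) calls, assuming uapi headers from a kernel with this series:

/* Hypothetical userspace sequence (error handling omitted): turn an
 * already-loaded BPF_TRACE_ITER program into a readable iterator fd.
 */
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
{
	return syscall(__NR_bpf, cmd, attr, size);
}

int make_iter_fd(int prog_fd)
{
	union bpf_attr attr;
	int link_fd;

	memset(&attr, 0, sizeof(attr));
	attr.link_create.prog_fd = prog_fd;
	attr.link_create.attach_type = BPF_TRACE_ITER;
	link_fd = sys_bpf(BPF_LINK_CREATE, &attr, sizeof(attr));

	memset(&attr, 0, sizeof(attr));
	attr.iter_create.link_fd = link_fd;
	return sys_bpf(BPF_ITER_CREATE, &attr, sizeof(attr));
	/* read(2) on the returned fd runs the iterator program */
}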
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
union bpf_attr attr;
@ -4054,6 +4110,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_ENABLE_STATS:
err = bpf_enable_stats(&attr);
break;
case BPF_ITER_CREATE:
err = bpf_iter_create(&attr);
break;
default:
err = -EINVAL;
break;

kernel/bpf/task_iter.c (new file, 353 lines)

@ -0,0 +1,353 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2020 Facebook */
#include <linux/init.h>
#include <linux/namei.h>
#include <linux/pid_namespace.h>
#include <linux/fs.h>
#include <linux/fdtable.h>
#include <linux/filter.h>
struct bpf_iter_seq_task_common {
struct pid_namespace *ns;
};
struct bpf_iter_seq_task_info {
/* The first field must be struct bpf_iter_seq_task_common.
* this is assumed by {init, fini}_seq_pidns() callback functions.
*/
struct bpf_iter_seq_task_common common;
u32 tid;
};
static struct task_struct *task_seq_get_next(struct pid_namespace *ns,
u32 *tid)
{
struct task_struct *task = NULL;
struct pid *pid;
rcu_read_lock();
retry:
pid = idr_get_next(&ns->idr, tid);
if (pid) {
task = get_pid_task(pid, PIDTYPE_PID);
if (!task) {
++*tid;
goto retry;
}
}
rcu_read_unlock();
return task;
}
static void *task_seq_start(struct seq_file *seq, loff_t *pos)
{
struct bpf_iter_seq_task_info *info = seq->private;
struct task_struct *task;
task = task_seq_get_next(info->common.ns, &info->tid);
if (!task)
return NULL;
++*pos;
return task;
}
static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct bpf_iter_seq_task_info *info = seq->private;
struct task_struct *task;
++*pos;
++info->tid;
put_task_struct((struct task_struct *)v);
task = task_seq_get_next(info->common.ns, &info->tid);
if (!task)
return NULL;
return task;
}
struct bpf_iter__task {
__bpf_md_ptr(struct bpf_iter_meta *, meta);
__bpf_md_ptr(struct task_struct *, task);
};
DEFINE_BPF_ITER_FUNC(task, struct bpf_iter_meta *meta, struct task_struct *task)
static int __task_seq_show(struct seq_file *seq, struct task_struct *task,
bool in_stop)
{
struct bpf_iter_meta meta;
struct bpf_iter__task ctx;
struct bpf_prog *prog;
meta.seq = seq;
prog = bpf_iter_get_info(&meta, in_stop);
if (!prog)
return 0;
meta.seq = seq;
ctx.meta = &meta;
ctx.task = task;
return bpf_iter_run_prog(prog, &ctx);
}
static int task_seq_show(struct seq_file *seq, void *v)
{
return __task_seq_show(seq, v, false);
}
static void task_seq_stop(struct seq_file *seq, void *v)
{
if (!v)
(void)__task_seq_show(seq, v, true);
else
put_task_struct((struct task_struct *)v);
}
static const struct seq_operations task_seq_ops = {
.start = task_seq_start,
.next = task_seq_next,
.stop = task_seq_stop,
.show = task_seq_show,
};
struct bpf_iter_seq_task_file_info {
/* The first field must be struct bpf_iter_seq_task_common.
* this is assumed by {init, fini}_seq_pidns() callback functions.
*/
struct bpf_iter_seq_task_common common;
struct task_struct *task;
struct files_struct *files;
u32 tid;
u32 fd;
};
static struct file *
task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info,
struct task_struct **task, struct files_struct **fstruct)
{
struct pid_namespace *ns = info->common.ns;
u32 curr_tid = info->tid, max_fds;
struct files_struct *curr_files;
struct task_struct *curr_task;
int curr_fd = info->fd;
/* If this function returns a non-NULL file object,
* it held a reference to the task/files_struct/file.
* Otherwise, it does not hold any reference.
*/
again:
if (*task) {
curr_task = *task;
curr_files = *fstruct;
curr_fd = info->fd;
} else {
curr_task = task_seq_get_next(ns, &curr_tid);
if (!curr_task)
return NULL;
curr_files = get_files_struct(curr_task);
if (!curr_files) {
put_task_struct(curr_task);
curr_tid = ++(info->tid);
info->fd = 0;
goto again;
}
/* set *fstruct, *task and info->tid */
*fstruct = curr_files;
*task = curr_task;
if (curr_tid == info->tid) {
curr_fd = info->fd;
} else {
info->tid = curr_tid;
curr_fd = 0;
}
}
rcu_read_lock();
max_fds = files_fdtable(curr_files)->max_fds;
for (; curr_fd < max_fds; curr_fd++) {
struct file *f;
f = fcheck_files(curr_files, curr_fd);
if (!f)
continue;
/* set info->fd */
info->fd = curr_fd;
get_file(f);
rcu_read_unlock();
return f;
}
/* the current task is done, go to the next task */
rcu_read_unlock();
put_files_struct(curr_files);
put_task_struct(curr_task);
*task = NULL;
*fstruct = NULL;
info->fd = 0;
curr_tid = ++(info->tid);
goto again;
}
static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
{
struct bpf_iter_seq_task_file_info *info = seq->private;
struct files_struct *files = NULL;
struct task_struct *task = NULL;
struct file *file;
file = task_file_seq_get_next(info, &task, &files);
if (!file) {
info->files = NULL;
info->task = NULL;
return NULL;
}
++*pos;
info->task = task;
info->files = files;
return file;
}
static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct bpf_iter_seq_task_file_info *info = seq->private;
struct files_struct *files = info->files;
struct task_struct *task = info->task;
struct file *file;
++*pos;
++info->fd;
fput((struct file *)v);
file = task_file_seq_get_next(info, &task, &files);
if (!file) {
info->files = NULL;
info->task = NULL;
return NULL;
}
info->task = task;
info->files = files;
return file;
}
struct bpf_iter__task_file {
__bpf_md_ptr(struct bpf_iter_meta *, meta);
__bpf_md_ptr(struct task_struct *, task);
u32 fd __aligned(8);
__bpf_md_ptr(struct file *, file);
};
DEFINE_BPF_ITER_FUNC(task_file, struct bpf_iter_meta *meta,
struct task_struct *task, u32 fd,
struct file *file)
static int __task_file_seq_show(struct seq_file *seq, struct file *file,
bool in_stop)
{
struct bpf_iter_seq_task_file_info *info = seq->private;
struct bpf_iter__task_file ctx;
struct bpf_iter_meta meta;
struct bpf_prog *prog;
meta.seq = seq;
prog = bpf_iter_get_info(&meta, in_stop);
if (!prog)
return 0;
ctx.meta = &meta;
ctx.task = info->task;
ctx.fd = info->fd;
ctx.file = file;
return bpf_iter_run_prog(prog, &ctx);
}
static int task_file_seq_show(struct seq_file *seq, void *v)
{
return __task_file_seq_show(seq, v, false);
}
static void task_file_seq_stop(struct seq_file *seq, void *v)
{
struct bpf_iter_seq_task_file_info *info = seq->private;
if (!v) {
(void)__task_file_seq_show(seq, v, true);
} else {
fput((struct file *)v);
put_files_struct(info->files);
put_task_struct(info->task);
info->files = NULL;
info->task = NULL;
}
}
static int init_seq_pidns(void *priv_data)
{
struct bpf_iter_seq_task_common *common = priv_data;
common->ns = get_pid_ns(task_active_pid_ns(current));
return 0;
}
static void fini_seq_pidns(void *priv_data)
{
struct bpf_iter_seq_task_common *common = priv_data;
put_pid_ns(common->ns);
}
static const struct seq_operations task_file_seq_ops = {
.start = task_file_seq_start,
.next = task_file_seq_next,
.stop = task_file_seq_stop,
.show = task_file_seq_show,
};
static const struct bpf_iter_reg task_reg_info = {
.target = "task",
.seq_ops = &task_seq_ops,
.init_seq_private = init_seq_pidns,
.fini_seq_private = fini_seq_pidns,
.seq_priv_size = sizeof(struct bpf_iter_seq_task_info),
.ctx_arg_info_size = 1,
.ctx_arg_info = {
{ offsetof(struct bpf_iter__task, task),
PTR_TO_BTF_ID_OR_NULL },
},
};
static const struct bpf_iter_reg task_file_reg_info = {
.target = "task_file",
.seq_ops = &task_file_seq_ops,
.init_seq_private = init_seq_pidns,
.fini_seq_private = fini_seq_pidns,
.seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info),
.ctx_arg_info_size = 2,
.ctx_arg_info = {
{ offsetof(struct bpf_iter__task_file, task),
PTR_TO_BTF_ID_OR_NULL },
{ offsetof(struct bpf_iter__task_file, file),
PTR_TO_BTF_ID_OR_NULL },
},
};
static int __init task_iter_init(void)
{
int ret;
ret = bpf_iter_reg_target(&task_reg_info);
if (ret)
return ret;
return bpf_iter_reg_target(&task_file_reg_info);
}
late_initcall(task_iter_init);


@ -398,7 +398,8 @@ static bool reg_type_may_be_null(enum bpf_reg_type type)
return type == PTR_TO_MAP_VALUE_OR_NULL ||
type == PTR_TO_SOCKET_OR_NULL ||
type == PTR_TO_SOCK_COMMON_OR_NULL ||
type == PTR_TO_TCP_SOCK_OR_NULL;
type == PTR_TO_TCP_SOCK_OR_NULL ||
type == PTR_TO_BTF_ID_OR_NULL;
}
static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
@ -483,6 +484,7 @@ static const char * const reg_type_str[] = {
[PTR_TO_TP_BUFFER] = "tp_buffer",
[PTR_TO_XDP_SOCK] = "xdp_sock",
[PTR_TO_BTF_ID] = "ptr_",
[PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_",
};
static char slot_type_char[] = {
@ -543,7 +545,7 @@ static void print_verifier_state(struct bpf_verifier_env *env,
/* reg->off should be 0 for SCALAR_VALUE */
verbose(env, "%lld", reg->var_off.value + reg->off);
} else {
if (t == PTR_TO_BTF_ID)
if (t == PTR_TO_BTF_ID || t == PTR_TO_BTF_ID_OR_NULL)
verbose(env, "%s", kernel_type_name(reg->btf_id));
verbose(env, "(id=%d", reg->id);
if (reg_type_may_be_refcounted_or_null(t))
@ -2139,6 +2141,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
case PTR_TO_TCP_SOCK_OR_NULL:
case PTR_TO_XDP_SOCK:
case PTR_TO_BTF_ID:
case PTR_TO_BTF_ID_OR_NULL:
return true;
default:
return false;
@ -2659,7 +2662,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
*/
*reg_type = info.reg_type;
if (*reg_type == PTR_TO_BTF_ID)
if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL)
*btf_id = info.btf_id;
else
env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
@ -3243,7 +3246,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
* a sub-register.
*/
regs[value_regno].subreg_def = DEF_NOT_SUBREG;
if (reg_type == PTR_TO_BTF_ID)
if (reg_type == PTR_TO_BTF_ID ||
reg_type == PTR_TO_BTF_ID_OR_NULL)
regs[value_regno].btf_id = btf_id;
}
regs[value_regno].type = reg_type;
@ -3490,6 +3494,11 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
*stype = STACK_MISC;
goto mark;
}
if (state->stack[spi].slot_type[0] == STACK_SPILL &&
state->stack[spi].spilled_ptr.type == PTR_TO_BTF_ID)
goto mark;
if (state->stack[spi].slot_type[0] == STACK_SPILL &&
state->stack[spi].spilled_ptr.type == SCALAR_VALUE) {
__mark_reg_unknown(env, &state->stack[spi].spilled_ptr);
@ -6572,6 +6581,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
reg->type = PTR_TO_SOCK_COMMON;
} else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) {
reg->type = PTR_TO_TCP_SOCK;
} else if (reg->type == PTR_TO_BTF_ID_OR_NULL) {
reg->type = PTR_TO_BTF_ID;
}
if (is_null) {
/* We don't need id and ref_obj_id from this point
@ -7101,6 +7112,10 @@ static int check_return_code(struct bpf_verifier_env *env)
return 0;
range = tnum_const(0);
break;
case BPF_PROG_TYPE_TRACING:
if (env->prog->expected_attach_type != BPF_TRACE_ITER)
return 0;
break;
default:
return 0;
}
@ -8425,6 +8440,7 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type)
case PTR_TO_TCP_SOCK_OR_NULL:
case PTR_TO_XDP_SOCK:
case PTR_TO_BTF_ID:
case PTR_TO_BTF_ID_OR_NULL:
return false;
default:
return true;
@ -10481,6 +10497,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
struct bpf_prog *tgt_prog = prog->aux->linked_prog;
u32 btf_id = prog->aux->attach_btf_id;
const char prefix[] = "btf_trace_";
struct btf_func_model fmodel;
int ret = 0, subprog = -1, i;
struct bpf_trampoline *tr;
const struct btf_type *t;
@ -10622,6 +10639,22 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
prog->aux->attach_func_proto = t;
prog->aux->attach_btf_trace = true;
return 0;
case BPF_TRACE_ITER:
if (!btf_type_is_func(t)) {
verbose(env, "attach_btf_id %u is not a function\n",
btf_id);
return -EINVAL;
}
t = btf_type_by_id(btf, t->type);
if (!btf_type_is_func_proto(t))
return -EINVAL;
prog->aux->attach_func_name = tname;
prog->aux->attach_func_proto = t;
if (!bpf_iter_prog_supported(prog))
return -EINVAL;
ret = btf_distill_func_proto(&env->log, btf, t,
tname, &fmodel);
return ret;
default:
if (!prog_extension)
return -EINVAL;

View File

@ -201,7 +201,7 @@ static int max_extfrag_threshold = 1000;
#endif /* CONFIG_SYSCTL */
#ifdef CONFIG_BPF_SYSCALL
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL)
static int bpf_stats_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)

View File

@ -457,6 +457,212 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
return &bpf_trace_printk_proto;
}
#define MAX_SEQ_PRINTF_VARARGS 12
#define MAX_SEQ_PRINTF_MAX_MEMCPY 6
#define MAX_SEQ_PRINTF_STR_LEN 128
struct bpf_seq_printf_buf {
char buf[MAX_SEQ_PRINTF_MAX_MEMCPY][MAX_SEQ_PRINTF_STR_LEN];
};
static DEFINE_PER_CPU(struct bpf_seq_printf_buf, bpf_seq_printf_buf);
static DEFINE_PER_CPU(int, bpf_seq_printf_buf_used);
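/* bpf_seq_printf_buf_used guards the per-CPU scratch buffers above: if
 * bpf_seq_printf() is re-entered on the same CPU before a previous call has
 * finished, this_cpu_inc_return() sees a value > 1 and the helper bails out
 * with -EBUSY instead of corrupting the buffers.
 */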
BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size,
const void *, data, u32, data_len)
{
int err = -EINVAL, fmt_cnt = 0, memcpy_cnt = 0;
int i, buf_used, copy_size, num_args;
u64 params[MAX_SEQ_PRINTF_VARARGS];
struct bpf_seq_printf_buf *bufs;
const u64 *args = data;
buf_used = this_cpu_inc_return(bpf_seq_printf_buf_used);
if (WARN_ON_ONCE(buf_used > 1)) {
err = -EBUSY;
goto out;
}
bufs = this_cpu_ptr(&bpf_seq_printf_buf);
/*
* bpf_check()->check_func_arg()->check_stack_boundary()
* guarantees that fmt points to bpf program stack,
* fmt_size bytes of it were initialized and fmt_size > 0
*/
if (fmt[--fmt_size] != 0)
goto out;
if (data_len & 7)
goto out;
for (i = 0; i < fmt_size; i++) {
if (fmt[i] == '%') {
if (fmt[i + 1] == '%')
i++;
else if (!data || !data_len)
goto out;
}
}
num_args = data_len / 8;
/* check format string for allowed specifiers */
for (i = 0; i < fmt_size; i++) {
/* only printable ascii for now. */
if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) {
err = -EINVAL;
goto out;
}
if (fmt[i] != '%')
continue;
if (fmt[i + 1] == '%') {
i++;
continue;
}
if (fmt_cnt >= MAX_SEQ_PRINTF_VARARGS) {
err = -E2BIG;
goto out;
}
if (fmt_cnt >= num_args) {
err = -EINVAL;
goto out;
}
/* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
i++;
/* skip optional "[0 +-][num]" width formatting field */
while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' ||
fmt[i] == ' ')
i++;
if (fmt[i] >= '1' && fmt[i] <= '9') {
i++;
while (fmt[i] >= '0' && fmt[i] <= '9')
i++;
}
if (fmt[i] == 's') {
/* try our best to copy */
if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) {
err = -E2BIG;
goto out;
}
err = strncpy_from_unsafe(bufs->buf[memcpy_cnt],
(void *) (long) args[fmt_cnt],
MAX_SEQ_PRINTF_STR_LEN);
if (err < 0)
bufs->buf[memcpy_cnt][0] = '\0';
params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt];
fmt_cnt++;
memcpy_cnt++;
continue;
}
if (fmt[i] == 'p') {
if (fmt[i + 1] == 0 ||
fmt[i + 1] == 'K' ||
fmt[i + 1] == 'x') {
/* just kernel pointers */
params[fmt_cnt] = args[fmt_cnt];
fmt_cnt++;
continue;
}
/* only support "%pI4", "%pi4", "%pI6" and "%pi6". */
if (fmt[i + 1] != 'i' && fmt[i + 1] != 'I') {
err = -EINVAL;
goto out;
}
if (fmt[i + 2] != '4' && fmt[i + 2] != '6') {
err = -EINVAL;
goto out;
}
if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) {
err = -E2BIG;
goto out;
}
copy_size = (fmt[i + 2] == '4') ? 4 : 16;
err = probe_kernel_read(bufs->buf[memcpy_cnt],
(void *) (long) args[fmt_cnt],
copy_size);
if (err < 0)
memset(bufs->buf[memcpy_cnt], 0, copy_size);
params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt];
i += 2;
fmt_cnt++;
memcpy_cnt++;
continue;
}
if (fmt[i] == 'l') {
i++;
if (fmt[i] == 'l')
i++;
}
if (fmt[i] != 'i' && fmt[i] != 'd' &&
fmt[i] != 'u' && fmt[i] != 'x') {
err = -EINVAL;
goto out;
}
params[fmt_cnt] = args[fmt_cnt];
fmt_cnt++;
}
/* At most MAX_SEQ_PRINTF_VARARGS parameters can be passed; just give
* all of them to seq_printf().
*/
seq_printf(m, fmt, params[0], params[1], params[2], params[3],
params[4], params[5], params[6], params[7], params[8],
params[9], params[10], params[11]);
err = seq_has_overflowed(m) ? -EOVERFLOW : 0;
out:
this_cpu_dec(bpf_seq_printf_buf_used);
return err;
}
static int bpf_seq_printf_btf_ids[5];
static const struct bpf_func_proto bpf_seq_printf_proto = {
.func = bpf_seq_printf,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
.arg2_type = ARG_PTR_TO_MEM,
.arg3_type = ARG_CONST_SIZE,
.arg4_type = ARG_PTR_TO_MEM_OR_NULL,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
.btf_id = bpf_seq_printf_btf_ids,
};
BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len)
{
return seq_write(m, data, len) ? -EOVERFLOW : 0;
}
static int bpf_seq_write_btf_ids[5];
static const struct bpf_func_proto bpf_seq_write_proto = {
.func = bpf_seq_write,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
.arg2_type = ARG_PTR_TO_MEM,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
.btf_id = bpf_seq_write_btf_ids,
};
static __always_inline int
get_map_perf_counter(struct bpf_map *map, u64 flags,
u64 *value, u64 *enabled, u64 *running)
@ -1226,6 +1432,14 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_xdp_output:
return &bpf_xdp_output_proto;
#endif
case BPF_FUNC_seq_printf:
return prog->expected_attach_type == BPF_TRACE_ITER ?
&bpf_seq_printf_proto :
NULL;
case BPF_FUNC_seq_write:
return prog->expected_attach_type == BPF_TRACE_ITER ?
&bpf_seq_write_proto :
NULL;
default:
return raw_tp_prog_func_proto(func_id, prog);
}

View File

@ -4003,16 +4003,22 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
};
#ifdef CONFIG_SOCK_CGROUP_DATA
static inline u64 __bpf_sk_cgroup_id(struct sock *sk)
{
struct cgroup *cgrp;
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
return cgroup_id(cgrp);
}
BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb)
{
struct sock *sk = skb_to_full_sk(skb);
struct cgroup *cgrp;
if (!sk || !sk_fullsock(sk))
return 0;
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
return cgroup_id(cgrp);
return __bpf_sk_cgroup_id(sk);
}
static const struct bpf_func_proto bpf_skb_cgroup_id_proto = {
@ -4022,16 +4028,12 @@ static const struct bpf_func_proto bpf_skb_cgroup_id_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
ancestor_level)
static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk,
int ancestor_level)
{
struct sock *sk = skb_to_full_sk(skb);
struct cgroup *ancestor;
struct cgroup *cgrp;
if (!sk || !sk_fullsock(sk))
return 0;
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
ancestor = cgroup_ancestor(cgrp, ancestor_level);
if (!ancestor)
@ -4040,6 +4042,17 @@ BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
return cgroup_id(ancestor);
}
BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
ancestor_level)
{
struct sock *sk = skb_to_full_sk(skb);
if (!sk || !sk_fullsock(sk))
return 0;
return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level);
}
static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = {
.func = bpf_skb_ancestor_cgroup_id,
.gpl_only = false,
@ -4047,6 +4060,31 @@ static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = {
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
};
BPF_CALL_1(bpf_sk_cgroup_id, struct sock *, sk)
{
return __bpf_sk_cgroup_id(sk);
}
static const struct bpf_func_proto bpf_sk_cgroup_id_proto = {
.func = bpf_sk_cgroup_id,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_SOCKET,
};
BPF_CALL_2(bpf_sk_ancestor_cgroup_id, struct sock *, sk, int, ancestor_level)
{
return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level);
}
static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = {
.func = bpf_sk_ancestor_cgroup_id,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_SOCKET,
.arg2_type = ARG_ANYTHING,
};
#endif
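As a rough illustration of how a cg_skb program can use these cgroup id helpers (a hedged sketch, not from this series; ALLOWED_CGROUP_ID and the ancestor level are placeholders, and the usual bpf_helpers.h declarations are assumed):
#define ALLOWED_CGROUP_ID 1234	/* placeholder, fixed at build time */

SEC("cgroup_skb/egress")
int egress_cgroup_filter(struct __sk_buff *skb)
{
	/* 1 = allow the packet, 0 = drop it */
	return bpf_skb_ancestor_cgroup_id(skb, 2) == ALLOWED_CGROUP_ID ? 1 : 0;
}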
static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
@ -4525,30 +4563,28 @@ BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr,
{
#ifdef CONFIG_INET
struct sock *sk = ctx->sk;
u32 flags = BIND_FROM_BPF;
int err;
/* Binding to port can be expensive so it's prohibited in the helper.
* Only binding to IP is supported.
*/
err = -EINVAL;
if (addr_len < offsetofend(struct sockaddr, sa_family))
return err;
if (addr->sa_family == AF_INET) {
if (addr_len < sizeof(struct sockaddr_in))
return err;
if (((struct sockaddr_in *)addr)->sin_port != htons(0))
return err;
return __inet_bind(sk, addr, addr_len, true, false);
if (((struct sockaddr_in *)addr)->sin_port == htons(0))
flags |= BIND_FORCE_ADDRESS_NO_PORT;
return __inet_bind(sk, addr, addr_len, flags);
#if IS_ENABLED(CONFIG_IPV6)
} else if (addr->sa_family == AF_INET6) {
if (addr_len < SIN6_LEN_RFC2133)
return err;
if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0))
return err;
if (((struct sockaddr_in6 *)addr)->sin6_port == htons(0))
flags |= BIND_FORCE_ADDRESS_NO_PORT;
/* ipv6_bpf_stub cannot be NULL, since it's called from
* bpf_cgroup_inet6_connect hook and ipv6 is already loaded
*/
return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, true, false);
return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, flags);
#endif /* CONFIG_IPV6 */
}
#endif /* CONFIG_INET */
@ -6159,8 +6195,22 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
#ifdef CONFIG_SOCK_CGROUP_DATA
case BPF_FUNC_skb_cgroup_id:
return &bpf_skb_cgroup_id_proto;
case BPF_FUNC_skb_ancestor_cgroup_id:
return &bpf_skb_ancestor_cgroup_id_proto;
case BPF_FUNC_sk_cgroup_id:
return &bpf_sk_cgroup_id_proto;
case BPF_FUNC_sk_ancestor_cgroup_id:
return &bpf_sk_ancestor_cgroup_id_proto;
#endif
#ifdef CONFIG_INET
case BPF_FUNC_sk_lookup_tcp:
return &bpf_sk_lookup_tcp_proto;
case BPF_FUNC_sk_lookup_udp:
return &bpf_sk_lookup_udp_proto;
case BPF_FUNC_sk_release:
return &bpf_sk_release_proto;
case BPF_FUNC_skc_lookup_tcp:
return &bpf_skc_lookup_tcp_proto;
case BPF_FUNC_tcp_sock:
return &bpf_tcp_sock_proto;
case BPF_FUNC_get_listener_sock:
@ -7031,6 +7081,7 @@ static bool sock_addr_is_valid_access(int off, int size,
case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
msg_src_ip6[3]):
case bpf_ctx_range(struct bpf_sock_addr, user_port):
if (type == BPF_READ) {
bpf_ctx_record_field_size(info, size_default);
@ -7061,10 +7112,6 @@ static bool sock_addr_is_valid_access(int off, int size,
return false;
}
break;
case bpf_ctx_range(struct bpf_sock_addr, user_port):
if (size != size_default)
return false;
break;
case offsetof(struct bpf_sock_addr, sk):
if (type != BPF_READ)
return false;
@ -7960,8 +8007,8 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
struct bpf_insn *insn_buf,
struct bpf_prog *prog, u32 *target_size)
{
int off, port_size = sizeof_field(struct sockaddr_in6, sin6_port);
struct bpf_insn *insn = insn_buf;
int off;
switch (si->off) {
case offsetof(struct bpf_sock_addr, user_family):
@ -7996,9 +8043,11 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
offsetof(struct sockaddr_in6, sin6_port));
BUILD_BUG_ON(sizeof_field(struct sockaddr_in, sin_port) !=
sizeof_field(struct sockaddr_in6, sin6_port));
SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(struct bpf_sock_addr_kern,
struct sockaddr_in6, uaddr,
sin6_port, tmp_reg);
/* Account for sin6_port being smaller than user_port. */
port_size = min(port_size, BPF_LDST_BYTES(si));
SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr,
sin6_port, bytes_to_bpf_size(port_size), 0, tmp_reg);
break;
case offsetof(struct bpf_sock_addr, family):

View File

@ -450,12 +450,12 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
if (err)
return err;
return __inet_bind(sk, uaddr, addr_len, false, true);
return __inet_bind(sk, uaddr, addr_len, BIND_WITH_LOCK);
}
EXPORT_SYMBOL(inet_bind);
int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
bool force_bind_address_no_port, bool with_lock)
u32 flags)
{
struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
struct inet_sock *inet = inet_sk(sk);
@ -506,7 +506,7 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
* would be illegal to use them (multicast/broadcast) in
* which case the sending device address is used.
*/
if (with_lock)
if (flags & BIND_WITH_LOCK)
lock_sock(sk);
/* Check these errors (active socket, double bind). */
@ -520,16 +520,18 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
/* Make sure we are allowed to bind here. */
if (snum || !(inet->bind_address_no_port ||
force_bind_address_no_port)) {
(flags & BIND_FORCE_ADDRESS_NO_PORT))) {
if (sk->sk_prot->get_port(sk, snum)) {
inet->inet_saddr = inet->inet_rcv_saddr = 0;
err = -EADDRINUSE;
goto out_release_sock;
}
err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk);
if (err) {
inet->inet_saddr = inet->inet_rcv_saddr = 0;
goto out_release_sock;
if (!(flags & BIND_FROM_BPF)) {
err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk);
if (err) {
inet->inet_saddr = inet->inet_rcv_saddr = 0;
goto out_release_sock;
}
}
}
@ -543,7 +545,7 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
sk_dst_reset(sk);
err = 0;
out_release_sock:
if (with_lock)
if (flags & BIND_WITH_LOCK)
release_sock(sk);
out:
return err;

View File

@ -273,7 +273,7 @@ out_rcu_unlock:
}
static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
bool force_bind_address_no_port, bool with_lock)
u32 flags)
{
struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr;
struct inet_sock *inet = inet_sk(sk);
@ -297,7 +297,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
!ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
return -EACCES;
if (with_lock)
if (flags & BIND_WITH_LOCK)
lock_sock(sk);
/* Check these errors (active socket, double bind). */
@ -400,18 +400,20 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
/* Make sure we are allowed to bind here. */
if (snum || !(inet->bind_address_no_port ||
force_bind_address_no_port)) {
(flags & BIND_FORCE_ADDRESS_NO_PORT))) {
if (sk->sk_prot->get_port(sk, snum)) {
sk->sk_ipv6only = saved_ipv6only;
inet_reset_saddr(sk);
err = -EADDRINUSE;
goto out;
}
err = BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk);
if (err) {
sk->sk_ipv6only = saved_ipv6only;
inet_reset_saddr(sk);
goto out;
if (!(flags & BIND_FROM_BPF)) {
err = BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk);
if (err) {
sk->sk_ipv6only = saved_ipv6only;
inet_reset_saddr(sk);
goto out;
}
}
}
@ -423,7 +425,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
inet->inet_dport = 0;
inet->inet_daddr = 0;
out:
if (with_lock)
if (flags & BIND_WITH_LOCK)
release_sock(sk);
return err;
out_unlock:
@ -451,7 +453,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
if (err)
return err;
return __inet6_bind(sk, uaddr, addr_len, false, true);
return __inet6_bind(sk, uaddr, addr_len, BIND_WITH_LOCK);
}
EXPORT_SYMBOL(inet6_bind);

View File

@ -2467,7 +2467,7 @@ void fib6_gc_cleanup(void)
}
#ifdef CONFIG_PROC_FS
static int ipv6_route_seq_show(struct seq_file *seq, void *v)
static int ipv6_route_native_seq_show(struct seq_file *seq, void *v)
{
struct fib6_info *rt = v;
struct ipv6_route_iter *iter = seq->private;
@ -2625,7 +2625,7 @@ static bool ipv6_route_iter_active(struct ipv6_route_iter *iter)
return w->node && !(w->state == FWS_U && w->node == w->root);
}
static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v)
__releases(RCU_BH)
{
struct net *net = seq_file_net(seq);
@ -2637,6 +2637,62 @@ static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
rcu_read_unlock_bh();
}
#if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL)
static int ipv6_route_prog_seq_show(struct bpf_prog *prog,
struct bpf_iter_meta *meta,
void *v)
{
struct bpf_iter__ipv6_route ctx;
ctx.meta = meta;
ctx.rt = v;
return bpf_iter_run_prog(prog, &ctx);
}
static int ipv6_route_seq_show(struct seq_file *seq, void *v)
{
struct ipv6_route_iter *iter = seq->private;
struct bpf_iter_meta meta;
struct bpf_prog *prog;
int ret;
meta.seq = seq;
prog = bpf_iter_get_info(&meta, false);
if (!prog)
return ipv6_route_native_seq_show(seq, v);
ret = ipv6_route_prog_seq_show(prog, &meta, v);
iter->w.leaf = NULL;
return ret;
}
static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
{
struct bpf_iter_meta meta;
struct bpf_prog *prog;
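/* v == NULL means the walk reached its natural end: give the attached
 * bpf program one final callback (with a NULL rt) so it can emit a
 * footer or finalize any aggregation.
 */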
if (!v) {
meta.seq = seq;
prog = bpf_iter_get_info(&meta, true);
if (prog)
(void)ipv6_route_prog_seq_show(prog, &meta, v);
}
ipv6_route_native_seq_stop(seq, v);
}
#else
static int ipv6_route_seq_show(struct seq_file *seq, void *v)
{
return ipv6_route_native_seq_show(seq, v);
}
static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
{
ipv6_route_native_seq_stop(seq, v);
}
#endif
const struct seq_operations ipv6_route_seq_ops = {
.start = ipv6_route_seq_start,
.next = ipv6_route_seq_next,

View File

@ -6421,6 +6421,35 @@ void __init ip6_route_init_special_entries(void)
#endif
}
#if IS_BUILTIN(CONFIG_IPV6)
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt)
static const struct bpf_iter_reg ipv6_route_reg_info = {
.target = "ipv6_route",
.seq_ops = &ipv6_route_seq_ops,
.init_seq_private = bpf_iter_init_seq_net,
.fini_seq_private = bpf_iter_fini_seq_net,
.seq_priv_size = sizeof(struct ipv6_route_iter),
.ctx_arg_info_size = 1,
.ctx_arg_info = {
{ offsetof(struct bpf_iter__ipv6_route, rt),
PTR_TO_BTF_ID_OR_NULL },
},
};
static int __init bpf_iter_register(void)
{
return bpf_iter_reg_target(&ipv6_route_reg_info);
}
static void bpf_iter_unregister(void)
{
bpf_iter_unreg_target(&ipv6_route_reg_info);
}
#endif
#endif
int __init ip6_route_init(void)
{
int ret;
@ -6483,6 +6512,14 @@ int __init ip6_route_init(void)
if (ret)
goto out_register_late_subsys;
#if IS_BUILTIN(CONFIG_IPV6)
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
ret = bpf_iter_register();
if (ret)
goto out_register_late_subsys;
#endif
#endif
for_each_possible_cpu(cpu) {
struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
@ -6515,6 +6552,11 @@ out_kmem_cache:
void ip6_route_cleanup(void)
{
#if IS_BUILTIN(CONFIG_IPV6)
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
bpf_iter_unregister();
#endif
#endif
unregister_netdevice_notifier(&ip6_route_dev_notifier);
unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_cleanup();

View File

@ -2596,7 +2596,7 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos)
return __netlink_seq_next(seq);
}
static void netlink_seq_stop(struct seq_file *seq, void *v)
static void netlink_native_seq_stop(struct seq_file *seq, void *v)
{
struct nl_seq_iter *iter = seq->private;
@ -2607,7 +2607,7 @@ static void netlink_seq_stop(struct seq_file *seq, void *v)
}
static int netlink_seq_show(struct seq_file *seq, void *v)
static int netlink_native_seq_show(struct seq_file *seq, void *v)
{
if (v == SEQ_START_TOKEN) {
seq_puts(seq,
@ -2634,6 +2634,68 @@ static int netlink_seq_show(struct seq_file *seq, void *v)
return 0;
}
#ifdef CONFIG_BPF_SYSCALL
struct bpf_iter__netlink {
__bpf_md_ptr(struct bpf_iter_meta *, meta);
__bpf_md_ptr(struct netlink_sock *, sk);
};
DEFINE_BPF_ITER_FUNC(netlink, struct bpf_iter_meta *meta, struct netlink_sock *sk)
static int netlink_prog_seq_show(struct bpf_prog *prog,
struct bpf_iter_meta *meta,
void *v)
{
struct bpf_iter__netlink ctx;
meta->seq_num--; /* skip SEQ_START_TOKEN */
ctx.meta = meta;
ctx.sk = nlk_sk((struct sock *)v);
return bpf_iter_run_prog(prog, &ctx);
}
static int netlink_seq_show(struct seq_file *seq, void *v)
{
struct bpf_iter_meta meta;
struct bpf_prog *prog;
meta.seq = seq;
prog = bpf_iter_get_info(&meta, false);
if (!prog)
return netlink_native_seq_show(seq, v);
if (v != SEQ_START_TOKEN)
return netlink_prog_seq_show(prog, &meta, v);
return 0;
}
static void netlink_seq_stop(struct seq_file *seq, void *v)
{
struct bpf_iter_meta meta;
struct bpf_prog *prog;
if (!v) {
meta.seq = seq;
prog = bpf_iter_get_info(&meta, true);
if (prog)
(void)netlink_prog_seq_show(prog, &meta, v);
}
netlink_native_seq_stop(seq, v);
}
#else
static int netlink_seq_show(struct seq_file *seq, void *v)
{
return netlink_native_seq_show(seq, v);
}
static void netlink_seq_stop(struct seq_file *seq, void *v)
{
netlink_native_seq_stop(seq, v);
}
#endif
static const struct seq_operations netlink_seq_ops = {
.start = netlink_seq_start,
.next = netlink_seq_next,
@ -2740,6 +2802,26 @@ static const struct rhashtable_params netlink_rhashtable_params = {
.automatic_shrinking = true,
};
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
static const struct bpf_iter_reg netlink_reg_info = {
.target = "netlink",
.seq_ops = &netlink_seq_ops,
.init_seq_private = bpf_iter_init_seq_net,
.fini_seq_private = bpf_iter_fini_seq_net,
.seq_priv_size = sizeof(struct nl_seq_iter),
.ctx_arg_info_size = 1,
.ctx_arg_info = {
{ offsetof(struct bpf_iter__netlink, sk),
PTR_TO_BTF_ID_OR_NULL },
},
};
static int __init bpf_iter_register(void)
{
return bpf_iter_reg_target(&netlink_reg_info);
}
#endif
static int __init netlink_proto_init(void)
{
int i;
@ -2748,6 +2830,12 @@ static int __init netlink_proto_init(void)
if (err != 0)
goto out;
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
err = bpf_iter_register();
if (err)
goto out;
#endif
BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > sizeof_field(struct sk_buff, cb));
nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL);

View File

@ -30,9 +30,9 @@ void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
if (!xs->tx)
return;
spin_lock_irqsave(&umem->xsk_list_lock, flags);
list_add_rcu(&xs->list, &umem->xsk_list);
spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
spin_lock_irqsave(&umem->xsk_tx_list_lock, flags);
list_add_rcu(&xs->list, &umem->xsk_tx_list);
spin_unlock_irqrestore(&umem->xsk_tx_list_lock, flags);
}
void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
@ -42,9 +42,9 @@ void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
if (!xs->tx)
return;
spin_lock_irqsave(&umem->xsk_list_lock, flags);
spin_lock_irqsave(&umem->xsk_tx_list_lock, flags);
list_del_rcu(&xs->list);
spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
spin_unlock_irqrestore(&umem->xsk_tx_list_lock, flags);
}
/* The umem is stored both in the _rx struct and the _tx struct as we do
@ -279,7 +279,7 @@ void xdp_put_umem(struct xdp_umem *umem)
}
}
static int xdp_umem_pin_pages(struct xdp_umem *umem)
static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address)
{
unsigned int gup_flags = FOLL_WRITE;
long npgs;
@ -291,7 +291,7 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem)
return -ENOMEM;
down_read(&current->mm->mmap_sem);
npgs = pin_user_pages(umem->address, umem->npgs,
npgs = pin_user_pages(address, umem->npgs,
gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL);
up_read(&current->mm->mmap_sem);
@ -385,7 +385,6 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
if (headroom >= chunk_size - XDP_PACKET_HEADROOM)
return -EINVAL;
umem->address = (unsigned long)addr;
umem->chunk_mask = unaligned_chunks ? XSK_UNALIGNED_BUF_ADDR_MASK
: ~((u64)chunk_size - 1);
umem->size = size;
@ -395,8 +394,8 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
umem->pgs = NULL;
umem->user = NULL;
umem->flags = mr->flags;
INIT_LIST_HEAD(&umem->xsk_list);
spin_lock_init(&umem->xsk_list_lock);
INIT_LIST_HEAD(&umem->xsk_tx_list);
spin_lock_init(&umem->xsk_tx_list_lock);
refcount_set(&umem->users, 1);
@ -404,7 +403,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
if (err)
return err;
err = xdp_umem_pin_pages(umem);
err = xdp_umem_pin_pages(umem, (unsigned long)addr);
if (err)
goto out_account;

View File

@ -75,7 +75,7 @@ void xsk_set_tx_need_wakeup(struct xdp_umem *umem)
return;
rcu_read_lock();
list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) {
xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
}
rcu_read_unlock();
@ -102,7 +102,7 @@ void xsk_clear_tx_need_wakeup(struct xdp_umem *umem)
return;
rcu_read_lock();
list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) {
xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
}
rcu_read_unlock();
@ -305,7 +305,7 @@ void xsk_umem_consume_tx_done(struct xdp_umem *umem)
struct xdp_sock *xs;
rcu_read_lock();
list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) {
__xskq_cons_release(xs->tx);
xs->sk.sk_write_space(&xs->sk);
}
@ -318,7 +318,7 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc)
struct xdp_sock *xs;
rcu_read_lock();
list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) {
if (!xskq_cons_peek_desc(xs->tx, desc, umem))
continue;

View File

@ -9,12 +9,12 @@
#include "xsk_queue.h"
void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask)
void xskq_set_umem(struct xsk_queue *q, u64 umem_size, u64 chunk_mask)
{
if (!q)
return;
q->size = size;
q->umem_size = umem_size;
q->chunk_mask = chunk_mask;
}

View File

@ -30,7 +30,7 @@ struct xdp_umem_ring {
struct xsk_queue {
u64 chunk_mask;
u64 size;
u64 umem_size;
u32 ring_mask;
u32 nentries;
u32 cached_prod;
@ -123,7 +123,7 @@ static inline bool xskq_cons_is_valid_unaligned(struct xsk_queue *q,
u64 base_addr = xsk_umem_extract_addr(addr);
addr = xsk_umem_add_offset_to_addr(addr);
if (base_addr >= q->size || addr >= q->size ||
if (base_addr >= q->umem_size || addr >= q->umem_size ||
xskq_cons_crosses_non_contig_pg(umem, addr, length)) {
q->invalid_descs++;
return false;
@ -134,7 +134,7 @@ static inline bool xskq_cons_is_valid_unaligned(struct xsk_queue *q,
static inline bool xskq_cons_is_valid_addr(struct xsk_queue *q, u64 addr)
{
if (addr >= q->size) {
if (addr >= q->umem_size) {
q->invalid_descs++;
return false;
}
@ -379,7 +379,7 @@ static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q)
return q ? q->invalid_descs : 0;
}
void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask);
void xskq_set_umem(struct xsk_queue *q, u64 umem_size, u64 chunk_mask);
struct xsk_queue *xskq_create(u32 nentries, bool umem_queue);
void xskq_destroy(struct xsk_queue *q_ops);

View File

@ -5,12 +5,12 @@
* License as published by the Free Software Foundation.
*/
#include <uapi/linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <uapi/linux/ptrace.h>
#include <uapi/linux/perf_event.h>
#include <linux/version.h>
#include <linux/sched.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#define _(P) ({typeof(P) val; bpf_probe_read(&val, sizeof(val), &P); val;})

View File

@ -1,12 +1,12 @@
#include <uapi/linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include "bpf_legacy.h"
#include <uapi/linux/in.h>
#include <uapi/linux/if.h>
#include <uapi/linux/if_ether.h>
#include <uapi/linux/ip.h>
#include <uapi/linux/ipv6.h>
#include <uapi/linux/if_tunnel.h>
#include <bpf/bpf_helpers.h>
#include "bpf_legacy.h"
#define IP_MF 0x2000
#define IP_OFFSET 0x1FFF

View File

@ -5,8 +5,6 @@
* License as published by the Free Software Foundation.
*/
#include <uapi/linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include "bpf_legacy.h"
#include <uapi/linux/in.h>
#include <uapi/linux/if.h>
#include <uapi/linux/if_ether.h>
@ -14,6 +12,8 @@
#include <uapi/linux/ipv6.h>
#include <uapi/linux/if_tunnel.h>
#include <uapi/linux/mpls.h>
#include <bpf/bpf_helpers.h>
#include "bpf_legacy.h"
#define IP_MF 0x2000
#define IP_OFFSET 0x1FFF

View File

@ -15,7 +15,7 @@
#include <bpf/bpf_helpers.h>
#include "hash_func01.h"
#define MAX_CPUS 64 /* WARNING - sync with _user.c */
#define MAX_CPUS NR_CPUS
/* Special map type that can XDP_REDIRECT frames to another CPU */
struct {

View File

@ -13,6 +13,7 @@ static const char *__doc__ =
#include <unistd.h>
#include <locale.h>
#include <sys/resource.h>
#include <sys/sysinfo.h>
#include <getopt.h>
#include <net/if.h>
#include <time.h>
@ -24,8 +25,6 @@ static const char *__doc__ =
#include <arpa/inet.h>
#include <linux/if_link.h>
#define MAX_CPUS 64 /* WARNING - sync with _kern.c */
/* How many xdp_progs are defined in _kern.c */
#define MAX_PROG 6
@ -40,6 +39,7 @@ static char *ifname;
static __u32 prog_id;
static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
static int n_cpus;
static int cpu_map_fd;
static int rx_cnt_map_fd;
static int redirect_err_cnt_map_fd;
@ -170,7 +170,7 @@ struct stats_record {
struct record redir_err;
struct record kthread;
struct record exception;
struct record enq[MAX_CPUS];
struct record enq[];
};
static bool map_collect_percpu(int fd, __u32 key, struct record *rec)
@ -225,10 +225,11 @@ static struct datarec *alloc_record_per_cpu(void)
static struct stats_record *alloc_stats_record(void)
{
struct stats_record *rec;
int i;
int i, size;
rec = malloc(sizeof(*rec));
memset(rec, 0, sizeof(*rec));
size = sizeof(*rec) + n_cpus * sizeof(struct record);
rec = malloc(size);
memset(rec, 0, size);
if (!rec) {
fprintf(stderr, "Mem alloc error\n");
exit(EXIT_FAIL_MEM);
@ -237,7 +238,7 @@ static struct stats_record *alloc_stats_record(void)
rec->redir_err.cpu = alloc_record_per_cpu();
rec->kthread.cpu = alloc_record_per_cpu();
rec->exception.cpu = alloc_record_per_cpu();
for (i = 0; i < MAX_CPUS; i++)
for (i = 0; i < n_cpus; i++)
rec->enq[i].cpu = alloc_record_per_cpu();
return rec;
@ -247,7 +248,7 @@ static void free_stats_record(struct stats_record *r)
{
int i;
for (i = 0; i < MAX_CPUS; i++)
for (i = 0; i < n_cpus; i++)
free(r->enq[i].cpu);
free(r->exception.cpu);
free(r->kthread.cpu);
@ -350,7 +351,7 @@ static void stats_print(struct stats_record *stats_rec,
}
/* cpumap enqueue stats */
for (to_cpu = 0; to_cpu < MAX_CPUS; to_cpu++) {
for (to_cpu = 0; to_cpu < n_cpus; to_cpu++) {
char *fmt = "%-15s %3d:%-3d %'-14.0f %'-11.0f %'-10.2f %s\n";
char *fm2 = "%-15s %3s:%-3d %'-14.0f %'-11.0f %'-10.2f %s\n";
char *errstr = "";
@ -475,7 +476,7 @@ static void stats_collect(struct stats_record *rec)
map_collect_percpu(fd, 1, &rec->redir_err);
fd = cpumap_enqueue_cnt_map_fd;
for (i = 0; i < MAX_CPUS; i++)
for (i = 0; i < n_cpus; i++)
map_collect_percpu(fd, i, &rec->enq[i]);
fd = cpumap_kthread_cnt_map_fd;
@ -549,10 +550,10 @@ static int create_cpu_entry(__u32 cpu, __u32 queue_size,
*/
static void mark_cpus_unavailable(void)
{
__u32 invalid_cpu = MAX_CPUS;
__u32 invalid_cpu = n_cpus;
int ret, i;
for (i = 0; i < MAX_CPUS; i++) {
for (i = 0; i < n_cpus; i++) {
ret = bpf_map_update_elem(cpus_available_map_fd, &i,
&invalid_cpu, 0);
if (ret) {
@ -688,6 +689,8 @@ int main(int argc, char **argv)
int prog_fd;
__u32 qsize;
n_cpus = get_nprocs_conf();
/* Notice: choosing the queue size is very important with the
* ixgbe driver, because its driver page recycling trick is
* dependent on pages being returned quickly. The number of
@ -757,7 +760,7 @@ int main(int argc, char **argv)
case 'c':
/* Add multiple CPUs */
add_cpu = strtoul(optarg, NULL, 0);
if (add_cpu >= MAX_CPUS) {
if (add_cpu >= n_cpus) {
fprintf(stderr,
"--cpu nr too large for cpumap err(%d):%s\n",
errno, strerror(errno));

View File

@ -318,6 +318,11 @@ may be interested in:
of eBPF maps are used with a given helper function.
* *kernel/bpf/* directory contains other files in which additional helpers are
defined (for cgroups, sockmaps, etc.).
* The bpftool utility can be used to probe the availability of helper functions
on the system (as well as supported program and map types, and a number of
other parameters). To do so, run **bpftool feature probe** (see
**bpftool-feature**\ (8) for details). Add the **unprivileged** keyword to
list features available to unprivileged users.
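For example (illustrative invocations, output omitted):
::
    # bpftool feature probe
    # bpftool feature probe unprivileged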
Compatibility between helper functions and program types can generally be found
in the files where helper functions are defined. Look for the **struct
@ -338,6 +343,7 @@ SEE ALSO
========
**bpf**\ (2),
**bpftool**\ (8),
**cgroups**\ (7),
**ip**\ (8),
**perf_event_open**\ (2),
@ -414,6 +420,7 @@ class PrinterHelpers(Printer):
'struct sk_reuseport_md',
'struct sockaddr',
'struct tcphdr',
'struct seq_file',
'struct __sk_buff',
'struct sk_msg_md',
@ -450,6 +457,7 @@ class PrinterHelpers(Printer):
'struct sk_reuseport_md',
'struct sockaddr',
'struct tcphdr',
'struct seq_file',
}
mapped_types = {
'u8': '__u8',

View File

@ -27,9 +27,9 @@
"audit_control", "setfcap"
#define COMMON_CAP2_PERMS "mac_override", "mac_admin", "syslog", \
"wake_alarm", "block_suspend", "audit_read"
"wake_alarm", "block_suspend", "audit_read", "perfmon"
#if CAP_LAST_CAP > CAP_AUDIT_READ
#if CAP_LAST_CAP > CAP_PERFMON
#error New capability defined, please update COMMON_CAP2_PERMS.
#endif

View File

@ -230,9 +230,14 @@ SEE ALSO
**bpf**\ (2),
**bpf-helpers**\ (7),
**bpftool**\ (8),
**bpftool-map**\ (8),
**bpftool-prog**\ (8),
**bpftool-btf**\ (8),
**bpftool-cgroup**\ (8),
**bpftool-feature**\ (8),
**bpftool-gen**\ (8),
**bpftool-iter**\ (8),
**bpftool-link**\ (8),
**bpftool-map**\ (8),
**bpftool-net**\ (8),
**bpftool-perf**\ (8)
**bpftool-perf**\ (8),
**bpftool-prog**\ (8),
**bpftool-struct_ops**\ (8)

View File

@ -20,7 +20,7 @@ SYNOPSIS
CGROUP COMMANDS
===============
| **bpftool** **cgroup { show | list }** *CGROUP* [**effective**]
| **bpftool** **cgroup** { **show** | **list** } *CGROUP* [**effective**]
| **bpftool** **cgroup tree** [*CGROUP_ROOT*] [**effective**]
| **bpftool** **cgroup attach** *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*]
| **bpftool** **cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG*
@ -160,9 +160,13 @@ SEE ALSO
**bpf**\ (2),
**bpf-helpers**\ (7),
**bpftool**\ (8),
**bpftool-prog**\ (8),
**bpftool-map**\ (8),
**bpftool-btf**\ (8),
**bpftool-feature**\ (8),
**bpftool-gen**\ (8),
**bpftool-iter**\ (8),
**bpftool-link**\ (8),
**bpftool-map**\ (8),
**bpftool-net**\ (8),
**bpftool-perf**\ (8),
**bpftool-btf**\ (8)
**bpftool-prog**\ (8),
**bpftool-struct_ops**\ (8)

View File

@ -28,7 +28,7 @@ DESCRIPTION
===========
**bpftool feature probe** [**kernel**] [**full**] [**macros** [**prefix** *PREFIX*]]
Probe the running kernel and dump a number of eBPF-related
parameters, such as availability of the **bpf()** system call,
parameters, such as availability of the **bpf**\ () system call,
JIT status, eBPF program types availability, eBPF helper
functions availability, and more.
@ -93,9 +93,13 @@ SEE ALSO
**bpf**\ (2),
**bpf-helpers**\ (7),
**bpftool**\ (8),
**bpftool-prog**\ (8),
**bpftool-map**\ (8),
**bpftool-btf**\ (8),
**bpftool-cgroup**\ (8),
**bpftool-gen**\ (8),
**bpftool-iter**\ (8),
**bpftool-link**\ (8),
**bpftool-map**\ (8),
**bpftool-net**\ (8),
**bpftool-perf**\ (8),
**bpftool-btf**\ (8)
**bpftool-prog**\ (8),
**bpftool-struct_ops**\ (8)

View File

@ -14,7 +14,7 @@ SYNOPSIS
*OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] }
*COMMAND* := { **skeleton | **help** }
*COMMAND* := { **skeleton** | **help** }
GEN COMMANDS
=============
@ -36,12 +36,12 @@ DESCRIPTION
etc. Skeleton eliminates the need to lookup mentioned
components by name. Instead, if skeleton instantiation
succeeds, they are populated in skeleton structure as valid
libbpf types (e.g., struct bpf_map pointer) and can be
libbpf types (e.g., **struct bpf_map** pointer) and can be
passed to existing generic libbpf APIs.
In addition to simple and reliable access to maps and
programs, skeleton provides a storage for BPF links (struct
bpf_link) for each BPF program within BPF object. When
programs, skeleton provides a storage for BPF links (**struct
bpf_link**) for each BPF program within BPF object. When
requested, supported BPF programs will be automatically
attached and resulting BPF links stored for further use by
user in pre-allocated fields in skeleton struct. For BPF
@ -82,14 +82,14 @@ DESCRIPTION
- **example__open** and **example__open_opts**.
These functions are used to instantiate skeleton. It
corresponds to libbpf's **bpf_object__open()** API.
corresponds to libbpf's **bpf_object__open**\ () API.
**_opts** variants accepts extra **bpf_object_open_opts**
options.
- **example__load**.
This function creates maps, loads and verifies BPF
programs, initializes global data maps. It corresponds to
libbpf's **bpf_object__load** API.
libbpf's **bpf_object__load**\ () API.
- **example__open_and_load** combines **example__open** and
**example__load** invocations in one commonly used
@ -296,10 +296,13 @@ SEE ALSO
**bpf**\ (2),
**bpf-helpers**\ (7),
**bpftool**\ (8),
**bpftool-map**\ (8),
**bpftool-prog**\ (8),
**bpftool-btf**\ (8),
**bpftool-cgroup**\ (8),
**bpftool-feature**\ (8),
**bpftool-iter**\ (8),
**bpftool-link**\ (8),
**bpftool-map**\ (8),
**bpftool-net**\ (8),
**bpftool-perf**\ (8),
**bpftool-btf**\ (8)
**bpftool-prog**\ (8),
**bpftool-struct_ops**\ (8)
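Tying the skeleton functions described in the excerpt above together, a hedged usage sketch (the example__attach()/example__destroy() names follow the usual generated-skeleton naming and are assumptions here, as is the generated example.skel.h header):
/* Minimal sketch of using a generated skeleton; error handling abridged. */
#include "example.skel.h"

int main(void)
{
	struct example *skel;
	int err;

	skel = example__open_and_load();	/* open + load in one step */
	if (!skel)
		return 1;

	err = example__attach(skel);		/* attach supported programs */
	if (err)
		goto cleanup;

	/* ... interact with skel->maps, skel->bss, skel->progs ... */

cleanup:
	example__destroy(skel);			/* detach and free everything */
	return err != 0;
}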

View File

@ -0,0 +1,81 @@
============
bpftool-iter
============
-------------------------------------------------------------------------------
tool to create BPF iterators
-------------------------------------------------------------------------------
:Manual section: 8
SYNOPSIS
========
**bpftool** [*OPTIONS*] **iter** *COMMAND*
*COMMANDS* := { **pin** | **help** }
ITER COMMANDS
===================
| **bpftool** **iter pin** *OBJ* *PATH*
| **bpftool** **iter help**
|
| *OBJ* := /a/file/of/bpf_iter_target.o
DESCRIPTION
===========
**bpftool iter pin** *OBJ* *PATH*
A bpf iterator combines a kernel iterator over
particular kernel data (e.g., tasks, bpf_maps, etc.)
with a bpf program that is called for each kernel data
object (e.g., one task, one bpf_map, etc.). User space
can *read* the kernel iterator output through the
*read()* syscall.
The *pin* command creates a bpf iterator from *OBJ*
and pins it to *PATH*. The *PATH* should be located
in *bpffs* mount. It must not contain a dot
character ('.'), which is reserved for future extensions
of *bpffs*.
The user can then *cat PATH* to see the bpf iterator output.
**bpftool iter help**
Print short help message.
OPTIONS
=======
-h, --help
Print short generic help message (similar to **bpftool help**).
-V, --version
Print version number (similar to **bpftool version**).
-d, --debug
Print all logs available, even debug-level information. This
includes logs from libbpf as well as from the verifier, when
attempting to load programs.
EXAMPLES
========
**# bpftool iter pin bpf_iter_netlink.o /sys/fs/bpf/my_netlink**
::
Create a file-based bpf iterator from bpf_iter_netlink.o and pin it
to /sys/fs/bpf/my_netlink
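The pinned iterator can also be consumed programmatically rather than with
*cat*; a minimal sketch in C (path taken from the example above, error
handling abridged):
::
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            char buf[4096];
            ssize_t n;
            int fd = open("/sys/fs/bpf/my_netlink", O_RDONLY);

            if (fd < 0)
                    return 1;
            /* each read() triggers another chunk of bpf iterator output */
            while ((n = read(fd, buf, sizeof(buf))) > 0)
                    fwrite(buf, 1, n, stdout);
            close(fd);
            return 0;
    }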
SEE ALSO
========
**bpf**\ (2),
**bpf-helpers**\ (7),
**bpftool**\ (8),
**bpftool-btf**\ (8),
**bpftool-cgroup**\ (8),
**bpftool-feature**\ (8),
**bpftool-gen**\ (8),
**bpftool-link**\ (8),
**bpftool-map**\ (8),
**bpftool-net**\ (8),
**bpftool-perf**\ (8),
**bpftool-prog**\ (8),
**bpftool-struct_ops**\ (8)

View File

@ -109,10 +109,13 @@ SEE ALSO
**bpf**\ (2),
**bpf-helpers**\ (7),
**bpftool**\ (8),
**bpftool-prog\ (8),
**bpftool-map**\ (8),
**bpftool-btf**\ (8),
**bpftool-cgroup**\ (8),
**bpftool-feature**\ (8),
**bpftool-gen**\ (8),
**bpftool-iter**\ (8),
**bpftool-map**\ (8),
**bpftool-net**\ (8),
**bpftool-perf**\ (8),
**bpftool-btf**\ (8)
**bpftool-prog**\ (8),
**bpftool-struct_ops**\ (8)

View File

@ -21,7 +21,7 @@ SYNOPSIS
MAP COMMANDS
=============
| **bpftool** **map { show | list }** [*MAP*]
| **bpftool** **map** { **show** | **list** } [*MAP*]
| **bpftool** **map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* \
| **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**dev** *NAME*]
| **bpftool** **map dump** *MAP*
@ -49,7 +49,7 @@ MAP COMMANDS
| | **lru_percpu_hash** | **lpm_trie** | **array_of_maps** | **hash_of_maps**
| | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash**
| | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage**
| | **queue** | **stack** }
| | **queue** | **stack** | **sk_storage** | **struct_ops** }
DESCRIPTION
===========
@ -66,6 +66,13 @@ DESCRIPTION
Create a new map with given parameters and pin it to *bpffs*
as *FILE*.
*FLAGS* should be an integer which is the combination of
desired flags, e.g. 1024 for **BPF_F_MMAPABLE** (see bpf.h
UAPI header for existing flags).
Keyword **dev** expects a network interface name, and is used
to request hardware offload for the map.
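For example, the following (illustrative) invocation creates a small mmapable
array map and pins it, using the flag value mentioned above:
::
    # bpftool map create /sys/fs/bpf/my_array type array key 4 value 8 \
            entries 64 name my_array flags 1024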
**bpftool map dump** *MAP*
Dump all entries in a given *MAP*. In case of **name**,
*MAP* may match several maps which will all be dumped.
@ -78,7 +85,7 @@ DESCRIPTION
exists; **noexist** update only if entry doesn't exist.
If the **hex** keyword is provided in front of the bytes
sequence, the bytes are parsed as hexadeximal values, even if
sequence, the bytes are parsed as hexadecimal values, even if
no "0x" prefix is added. If the keyword is not provided, then
the bytes are parsed as decimal values, unless a "0x" prefix
(for hexadecimal) or a "0" prefix (for octal) is provided.
@ -100,10 +107,10 @@ DESCRIPTION
extensions of *bpffs*.
**bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*]
Read events from a BPF_MAP_TYPE_PERF_EVENT_ARRAY map.
Read events from a **BPF_MAP_TYPE_PERF_EVENT_ARRAY** map.
Install perf rings into a perf event array map and dump
output of any bpf_perf_event_output() call in the kernel.
output of any **bpf_perf_event_output**\ () call in the kernel.
By default read the number of CPUs on the system and
install perf ring for each CPU in the corresponding index
in the array.
@ -116,24 +123,24 @@ DESCRIPTION
receiving events if it installed its rings earlier.
**bpftool map peek** *MAP*
Peek next **value** in the queue or stack.
Peek next value in the queue or stack.
**bpftool map push** *MAP* **value** *VALUE*
Push **value** onto the stack.
Push *VALUE* onto the stack.
**bpftool map pop** *MAP*
Pop and print **value** from the stack.
Pop and print value from the stack.
**bpftool map enqueue** *MAP* **value** *VALUE*
Enqueue **value** into the queue.
Enqueue *VALUE* into the queue.
**bpftool map dequeue** *MAP*
Dequeue and print **value** from the queue.
Dequeue and print value from the queue.
**bpftool map freeze** *MAP*
Freeze the map as read-only from user space. Entries from a
frozen map can not longer be updated or deleted with the
**bpf\ ()** system call. This operation is not reversible,
**bpf**\ () system call. This operation is not reversible,
and the map remains immutable from user space until its
destruction. However, read and write permissions for BPF
programs to the map remain unchanged.
@ -269,9 +276,13 @@ SEE ALSO
**bpf**\ (2),
**bpf-helpers**\ (7),
**bpftool**\ (8),
**bpftool-prog**\ (8),
**bpftool-btf**\ (8),
**bpftool-cgroup**\ (8),
**bpftool-feature**\ (8),
**bpftool-gen**\ (8),
**bpftool-iter**\ (8),
**bpftool-link**\ (8),
**bpftool-net**\ (8),
**bpftool-perf**\ (8),
**bpftool-btf**\ (8)
**bpftool-prog**\ (8),
**bpftool-struct_ops**\ (8)

View File

@ -20,7 +20,7 @@ SYNOPSIS
NET COMMANDS
============
| **bpftool** **net { show | list }** [ **dev** *NAME* ]
| **bpftool** **net** { **show** | **list** } [ **dev** *NAME* ]
| **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ]
| **bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME*
| **bpftool** **net help**
@ -194,9 +194,13 @@ SEE ALSO
**bpf**\ (2),
**bpf-helpers**\ (7),
**bpftool**\ (8),
**bpftool-prog**\ (8),
**bpftool-map**\ (8),
**bpftool-btf**\ (8),
**bpftool-cgroup**\ (8),
**bpftool-feature**\ (8),
**bpftool-gen**\ (8),
**bpftool-iter**\ (8),
**bpftool-link**\ (8),
**bpftool-map**\ (8),
**bpftool-perf**\ (8),
**bpftool-btf**\ (8)
**bpftool-prog**\ (8),
**bpftool-struct_ops**\ (8)

View File

@ -20,7 +20,7 @@ SYNOPSIS
PERF COMMANDS
=============
| **bpftool** **perf { show | list }**
| **bpftool** **perf** { **show** | **list** }
| **bpftool** **perf help**
DESCRIPTION
@ -85,9 +85,13 @@ SEE ALSO
**bpf**\ (2),
**bpf-helpers**\ (7),
**bpftool**\ (8),
**bpftool-prog**\ (8),
**bpftool-map**\ (8),
**bpftool-btf**\ (8),
**bpftool-cgroup**\ (8),
**bpftool-feature**\ (8),
**bpftool-gen**\ (8),
**bpftool-iter**\ (8),
**bpftool-link**\ (8),
**bpftool-map**\ (8),
**bpftool-net**\ (8),
**bpftool-btf**\ (8)
**bpftool-prog**\ (8),
**bpftool-struct_ops**\ (8)

View File

@ -21,11 +21,11 @@ SYNOPSIS
PROG COMMANDS
=============
| **bpftool** **prog { show | list }** [*PROG*]
| **bpftool** **prog** { **show** | **list** } [*PROG*]
| **bpftool** **prog dump xlated** *PROG* [{**file** *FILE* | **opcodes** | **visual** | **linum**}]
| **bpftool** **prog dump jited** *PROG* [{**file** *FILE* | **opcodes** | **linum**}]
| **bpftool** **prog pin** *PROG* *FILE*
| **bpftool** **prog { load | loadall }** *OBJ* *PATH* [**type** *TYPE*] [**map** {**idx** *IDX* | **name** *NAME*} *MAP*] [**dev** *NAME*] [**pinmaps** *MAP_DIR*]
| **bpftool** **prog** { **load** | **loadall** } *OBJ* *PATH* [**type** *TYPE*] [**map** {**idx** *IDX* | **name** *NAME*} *MAP*] [**dev** *NAME*] [**pinmaps** *MAP_DIR*]
| **bpftool** **prog attach** *PROG* *ATTACH_TYPE* [*MAP*]
| **bpftool** **prog detach** *PROG* *ATTACH_TYPE* [*MAP*]
| **bpftool** **prog tracelog**
@ -49,7 +49,7 @@ PROG COMMANDS
| *ATTACH_TYPE* := {
| **msg_verdict** | **stream_verdict** | **stream_parser** | **flow_dissector**
| }
| *METRIC* := {
| *METRICs* := {
| **cycles** | **instructions** | **l1d_loads** | **llc_misses**
| }
@ -155,7 +155,7 @@ DESCRIPTION
**bpftool prog tracelog**
Dump the trace pipe of the system to the console (stdout).
Hit <Ctrl+C> to stop printing. BPF programs can write to this
trace pipe at runtime with the **bpf_trace_printk()** helper.
trace pipe at runtime with the **bpf_trace_printk**\ () helper.
This should be used only for debugging purposes. For
streaming data from BPF programs to user space, one can use
perf events (see also **bpftool-map**\ (8)).
@ -195,9 +195,9 @@ DESCRIPTION
**bpftool prog profile** *PROG* [**duration** *DURATION*] *METRICs*
Profile *METRICs* for bpf program *PROG* for *DURATION*
seconds or until user hits Ctrl-C. *DURATION* is optional.
seconds or until user hits <Ctrl+C>. *DURATION* is optional.
If *DURATION* is not specified, the profiling will run up to
UINT_MAX seconds.
**UINT_MAX** seconds.
**bpftool prog help**
Print short help message.
@ -267,7 +267,7 @@ EXAMPLES
|
| **# bpftool prog dump xlated id 10 file /tmp/t**
| **# ls -l /tmp/t**
| **$ ls -l /tmp/t**
::
@ -325,6 +325,7 @@ EXAMPLES
| **# bpftool prog profile id 337 duration 10 cycles instructions llc_misses**
::
51397 run_cnt
40176203 cycles (83.05%)
42518139 instructions # 1.06 insns per cycle (83.39%)
@ -335,9 +336,13 @@ SEE ALSO
**bpf**\ (2),
**bpf-helpers**\ (7),
**bpftool**\ (8),
**bpftool-map**\ (8),
**bpftool-btf**\ (8),
**bpftool-cgroup**\ (8),
**bpftool-feature**\ (8),
**bpftool-gen**\ (8),
**bpftool-iter**\ (8),
**bpftool-link**\ (8),
**bpftool-map**\ (8),
**bpftool-net**\ (8),
**bpftool-perf**\ (8),
**bpftool-btf**\ (8)
**bpftool-struct_ops**\ (8)

View File

@ -105,12 +105,13 @@ SEE ALSO
**bpf**\ (2),
**bpf-helpers**\ (7),
**bpftool**\ (8),
**bpftool-prog**\ (8),
**bpftool-map**\ (8),
**bpftool-btf**\ (8),
**bpftool-cgroup**\ (8),
**bpftool-feature**\ (8),
**bpftool-gen**\ (8),
**bpftool-iter**\ (8),
**bpftool-link**\ (8),
**bpftool-map**\ (8),
**bpftool-net**\ (8),
**bpftool-perf**\ (8),
**bpftool-btf**\ (8)
**bpftool-gen**\ (8)
**bpftool-prog**\ (8)

View File

@ -75,11 +75,14 @@ SEE ALSO
========
**bpf**\ (2),
**bpf-helpers**\ (7),
**bpftool-prog**\ (8),
**bpftool-map**\ (8),
**bpftool-btf**\ (8),
**bpftool-cgroup**\ (8),
**bpftool-feature**\ (8),
**bpftool-gen**\ (8),
**bpftool-iter**\ (8),
**bpftool-link**\ (8),
**bpftool-map**\ (8),
**bpftool-net**\ (8),
**bpftool-perf**\ (8),
**bpftool-btf**\ (8),
**bpftool-gen**\ (8),
**bpftool-prog**\ (8),
**bpftool-struct_ops**\ (8)

View File

@ -610,6 +610,19 @@ _bpftool()
;;
esac
;;
iter)
case $command in
pin)
_filedir
return 0
;;
*)
[[ $prev == $object ]] && \
COMPREPLY=( $( compgen -W 'pin help' \
-- "$cur" ) )
;;
esac
;;
map)
local MAP_TYPE='id pinned name'
case $command in

View File

@ -271,8 +271,8 @@ static void btf_int128_print(json_writer_t *jw, const void *data,
}
}
static void btf_int128_shift(__u64 *print_num, u16 left_shift_bits,
u16 right_shift_bits)
static void btf_int128_shift(__u64 *print_num, __u16 left_shift_bits,
__u16 right_shift_bits)
{
__u64 upper_num, lower_num;

View File

@ -157,7 +157,7 @@ static bool cfg_partition_funcs(struct cfg *cfg, struct bpf_insn *cur,
return false;
}
static bool is_jmp_insn(u8 code)
static bool is_jmp_insn(__u8 code)
{
return BPF_CLASS(code) == BPF_JMP || BPF_CLASS(code) == BPF_JMP32;
}
@ -176,7 +176,7 @@ static bool func_partition_bb_head(struct func_node *func)
for (; cur <= end; cur++) {
if (is_jmp_insn(cur->code)) {
u8 opcode = BPF_OP(cur->code);
__u8 opcode = BPF_OP(cur->code);
if (opcode == BPF_EXIT || opcode == BPF_CALL)
continue;

tools/bpf/bpftool/iter.c (new file, 88 lines)
View File

@ -0,0 +1,88 @@
// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (C) 2020 Facebook
#define _GNU_SOURCE
#include <linux/err.h>
#include <bpf/libbpf.h>
#include "main.h"
static int do_pin(int argc, char **argv)
{
const char *objfile, *path;
struct bpf_program *prog;
struct bpf_object *obj;
struct bpf_link *link;
int err;
if (!REQ_ARGS(2))
usage();
objfile = GET_ARG();
path = GET_ARG();
obj = bpf_object__open(objfile);
if (IS_ERR(obj)) {
p_err("can't open objfile %s", objfile);
return -1;
}
err = bpf_object__load(obj);
if (err) {
p_err("can't load objfile %s", objfile);
goto close_obj;
}
prog = bpf_program__next(NULL, obj);
if (!prog) {
p_err("can't find bpf program in objfile %s", objfile);
err = -ENOENT;
goto close_obj;
}
link = bpf_program__attach_iter(prog, NULL);
if (IS_ERR(link)) {
err = PTR_ERR(link);
p_err("attach_iter failed for program %s",
bpf_program__name(prog));
goto close_obj;
}
err = mount_bpffs_for_pin(path);
if (err)
goto close_link;
err = bpf_link__pin(link, path);
if (err) {
p_err("pin_iter failed for program %s to path %s",
bpf_program__name(prog), path);
goto close_link;
}
close_link:
bpf_link__destroy(link);
close_obj:
bpf_object__close(obj);
return err;
}
static int do_help(int argc, char **argv)
{
fprintf(stderr,
"Usage: %s %s pin OBJ PATH\n"
" %s %s help\n"
"\n",
bin_name, argv[-2], bin_name, argv[-2]);
return 0;
}
static const struct cmd cmds[] = {
{ "help", do_help },
{ "pin", do_pin },
{ 0 }
};
int do_iter(int argc, char **argv)
{
return cmd_select(cmds, argc, argv, do_help);
}

View File

@ -16,6 +16,7 @@ static const char * const link_type_name[] = {
[BPF_LINK_TYPE_RAW_TRACEPOINT] = "raw_tracepoint",
[BPF_LINK_TYPE_TRACING] = "tracing",
[BPF_LINK_TYPE_CGROUP] = "cgroup",
[BPF_LINK_TYPE_ITER] = "iter",
};
static int link_parse_fd(int *argc, char ***argv)

View File

@ -59,7 +59,7 @@ static int do_help(int argc, char **argv)
" %s batch file FILE\n"
" %s version\n"
"\n"
" OBJECT := { prog | map | link | cgroup | perf | net | feature | btf | gen | struct_ops }\n"
" OBJECT := { prog | map | link | cgroup | perf | net | feature | btf | gen | struct_ops | iter }\n"
" " HELP_SPEC_OPTIONS "\n"
"",
bin_name, bin_name, bin_name);
@ -224,6 +224,7 @@ static const struct cmd cmds[] = {
{ "btf", do_btf },
{ "gen", do_gen },
{ "struct_ops", do_struct_ops },
{ "iter", do_iter },
{ "version", do_version },
{ 0 }
};

View File

@ -18,6 +18,9 @@
#include "json_writer.h"
/* Make sure we do not use kernel-only integer typedefs */
#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64
#define ptr_to_u64(ptr) ((__u64)(unsigned long)(ptr))
#define NEXT_ARG() ({ argc--; argv++; if (argc < 0) usage(); })
@ -199,6 +202,7 @@ int do_feature(int argc, char **argv);
int do_btf(int argc, char **argv);
int do_gen(int argc, char **argv);
int do_struct_ops(int argc, char **argv);
int do_iter(int argc, char **argv);
int parse_u32_arg(int *argc, char ***argv, __u32 *val, const char *what);
int prog_parse_fd(int *argc, char ***argv);

View File

@ -1589,7 +1589,8 @@ static int do_help(int argc, char **argv)
" percpu_array | stack_trace | cgroup_array | lru_hash |\n"
" lru_percpu_hash | lpm_trie | array_of_maps | hash_of_maps |\n"
" devmap | devmap_hash | sockmap | cpumap | xskmap | sockhash |\n"
" cgroup_storage | reuseport_sockarray | percpu_cgroup_storage }\n"
" cgroup_storage | reuseport_sockarray | percpu_cgroup_storage |\n"
" queue | stack | sk_storage | struct_ops }\n"
" " HELP_SPEC_OPTIONS "\n"
"",
bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],

View File

@ -39,7 +39,7 @@ struct event_ring_info {
struct perf_event_sample {
struct perf_event_header header;
u64 time;
__u64 time;
__u32 size;
unsigned char data[];
};

View File

@ -238,7 +238,7 @@ exit_free:
return fd;
}
static void show_prog_maps(int fd, u32 num_maps)
static void show_prog_maps(int fd, __u32 num_maps)
{
struct bpf_prog_info info = {};
__u32 len = sizeof(info);

View File

@ -8,7 +8,8 @@ BPFTOOL ?= $(DEFAULT_BPFTOOL)
LIBBPF_SRC := $(abspath ../../lib/bpf)
BPFOBJ := $(OUTPUT)/libbpf.a
BPF_INCLUDE := $(OUTPUT)
INCLUDES := -I$(OUTPUT) -I$(BPF_INCLUDE) -I$(abspath ../../lib)
INCLUDES := -I$(OUTPUT) -I$(BPF_INCLUDE) -I$(abspath ../../lib) \
-I$(abspath ../../include/uapi)
CFLAGS := -g -Wall
# Try to detect best kernel BTF source

View File

@ -116,6 +116,7 @@ enum bpf_cmd {
BPF_LINK_GET_FD_BY_ID,
BPF_LINK_GET_NEXT_ID,
BPF_ENABLE_STATS,
BPF_ITER_CREATE,
};
enum bpf_map_type {
@ -218,6 +219,7 @@ enum bpf_attach_type {
BPF_TRACE_FEXIT,
BPF_MODIFY_RETURN,
BPF_LSM_MAC,
BPF_TRACE_ITER,
__MAX_BPF_ATTACH_TYPE
};
@ -228,6 +230,7 @@ enum bpf_link_type {
BPF_LINK_TYPE_RAW_TRACEPOINT = 1,
BPF_LINK_TYPE_TRACING = 2,
BPF_LINK_TYPE_CGROUP = 3,
BPF_LINK_TYPE_ITER = 4,
MAX_BPF_LINK_TYPE,
};
@ -612,6 +615,11 @@ union bpf_attr {
__u32 type;
} enable_stats;
struct { /* struct used by BPF_ITER_CREATE command */
__u32 link_fd;
__u32 flags;
} iter_create;
} __attribute__((aligned(8)));
/* The description below is an attempt at providing documentation to eBPF
@ -667,8 +675,8 @@ union bpf_attr {
* For tracing programs, safely attempt to read *size* bytes from
* kernel space address *unsafe_ptr* and store the data in *dst*.
*
* Generally, use bpf_probe_read_user() or bpf_probe_read_kernel()
* instead.
* Generally, use **bpf_probe_read_user**\ () or
* **bpf_probe_read_kernel**\ () instead.
* Return
* 0 on success, or a negative error in case of failure.
*
@ -676,7 +684,7 @@ union bpf_attr {
* Description
* Return the time elapsed since system boot, in nanoseconds.
* Does not include time the system was suspended.
* See: clock_gettime(CLOCK_MONOTONIC)
* See: **clock_gettime**\ (**CLOCK_MONOTONIC**)
* Return
* Current *ktime*.
*
@ -1535,11 +1543,11 @@ union bpf_attr {
* int bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr)
* Description
* Copy a NUL terminated string from an unsafe kernel address
* *unsafe_ptr* to *dst*. See bpf_probe_read_kernel_str() for
* *unsafe_ptr* to *dst*. See **bpf_probe_read_kernel_str**\ () for
* more details.
*
* Generally, use bpf_probe_read_user_str() or bpf_probe_read_kernel_str()
* instead.
* Generally, use **bpf_probe_read_user_str**\ () or
* **bpf_probe_read_kernel_str**\ () instead.
* Return
* On success, the strictly positive length of the string,
* including the trailing NUL character. On error, a negative
@ -1567,7 +1575,7 @@ union bpf_attr {
*
* u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx)
* Description
* Equivalent to bpf_get_socket_cookie() helper that accepts
* Equivalent to **bpf_get_socket_cookie**\ () helper that accepts
* *skb*, but gets socket from **struct bpf_sock_ops** context.
* Return
* A 8-byte long non-decreasing number.
@ -1596,6 +1604,7 @@ union bpf_attr {
* The option value of length *optlen* is pointed by *optval*.
*
* *bpf_socket* should be one of the following:
*
* * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
* * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
* and **BPF_CGROUP_INET6_CONNECT**.
@ -1664,12 +1673,12 @@ union bpf_attr {
*
* The lower two bits of *flags* are used as the return code if
* the map lookup fails. This is so that the return value can be
* one of the XDP program return codes up to XDP_TX, as chosen by
* the caller. Any higher bits in the *flags* argument must be
* one of the XDP program return codes up to **XDP_TX**, as chosen
* by the caller. Any higher bits in the *flags* argument must be
* unset.
*
* See also bpf_redirect(), which only supports redirecting to an
* ifindex, but doesn't require a map to do so.
* See also **bpf_redirect**\ (), which only supports redirecting
* to an ifindex, but doesn't require a map to do so.
* Return
* **XDP_REDIRECT** on success, or the value of the two lower bits
* of the *flags* argument on error.
@ -1777,7 +1786,7 @@ union bpf_attr {
* the time running for event since last normalization. The
* enabled and running times are accumulated since the perf event
* open. To achieve scaling factor between two invocations of an
* eBPF program, users can can use CPU id as the key (which is
* eBPF program, users can use CPU id as the key (which is
* typical for perf array usage model) to remember the previous
* value and do the calculation inside the eBPF program.
* Return
@ -1804,6 +1813,7 @@ union bpf_attr {
* *opval* and of length *optlen*.
*
* *bpf_socket* should be one of the following:
*
* * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
* * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
* and **BPF_CGROUP_INET6_CONNECT**.
@ -1825,7 +1835,7 @@ union bpf_attr {
* The first argument is the context *regs* on which the kprobe
* works.
*
* This helper works by setting setting the PC (program counter)
* This helper works by setting the PC (program counter)
* to an override function which is run in place of the original
* probed function. This means the probed function is not run at
* all. The replacement function just returns with the required
@ -1994,10 +2004,11 @@ union bpf_attr {
*
* This helper works for IPv4 and IPv6, TCP and UDP sockets. The
* domain (*addr*\ **->sa_family**) must be **AF_INET** (or
* **AF_INET6**). Looking for a free port to bind to can be
* expensive, therefore binding to port is not permitted by the
* helper: *addr*\ **->sin_port** (or **sin6_port**, respectively)
* must be set to zero.
* **AF_INET6**). It's advised to pass zero port (**sin_port**
* or **sin6_port**) which triggers IP_BIND_ADDRESS_NO_PORT-like
* behavior and lets the kernel efficiently pick up an unused
* port as long as 4-tuple is unique. Passing non-zero port might
* lead to degraded performance.
* Return
* 0 on success, or a negative error in case of failure.
*
@ -2291,7 +2302,7 @@ union bpf_attr {
* **bpf_rc_keydown**\ () again with the same values, or calling
* **bpf_rc_repeat**\ ().
*
* Some protocols include a toggle bit, in case the button was
* Some protocols include a toggle bit, in case the button was
* released and pressed again between consecutive scancodes.
*
* The *ctx* should point to the lirc sample as passed into
@ -2637,7 +2648,6 @@ union bpf_attr {
*
* *th* points to the start of the TCP header, while *th_len*
* contains **sizeof**\ (**struct tcphdr**).
*
* Return
* 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative
* error otherwise.
@ -2820,7 +2830,6 @@ union bpf_attr {
*
* *th* points to the start of the TCP header, while *th_len*
* contains the length of the TCP header.
*
* Return
* On success, lower 32 bits hold the generated SYN cookie in
* followed by 16 bits which hold the MSS value for that cookie,
@ -2903,7 +2912,7 @@ union bpf_attr {
* // size, after checking its boundaries.
* }
*
* In comparison, using **bpf_probe_read_user()** helper here
* In comparison, using **bpf_probe_read_user**\ () helper here
* instead to read the string would require to estimate the length
* at compile time, and would often result in copying more memory
* than necessary.
@ -2921,14 +2930,14 @@ union bpf_attr {
* int bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr)
* Description
* Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr*
* to *dst*. Same semantics as with bpf_probe_read_user_str() apply.
* to *dst*. Same semantics as with **bpf_probe_read_user_str**\ () apply.
* Return
* On success, the strictly positive length of the string, including
* On success, the strictly positive length of the string, including
* the trailing NUL character. On error, a negative value.
*
* int bpf_tcp_send_ack(void *tp, u32 rcv_nxt)
* Description
* Send out a tcp-ack. *tp* is the in-kernel struct tcp_sock.
* Send out a tcp-ack. *tp* is the in-kernel struct **tcp_sock**.
* *rcv_nxt* is the ack_seq to be sent out.
* Return
* 0 on success, or a negative error in case of failure.
@ -2956,19 +2965,19 @@ union bpf_attr {
* int bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags)
* Description
* For an eBPF program attached to a perf event, retrieve the
* branch records (struct perf_branch_entry) associated to *ctx*
* and store it in the buffer pointed by *buf* up to size
* branch records (**struct perf_branch_entry**) associated to *ctx*
* and store it in the buffer pointed by *buf* up to size
* *size* bytes.
* Return
* On success, number of bytes written to *buf*. On error, a
* negative value.
*
* The *flags* can be set to **BPF_F_GET_BRANCH_RECORDS_SIZE** to
* instead return the number of bytes required to store all the
* instead return the number of bytes required to store all the
* branch entries. If this flag is set, *buf* may be NULL.
*
* **-EINVAL** if arguments invalid or **size** not a multiple
* of sizeof(struct perf_branch_entry).
* of **sizeof**\ (**struct perf_branch_entry**\ ).
*
* **-ENOENT** if architecture does not support branch records.
*
@ -2976,8 +2985,8 @@ union bpf_attr {
* Description
* Returns 0 on success, values for *pid* and *tgid* as seen from the current
* *namespace* will be returned in *nsdata*.
*
* On failure, the returned value is one of the following:
* Return
* 0 on success, or one of the following in case of failure:
*
* **-EINVAL** if dev and inum supplied don't match dev_t and inode number
* with nsfs of current task, or if dev conversion to dev_t lost high bits.
@ -3016,8 +3025,8 @@ union bpf_attr {
* a global identifier that can be assumed unique. If *ctx* is
* NULL, then the helper returns the cookie for the initial
* network namespace. The cookie itself is very similar to that
* of bpf_get_socket_cookie() helper, but for network namespaces
* instead of sockets.
* of **bpf_get_socket_cookie**\ () helper, but for network
* namespaces instead of sockets.
* Return
* A 8-byte long opaque number.
*
@ -3052,22 +3061,98 @@ union bpf_attr {
*
* The *flags* argument must be zero.
* Return
* 0 on success, or a negative errno in case of failure.
* 0 on success, or a negative error in case of failure:
*
* * **-EINVAL** Unsupported flags specified.
* * **-ENOENT** Socket is unavailable for assignment.
* * **-ENETUNREACH** Socket is unreachable (wrong netns).
* * **-EOPNOTSUPP** Unsupported operation, for example a
* call from outside of TC ingress.
* * **-ESOCKTNOSUPPORT** Socket type not supported (reuseport).
* **-EINVAL** if specified *flags* are not supported.
*
* **-ENOENT** if the socket is unavailable for assignment.
*
* **-ENETUNREACH** if the socket is unreachable (wrong netns).
*
* **-EOPNOTSUPP** if the operation is not supported, for example
* a call from outside of TC ingress.
*
* **-ESOCKTNOSUPPORT** if the socket type is not supported
* (reuseport).
*
* u64 bpf_ktime_get_boot_ns(void)
* Description
* Return the time elapsed since system boot, in nanoseconds.
* Does include the time the system was suspended.
* See: clock_gettime(CLOCK_BOOTTIME)
* See: **clock_gettime**\ (**CLOCK_BOOTTIME**)
* Return
* Current *ktime*.
*
* int bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len)
* Description
* **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print
* out the format string.
* *m* represents the seq_file. *fmt* and *fmt_size* are for
* the format string itself. *data* and *data_len* carry the format string
* arguments. *data* is a **u64** array in which the values for the
* corresponding format specifiers are stored. For strings and pointers
* whose pointees are accessed, only the pointer values are stored in the
* *data* array. *data_len* is the size of *data* in bytes.
*
* The formats **%s** and **%p{i,I}{4,6}** require reading kernel memory.
* Reading kernel memory may fail either because the address is invalid or
* because accessing a valid address would require a major memory fault. If
* reading kernel memory fails, the string for **%s** will be an empty string,
* and the ip address for **%p{i,I}{4,6}** will be 0. Not returning an error
* to the bpf program is consistent with what **bpf_trace_printk**\ () does for now.
* Return
* 0 on success, or a negative error in case of failure:
*
* **-EBUSY** if the per-CPU memory copy buffer is busy; the bpf program
* can try again by returning 1.
*
* **-EINVAL** if arguments are invalid, or if *fmt* is invalid/unsupported.
*
* **-E2BIG** if *fmt* contains too many format specifiers.
*
* **-EOVERFLOW** if an overflow happened: The same object will be tried again.
*
* int bpf_seq_write(struct seq_file *m, const void *data, u32 len)
* Description
* **bpf_seq_write**\ () uses seq_file **seq_write**\ () to write the data.
* *m* represents the seq_file. *data* points to the data to write and
* *len* is its length in bytes.
* Return
* 0 on success, or a negative error in case of failure:
*
* **-EOVERFLOW** if an overflow happened: The same object will be tried again.
*
* u64 bpf_sk_cgroup_id(struct bpf_sock *sk)
* Description
* Return the cgroup v2 id of the socket *sk*.
*
* *sk* must be a non-**NULL** pointer to a full socket, e.g. one
* returned from **bpf_sk_lookup_xxx**\ (),
* **bpf_sk_fullsock**\ (), etc. The format of the returned id is the
* same as in **bpf_skb_cgroup_id**\ ().
*
* This helper is available only if the kernel was compiled with
* the **CONFIG_SOCK_CGROUP_DATA** configuration option.
* Return
* The id is returned or 0 in case the id could not be retrieved.
*
* u64 bpf_sk_ancestor_cgroup_id(struct bpf_sock *sk, int ancestor_level)
* Description
* Return the id of the cgroup v2 ancestor, at *ancestor_level*, of the
* cgroup associated with *sk*. The root cgroup is at *ancestor_level*
* zero and each step down the hierarchy increments the level. If
* *ancestor_level* equals the level of the cgroup associated with *sk*,
* then the return value will be the same as that of
* **bpf_sk_cgroup_id**\ ().
*
* The helper is useful to implement policies based on cgroups
* that are higher in the hierarchy than the immediate cgroup
* associated with *sk*.
*
* The format of the returned id and the helper limitations are the
* same as in **bpf_sk_cgroup_id**\ ().
* Return
* The id is returned or 0 in case the id could not be retrieved.
*/
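For illustration only (not part of this patch), a minimal cgroup_skb program using the two new socket cgroup helpers documented above could look like the sketch below; ALLOWED_CGROUP_ID and the ancestor level are made-up values:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#define ALLOWED_CGROUP_ID 1234	/* hypothetical id, illustration only */

SEC("cgroup_skb/egress")
int allow_by_cgroup(struct __sk_buff *skb)
{
	struct bpf_sock *sk = skb->sk;

	if (!sk)
		return 1;
	/* both helpers require a full socket */
	sk = bpf_sk_fullsock(sk);
	if (!sk)
		return 1;
	/* pass traffic only for sockets whose level-2 ancestor cgroup matches */
	return bpf_sk_ancestor_cgroup_id(sk, 2) == ALLOWED_CGROUP_ID;
}

char _license[] SEC("license") = "GPL";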
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@ -3195,7 +3280,11 @@ union bpf_attr {
FN(get_netns_cookie), \
FN(get_current_ancestor_cgroup_id), \
FN(sk_assign), \
FN(ktime_get_boot_ns),
FN(ktime_get_boot_ns), \
FN(seq_printf), \
FN(seq_write), \
FN(sk_cgroup_id), \
FN(sk_ancestor_cgroup_id),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
@ -3673,7 +3762,7 @@ struct bpf_sock_addr {
__u32 user_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write.
* Stored in network byte order.
*/
__u32 user_port; /* Allows 4-byte read and write.
__u32 user_port; /* Allows 1,2,4-byte read and 4-byte write.
* Stored in network byte order
*/
__u32 family; /* Allows 4-byte read, but no write */

View File

@ -619,6 +619,16 @@ int bpf_link_update(int link_fd, int new_prog_fd,
return sys_bpf(BPF_LINK_UPDATE, &attr, sizeof(attr));
}
int bpf_iter_create(int link_fd)
{
union bpf_attr attr;
memset(&attr, 0, sizeof(attr));
attr.iter_create.link_fd = link_fd;
return sys_bpf(BPF_ITER_CREATE, &attr, sizeof(attr));
}
int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags,
__u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt)
{

View File

@ -187,6 +187,8 @@ struct bpf_link_update_opts {
LIBBPF_API int bpf_link_update(int link_fd, int new_prog_fd,
const struct bpf_link_update_opts *opts);
LIBBPF_API int bpf_iter_create(int link_fd);
struct bpf_prog_test_run_attr {
int prog_fd;
int repeat;

View File

@ -36,6 +36,20 @@
#define __weak __attribute__((weak))
#endif
/*
* Helper macro to manipulate data structures
*/
#ifndef offsetof
#define offsetof(TYPE, MEMBER) ((size_t)&((TYPE *)0)->MEMBER)
#endif
#ifndef container_of
#define container_of(ptr, type, member) \
({ \
void *__mptr = (void *)(ptr); \
((type *)(__mptr - offsetof(type, member))); \
})
#endif
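The macros above mirror their kernel counterparts; the new bpf_iter selftests use container_of() for pointer recovery such as the following sketch, which assumes the kernel types (struct socket, struct socket_alloc) are visible, e.g. via vmlinux.h:

/* recover the inode that embeds a given struct socket */
static struct inode *SOCK_INODE(struct socket *socket)
{
	return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
}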
/*
* Helper structure used by eBPF C program
* to describe BPF map attributes to libbpf loader

View File

@ -413,4 +413,20 @@ typeof(name(0)) name(struct pt_regs *ctx) \
} \
static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args)
/*
* BPF_SEQ_PRINTF to wrap bpf_seq_printf to-be-printed values
* in a structure.
*/
#define BPF_SEQ_PRINTF(seq, fmt, args...) \
({ \
_Pragma("GCC diagnostic push") \
_Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \
static const char ___fmt[] = fmt; \
unsigned long long ___param[] = { args }; \
_Pragma("GCC diagnostic pop") \
int ___ret = bpf_seq_printf(seq, ___fmt, sizeof(___fmt), \
___param, sizeof(___param)); \
___ret; \
})
#endif
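As a usage sketch (not from this patch), an "iter/task" program built on the new bpf_iter infrastructure can emit one line per task through the macro above; the bpf_iter__task context type and its fields are assumed to be provided by the selftests' bpf_iter definitions or vmlinux.h:

SEC("iter/task")
int dump_task(struct bpf_iter__task *ctx)
{
	struct seq_file *seq = ctx->meta->seq;
	struct task_struct *task = ctx->task;

	if (task == NULL)
		return 0;

	/* one line per task: thread group id and command name */
	BPF_SEQ_PRINTF(seq, "%8d %s\n", task->tgid, task->comm);
	return 0;
}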

View File

@ -3237,7 +3237,7 @@ int bpf_map__resize(struct bpf_map *map, __u32 max_entries)
}
static int
bpf_object__probe_name(struct bpf_object *obj)
bpf_object__probe_loading(struct bpf_object *obj)
{
struct bpf_load_program_attr attr;
char *cp, errmsg[STRERR_BUFSIZE];
@ -3257,15 +3257,36 @@ bpf_object__probe_name(struct bpf_object *obj)
ret = bpf_load_program_xattr(&attr, NULL, 0);
if (ret < 0) {
cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg));
pr_warn("Error in %s():%s(%d). Couldn't load basic 'r0 = 0' BPF program.\n",
__func__, cp, errno);
return -errno;
ret = errno;
cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg));
pr_warn("Error in %s():%s(%d). Couldn't load trivial BPF "
"program. Make sure your kernel supports BPF "
"(CONFIG_BPF_SYSCALL=y) and/or that RLIMIT_MEMLOCK is "
"set to big enough value.\n", __func__, cp, ret);
return -ret;
}
close(ret);
/* now try the same program, but with the name */
return 0;
}
static int
bpf_object__probe_name(struct bpf_object *obj)
{
struct bpf_load_program_attr attr;
struct bpf_insn insns[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
};
int ret;
/* make sure loading with name works */
memset(&attr, 0, sizeof(attr));
attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
attr.insns = insns;
attr.insns_cnt = ARRAY_SIZE(insns);
attr.license = "GPL";
attr.name = "test";
ret = bpf_load_program_xattr(&attr, NULL, 0);
if (ret >= 0) {
@ -5636,7 +5657,8 @@ int bpf_object__load_xattr(struct bpf_object_load_attr *attr)
obj->loaded = true;
err = bpf_object__probe_caps(obj);
err = bpf_object__probe_loading(obj);
err = err ? : bpf_object__probe_caps(obj);
err = err ? : bpf_object__resolve_externs(obj, obj->kconfig);
err = err ? : bpf_object__sanitize_and_load_btf(obj);
err = err ? : bpf_object__sanitize_maps(obj);
@ -6586,6 +6608,8 @@ static struct bpf_link *attach_trace(const struct bpf_sec_def *sec,
struct bpf_program *prog);
static struct bpf_link *attach_lsm(const struct bpf_sec_def *sec,
struct bpf_program *prog);
static struct bpf_link *attach_iter(const struct bpf_sec_def *sec,
struct bpf_program *prog);
static const struct bpf_sec_def section_defs[] = {
BPF_PROG_SEC("socket", BPF_PROG_TYPE_SOCKET_FILTER),
@ -6629,6 +6653,10 @@ static const struct bpf_sec_def section_defs[] = {
.is_attach_btf = true,
.expected_attach_type = BPF_LSM_MAC,
.attach_fn = attach_lsm),
SEC_DEF("iter/", TRACING,
.expected_attach_type = BPF_TRACE_ITER,
.is_attach_btf = true,
.attach_fn = attach_iter),
BPF_PROG_SEC("xdp", BPF_PROG_TYPE_XDP),
BPF_PROG_SEC("perf_event", BPF_PROG_TYPE_PERF_EVENT),
BPF_PROG_SEC("lwt_in", BPF_PROG_TYPE_LWT_IN),
@ -6891,6 +6919,7 @@ invalid_prog:
#define BTF_TRACE_PREFIX "btf_trace_"
#define BTF_LSM_PREFIX "bpf_lsm_"
#define BTF_ITER_PREFIX "bpf_iter_"
#define BTF_MAX_NAME_SIZE 128
static int find_btf_by_prefix_kind(const struct btf *btf, const char *prefix,
@ -6921,6 +6950,9 @@ static inline int __find_vmlinux_btf_id(struct btf *btf, const char *name,
else if (attach_type == BPF_LSM_MAC)
err = find_btf_by_prefix_kind(btf, BTF_LSM_PREFIX, name,
BTF_KIND_FUNC);
else if (attach_type == BPF_TRACE_ITER)
err = find_btf_by_prefix_kind(btf, BTF_ITER_PREFIX, name,
BTF_KIND_FUNC);
else
err = btf__find_by_name_kind(btf, name, BTF_KIND_FUNC);
@ -7848,6 +7880,12 @@ static struct bpf_link *attach_lsm(const struct bpf_sec_def *sec,
return bpf_program__attach_lsm(prog);
}
static struct bpf_link *attach_iter(const struct bpf_sec_def *sec,
struct bpf_program *prog)
{
return bpf_program__attach_iter(prog, NULL);
}
struct bpf_link *
bpf_program__attach_cgroup(struct bpf_program *prog, int cgroup_fd)
{
@ -7882,6 +7920,42 @@ bpf_program__attach_cgroup(struct bpf_program *prog, int cgroup_fd)
return link;
}
struct bpf_link *
bpf_program__attach_iter(struct bpf_program *prog,
const struct bpf_iter_attach_opts *opts)
{
char errmsg[STRERR_BUFSIZE];
struct bpf_link *link;
int prog_fd, link_fd;
if (!OPTS_VALID(opts, bpf_iter_attach_opts))
return ERR_PTR(-EINVAL);
prog_fd = bpf_program__fd(prog);
if (prog_fd < 0) {
pr_warn("program '%s': can't attach before loaded\n",
bpf_program__title(prog, false));
return ERR_PTR(-EINVAL);
}
link = calloc(1, sizeof(*link));
if (!link)
return ERR_PTR(-ENOMEM);
link->detach = &bpf_link__detach_fd;
link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_ITER, NULL);
if (link_fd < 0) {
link_fd = -errno;
free(link);
pr_warn("program '%s': failed to attach to iterator: %s\n",
bpf_program__title(prog, false),
libbpf_strerror_r(link_fd, errmsg, sizeof(errmsg)));
return ERR_PTR(link_fd);
}
link->fd = link_fd;
return link;
}
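Combined with bpf_iter_create() from bpf.c, a typical user-space consumer of an iterator program looks roughly like the sketch below (drain_iter() is an illustrative name, not part of the patch):

#include <unistd.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

static int drain_iter(struct bpf_program *prog)
{
	struct bpf_link *link;
	char buf[64];
	int iter_fd, len, err = 0;

	link = bpf_program__attach_iter(prog, NULL);
	if (libbpf_get_error(link))
		return -1;

	iter_fd = bpf_iter_create(bpf_link__fd(link));
	if (iter_fd < 0) {
		err = -1;
		goto out;
	}

	/* each read() runs the iterator program over the next batch of objects */
	while ((len = read(iter_fd, buf, sizeof(buf))) > 0)
		;
	if (len < 0)
		err = -1;
	close(iter_fd);
out:
	bpf_link__destroy(link);
	return err;
}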
struct bpf_link *bpf_program__attach(struct bpf_program *prog)
{
const struct bpf_sec_def *sec_def;
@ -8300,7 +8374,7 @@ error:
struct perf_sample_raw {
struct perf_event_header header;
uint32_t size;
char data[0];
char data[];
};
struct perf_sample_lost {

View File

@ -258,6 +258,15 @@ struct bpf_map;
LIBBPF_API struct bpf_link *bpf_map__attach_struct_ops(struct bpf_map *map);
struct bpf_iter_attach_opts {
size_t sz; /* size of this struct for forward/backward compatibility */
};
#define bpf_iter_attach_opts__last_field sz
LIBBPF_API struct bpf_link *
bpf_program__attach_iter(struct bpf_program *prog,
const struct bpf_iter_attach_opts *opts);
struct bpf_insn;
/*

View File

@ -258,6 +258,8 @@ LIBBPF_0.0.8 {
LIBBPF_0.0.9 {
global:
bpf_enable_stats;
bpf_iter_create;
bpf_link_get_fd_by_id;
bpf_link_get_next_id;
bpf_program__attach_iter;
} LIBBPF_0.0.8;

View File

@ -153,7 +153,7 @@ struct btf_ext_info_sec {
__u32 sec_name_off;
__u32 num_info;
/* Followed by num_info * record_size number of bytes */
__u8 data[0];
__u8 data[];
};
/* The minimum bpf_func_info checked by the loader */

View File

@ -686,8 +686,11 @@ try_again_reset:
break;
}
}
if (child_pid != -1)
if (child_pid != -1) {
if (timeout)
kill(child_pid, SIGTERM);
wait4(child_pid, &status, 0, &stat_config.ru_data);
}
if (workload_exec_errno) {
const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));

View File

@ -1821,6 +1821,24 @@ static int symbol__disassemble_bpf(struct symbol *sym __maybe_unused,
}
#endif // defined(HAVE_LIBBFD_SUPPORT) && defined(HAVE_LIBBPF_SUPPORT)
static int
symbol__disassemble_bpf_image(struct symbol *sym,
struct annotate_args *args)
{
struct annotation *notes = symbol__annotation(sym);
struct disasm_line *dl;
args->offset = -1;
args->line = strdup("to be implemented");
args->line_nr = 0;
dl = disasm_line__new(args);
if (dl)
annotation_line__add(&dl->al, &notes->src->source);
free(args->line);
return 0;
}
/*
* Possibly create a new version of line with tabs expanded. Returns the
* existing or new line, storage is updated if a new line is allocated. If
@ -1920,6 +1938,8 @@ static int symbol__disassemble(struct symbol *sym, struct annotate_args *args)
if (dso->binary_type == DSO_BINARY_TYPE__BPF_PROG_INFO) {
return symbol__disassemble_bpf(sym, args);
} else if (dso->binary_type == DSO_BINARY_TYPE__BPF_IMAGE) {
return symbol__disassemble_bpf_image(sym, args);
} else if (dso__is_kcore(dso)) {
kce.kcore_filename = symfs_filename;
kce.addr = map__rip_2objdump(map, sym->start);

View File

@ -6,6 +6,9 @@
#include <bpf/libbpf.h>
#include <linux/btf.h>
#include <linux/err.h>
#include <linux/string.h>
#include <internal/lib.h>
#include <symbol/kallsyms.h>
#include "bpf-event.h"
#include "debug.h"
#include "dso.h"
@ -290,11 +293,82 @@ out:
return err ? -1 : 0;
}
struct kallsyms_parse {
union perf_event *event;
perf_event__handler_t process;
struct machine *machine;
struct perf_tool *tool;
};
static int
process_bpf_image(char *name, u64 addr, struct kallsyms_parse *data)
{
struct machine *machine = data->machine;
union perf_event *event = data->event;
struct perf_record_ksymbol *ksymbol;
int len;
ksymbol = &event->ksymbol;
*ksymbol = (struct perf_record_ksymbol) {
.header = {
.type = PERF_RECORD_KSYMBOL,
.size = offsetof(struct perf_record_ksymbol, name),
},
.addr = addr,
.len = page_size,
.ksym_type = PERF_RECORD_KSYMBOL_TYPE_BPF,
.flags = 0,
};
len = scnprintf(ksymbol->name, KSYM_NAME_LEN, "%s", name);
ksymbol->header.size += PERF_ALIGN(len + 1, sizeof(u64));
memset((void *) event + event->header.size, 0, machine->id_hdr_size);
event->header.size += machine->id_hdr_size;
return perf_tool__process_synth_event(data->tool, event, machine,
data->process);
}
static int
kallsyms_process_symbol(void *data, const char *_name,
char type __maybe_unused, u64 start)
{
char disp[KSYM_NAME_LEN];
char *module, *name;
unsigned long id;
int err = 0;
module = strchr(_name, '\t');
if (!module)
return 0;
/* We are going after [bpf] module ... */
if (strcmp(module + 1, "[bpf]"))
return 0;
name = memdup(_name, (module - _name) + 1);
if (!name)
return -ENOMEM;
name[module - _name] = 0;
/* .. and only for trampolines and dispatchers */
if ((sscanf(name, "bpf_trampoline_%lu", &id) == 1) ||
(sscanf(name, "bpf_dispatcher_%s", disp) == 1))
err = process_bpf_image(name, start, data);
free(name);
return err;
}
int perf_event__synthesize_bpf_events(struct perf_session *session,
perf_event__handler_t process,
struct machine *machine,
struct record_opts *opts)
{
const char *kallsyms_filename = "/proc/kallsyms";
struct kallsyms_parse arg;
union perf_event *event;
__u32 id = 0;
int err;
@ -303,6 +377,8 @@ int perf_event__synthesize_bpf_events(struct perf_session *session,
event = malloc(sizeof(event->bpf) + KSYM_NAME_LEN + machine->id_hdr_size);
if (!event)
return -1;
/* Synthesize all the bpf programs in system. */
while (true) {
err = bpf_prog_get_next_id(id, &id);
if (err) {
@ -335,6 +411,23 @@ int perf_event__synthesize_bpf_events(struct perf_session *session,
break;
}
}
/* Synthesize all the bpf images - trampolines/dispatchers. */
if (symbol_conf.kallsyms_name != NULL)
kallsyms_filename = symbol_conf.kallsyms_name;
arg = (struct kallsyms_parse) {
.event = event,
.process = process,
.machine = machine,
.tool = session->tool,
};
if (kallsyms__parse(kallsyms_filename, &arg, kallsyms_process_symbol)) {
pr_err("%s: failed to synthesize bpf images: %s\n",
__func__, strerror(errno));
}
free(event);
return err;
}

View File

@ -191,6 +191,7 @@ int dso__read_binary_type_filename(const struct dso *dso,
case DSO_BINARY_TYPE__GUEST_KALLSYMS:
case DSO_BINARY_TYPE__JAVA_JIT:
case DSO_BINARY_TYPE__BPF_PROG_INFO:
case DSO_BINARY_TYPE__BPF_IMAGE:
case DSO_BINARY_TYPE__NOT_FOUND:
ret = -1;
break;

View File

@ -40,6 +40,7 @@ enum dso_binary_type {
DSO_BINARY_TYPE__GUEST_KCORE,
DSO_BINARY_TYPE__OPENEMBEDDED_DEBUGINFO,
DSO_BINARY_TYPE__BPF_PROG_INFO,
DSO_BINARY_TYPE__BPF_IMAGE,
DSO_BINARY_TYPE__NOT_FOUND,
};

View File

@ -736,6 +736,12 @@ int machine__process_switch_event(struct machine *machine __maybe_unused,
return 0;
}
static int is_bpf_image(const char *name)
{
return !strncmp(name, "bpf_trampoline_", sizeof("bpf_trampoline_") - 1) ||
       !strncmp(name, "bpf_dispatcher_", sizeof("bpf_dispatcher_") - 1);
}
static int machine__process_ksymbol_register(struct machine *machine,
union perf_event *event,
struct perf_sample *sample __maybe_unused)
@ -759,6 +765,12 @@ static int machine__process_ksymbol_register(struct machine *machine,
map->start = event->ksymbol.addr;
map->end = map->start + event->ksymbol.len;
maps__insert(&machine->kmaps, map);
dso__set_loaded(dso);
if (is_bpf_image(event->ksymbol.name)) {
dso->binary_type = DSO_BINARY_TYPE__BPF_IMAGE;
dso__set_long_name(dso, "", false);
}
}
sym = symbol__new(map->map_ip(map, map->start),

View File

@ -1544,6 +1544,7 @@ static bool dso__is_compatible_symtab_type(struct dso *dso, bool kmod,
return true;
case DSO_BINARY_TYPE__BPF_PROG_INFO:
case DSO_BINARY_TYPE__BPF_IMAGE:
case DSO_BINARY_TYPE__NOT_FOUND:
default:
return false;

View File

@ -38,3 +38,4 @@ test_cpp
/bpf_gcc
/tools
/runqslower
/bench

View File

@ -77,7 +77,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \
# Compile but not part of 'make run_tests'
TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \
flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \
test_lirc_mode2_user xdping test_cpp runqslower
test_lirc_mode2_user xdping test_cpp runqslower bench
TEST_CUSTOM_PROGS = urandom_read
@ -265,6 +265,7 @@ TRUNNER_BPF_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.o, $$(TRUNNER_BPF_SRCS)
TRUNNER_BPF_SKELS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.skel.h, \
$$(filter-out $(SKEL_BLACKLIST), \
$$(TRUNNER_BPF_SRCS)))
TEST_GEN_FILES += $$(TRUNNER_BPF_OBJS)
# Evaluate rules now with extra TRUNNER_XXX variables above already defined
$$(eval $$(call DEFINE_TEST_RUNNER_RULES,$1,$2))
@ -354,6 +355,7 @@ endef
TRUNNER_TESTS_DIR := prog_tests
TRUNNER_BPF_PROGS_DIR := progs
TRUNNER_EXTRA_SOURCES := test_progs.c cgroup_helpers.c trace_helpers.c \
network_helpers.c testing_helpers.c \
flow_dissector_load.h
TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read \
$(wildcard progs/btf_dump_test_case_*.c)
@ -405,6 +407,21 @@ $(OUTPUT)/test_cpp: test_cpp.cpp $(OUTPUT)/test_core_extern.skel.h $(BPFOBJ)
$(call msg,CXX,,$@)
$(CXX) $(CFLAGS) $^ $(LDLIBS) -o $@
# Benchmark runner
$(OUTPUT)/bench_%.o: benchs/bench_%.c bench.h
$(call msg,CC,,$@)
$(CC) $(CFLAGS) -c $(filter %.c,$^) $(LDLIBS) -o $@
$(OUTPUT)/bench_rename.o: $(OUTPUT)/test_overhead.skel.h
$(OUTPUT)/bench_trigger.o: $(OUTPUT)/trigger_bench.skel.h
$(OUTPUT)/bench.o: bench.h testing_helpers.h
$(OUTPUT)/bench: LDLIBS += -lm
$(OUTPUT)/bench: $(OUTPUT)/bench.o $(OUTPUT)/testing_helpers.o \
$(OUTPUT)/bench_count.o \
$(OUTPUT)/bench_rename.o \
$(OUTPUT)/bench_trigger.o
$(call msg,BINARY,,$@)
$(CC) $(LDFLAGS) -o $@ $(filter %.a %.o,$^) $(LDLIBS)
EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) $(SCRATCH_DIR) \
prog_tests/tests.h map_tests/tests.h verifier/tests.h \
feature \

View File

@ -0,0 +1,43 @@
==================
BPF Selftest Notes
==================
Additional information about selftest failures is
documented here.
bpf_iter test failures with clang/llvm 10.0.0
=============================================
With clang/llvm 10.0.0, the following two bpf_iter tests failed:
* ``bpf_iter/ipv6_route``
* ``bpf_iter/netlink``
The symptom for ``bpf_iter/ipv6_route`` looks like
.. code-block:: c
2: (79) r8 = *(u64 *)(r1 +8)
...
14: (bf) r2 = r8
15: (0f) r2 += r1
; BPF_SEQ_PRINTF(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen);
16: (7b) *(u64 *)(r8 +64) = r2
only read is supported
The symptom for ``bpf_iter/netlink`` looks like
.. code-block:: c
; struct netlink_sock *nlk = ctx->sk;
2: (79) r7 = *(u64 *)(r1 +8)
...
15: (bf) r2 = r7
16: (0f) r2 += r1
; BPF_SEQ_PRINTF(seq, "%pK %-3d ", s, s->sk_protocol);
17: (7b) *(u64 *)(r7 +0) = r2
only read is supported
This is due to an llvm BPF backend bug. The fix
https://reviews.llvm.org/D78466
has been pushed to the llvm 10.x release branch and will be
available in 10.0.1. The fix is already available in llvm 11.0.0 trunk.

View File

@ -0,0 +1,449 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
#define _GNU_SOURCE
#include <argp.h>
#include <linux/compiler.h>
#include <sys/time.h>
#include <sched.h>
#include <fcntl.h>
#include <pthread.h>
#include <sys/sysinfo.h>
#include <sys/resource.h>
#include <signal.h>
#include "bench.h"
#include "testing_helpers.h"
struct env env = {
.warmup_sec = 1,
.duration_sec = 5,
.affinity = false,
.consumer_cnt = 1,
.producer_cnt = 1,
};
static int libbpf_print_fn(enum libbpf_print_level level,
const char *format, va_list args)
{
if (level == LIBBPF_DEBUG && !env.verbose)
return 0;
return vfprintf(stderr, format, args);
}
static int bump_memlock_rlimit(void)
{
struct rlimit rlim_new = {
.rlim_cur = RLIM_INFINITY,
.rlim_max = RLIM_INFINITY,
};
return setrlimit(RLIMIT_MEMLOCK, &rlim_new);
}
void setup_libbpf()
{
int err;
libbpf_set_print(libbpf_print_fn);
err = bump_memlock_rlimit();
if (err)
fprintf(stderr, "failed to increase RLIMIT_MEMLOCK: %d", err);
}
void hits_drops_report_progress(int iter, struct bench_res *res, long delta_ns)
{
double hits_per_sec, drops_per_sec;
double hits_per_prod;
hits_per_sec = res->hits / 1000000.0 / (delta_ns / 1000000000.0);
hits_per_prod = hits_per_sec / env.producer_cnt;
drops_per_sec = res->drops / 1000000.0 / (delta_ns / 1000000000.0);
printf("Iter %3d (%7.3lfus): ",
iter, (delta_ns - 1000000000) / 1000.0);
printf("hits %8.3lfM/s (%7.3lfM/prod), drops %8.3lfM/s\n",
hits_per_sec, hits_per_prod, drops_per_sec);
}
void hits_drops_report_final(struct bench_res res[], int res_cnt)
{
int i;
double hits_mean = 0.0, drops_mean = 0.0;
double hits_stddev = 0.0, drops_stddev = 0.0;
for (i = 0; i < res_cnt; i++) {
hits_mean += res[i].hits / 1000000.0 / (0.0 + res_cnt);
drops_mean += res[i].drops / 1000000.0 / (0.0 + res_cnt);
}
if (res_cnt > 1) {
for (i = 0; i < res_cnt; i++) {
hits_stddev += (hits_mean - res[i].hits / 1000000.0) *
(hits_mean - res[i].hits / 1000000.0) /
(res_cnt - 1.0);
drops_stddev += (drops_mean - res[i].drops / 1000000.0) *
(drops_mean - res[i].drops / 1000000.0) /
(res_cnt - 1.0);
}
hits_stddev = sqrt(hits_stddev);
drops_stddev = sqrt(drops_stddev);
}
printf("Summary: hits %8.3lf \u00B1 %5.3lfM/s (%7.3lfM/prod), ",
hits_mean, hits_stddev, hits_mean / env.producer_cnt);
printf("drops %8.3lf \u00B1 %5.3lfM/s\n",
drops_mean, drops_stddev);
}
const char *argp_program_version = "benchmark";
const char *argp_program_bug_address = "<bpf@vger.kernel.org>";
const char argp_program_doc[] =
"benchmark Generic benchmarking framework.\n"
"\n"
"This tool runs benchmarks.\n"
"\n"
"USAGE: benchmark <bench-name>\n"
"\n"
"EXAMPLES:\n"
" # run 'count-local' benchmark with 1 producer and 1 consumer\n"
" benchmark count-local\n"
" # run 'count-local' with 16 producer and 8 consumer thread, pinned to CPUs\n"
" benchmark -p16 -c8 -a count-local\n";
enum {
ARG_PROD_AFFINITY_SET = 1000,
ARG_CONS_AFFINITY_SET = 1001,
};
static const struct argp_option opts[] = {
{ "list", 'l', NULL, 0, "List available benchmarks"},
{ "duration", 'd', "SEC", 0, "Duration of benchmark, seconds"},
{ "warmup", 'w', "SEC", 0, "Warm-up period, seconds"},
{ "producers", 'p', "NUM", 0, "Number of producer threads"},
{ "consumers", 'c', "NUM", 0, "Number of consumer threads"},
{ "verbose", 'v', NULL, 0, "Verbose debug output"},
{ "affinity", 'a', NULL, 0, "Set consumer/producer thread affinity"},
{ "prod-affinity", ARG_PROD_AFFINITY_SET, "CPUSET", 0,
"Set of CPUs for producer threads; implies --affinity"},
{ "cons-affinity", ARG_CONS_AFFINITY_SET, "CPUSET", 0,
"Set of CPUs for consumer threads; implies --affinity"},
{},
};
static error_t parse_arg(int key, char *arg, struct argp_state *state)
{
static int pos_args;
switch (key) {
case 'v':
env.verbose = true;
break;
case 'l':
env.list = true;
break;
case 'd':
env.duration_sec = strtol(arg, NULL, 10);
if (env.duration_sec <= 0) {
fprintf(stderr, "Invalid duration: %s\n", arg);
argp_usage(state);
}
break;
case 'w':
env.warmup_sec = strtol(arg, NULL, 10);
if (env.warmup_sec <= 0) {
fprintf(stderr, "Invalid warm-up duration: %s\n", arg);
argp_usage(state);
}
break;
case 'p':
env.producer_cnt = strtol(arg, NULL, 10);
if (env.producer_cnt <= 0) {
fprintf(stderr, "Invalid producer count: %s\n", arg);
argp_usage(state);
}
break;
case 'c':
env.consumer_cnt = strtol(arg, NULL, 10);
if (env.consumer_cnt <= 0) {
fprintf(stderr, "Invalid consumer count: %s\n", arg);
argp_usage(state);
}
break;
case 'a':
env.affinity = true;
break;
case ARG_PROD_AFFINITY_SET:
env.affinity = true;
if (parse_num_list(arg, &env.prod_cpus.cpus,
&env.prod_cpus.cpus_len)) {
fprintf(stderr, "Invalid format of CPU set for producers.");
argp_usage(state);
}
break;
case ARG_CONS_AFFINITY_SET:
env.affinity = true;
if (parse_num_list(arg, &env.cons_cpus.cpus,
&env.cons_cpus.cpus_len)) {
fprintf(stderr, "Invalid format of CPU set for consumers.");
argp_usage(state);
}
break;
case ARGP_KEY_ARG:
if (pos_args++) {
fprintf(stderr,
"Unrecognized positional argument: %s\n", arg);
argp_usage(state);
}
env.bench_name = strdup(arg);
break;
default:
return ARGP_ERR_UNKNOWN;
}
return 0;
}
static void parse_cmdline_args(int argc, char **argv)
{
static const struct argp argp = {
.options = opts,
.parser = parse_arg,
.doc = argp_program_doc,
};
if (argp_parse(&argp, argc, argv, 0, NULL, NULL))
exit(1);
if (!env.list && !env.bench_name) {
argp_help(&argp, stderr, ARGP_HELP_DOC, "bench");
exit(1);
}
}
static void collect_measurements(long delta_ns);
static __u64 last_time_ns;
static void sigalarm_handler(int signo)
{
long new_time_ns = get_time_ns();
long delta_ns = new_time_ns - last_time_ns;
collect_measurements(delta_ns);
last_time_ns = new_time_ns;
}
/* set up periodic 1-second timer */
static void setup_timer()
{
static struct sigaction sigalarm_action = {
.sa_handler = sigalarm_handler,
};
struct itimerval timer_settings = {};
int err;
last_time_ns = get_time_ns();
err = sigaction(SIGALRM, &sigalarm_action, NULL);
if (err < 0) {
fprintf(stderr, "failed to install SIGALRM handler: %d\n", -errno);
exit(1);
}
timer_settings.it_interval.tv_sec = 1;
timer_settings.it_value.tv_sec = 1;
err = setitimer(ITIMER_REAL, &timer_settings, NULL);
if (err < 0) {
fprintf(stderr, "failed to arm interval timer: %d\n", -errno);
exit(1);
}
}
static void set_thread_affinity(pthread_t thread, int cpu)
{
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
CPU_SET(cpu, &cpuset);
if (pthread_setaffinity_np(thread, sizeof(cpuset), &cpuset)) {
fprintf(stderr, "setting affinity to CPU #%d failed: %d\n",
cpu, errno);
exit(1);
}
}
static int next_cpu(struct cpu_set *cpu_set)
{
if (cpu_set->cpus) {
int i;
/* find next available CPU */
for (i = cpu_set->next_cpu; i < cpu_set->cpus_len; i++) {
if (cpu_set->cpus[i]) {
cpu_set->next_cpu = i + 1;
return i;
}
}
fprintf(stderr, "Not enough CPUs specified, need CPU #%d or higher.\n", i);
exit(1);
}
return cpu_set->next_cpu++;
}
static struct bench_state {
int res_cnt;
struct bench_res *results;
pthread_t *consumers;
pthread_t *producers;
} state;
const struct bench *bench = NULL;
extern const struct bench bench_count_global;
extern const struct bench bench_count_local;
extern const struct bench bench_rename_base;
extern const struct bench bench_rename_kprobe;
extern const struct bench bench_rename_kretprobe;
extern const struct bench bench_rename_rawtp;
extern const struct bench bench_rename_fentry;
extern const struct bench bench_rename_fexit;
extern const struct bench bench_rename_fmodret;
extern const struct bench bench_trig_base;
extern const struct bench bench_trig_tp;
extern const struct bench bench_trig_rawtp;
extern const struct bench bench_trig_kprobe;
extern const struct bench bench_trig_fentry;
extern const struct bench bench_trig_fmodret;
static const struct bench *benchs[] = {
&bench_count_global,
&bench_count_local,
&bench_rename_base,
&bench_rename_kprobe,
&bench_rename_kretprobe,
&bench_rename_rawtp,
&bench_rename_fentry,
&bench_rename_fexit,
&bench_rename_fmodret,
&bench_trig_base,
&bench_trig_tp,
&bench_trig_rawtp,
&bench_trig_kprobe,
&bench_trig_fentry,
&bench_trig_fmodret,
};
static void setup_benchmark()
{
int i, err;
if (!env.bench_name) {
fprintf(stderr, "benchmark name is not specified\n");
exit(1);
}
for (i = 0; i < ARRAY_SIZE(benchs); i++) {
if (strcmp(benchs[i]->name, env.bench_name) == 0) {
bench = benchs[i];
break;
}
}
if (!bench) {
fprintf(stderr, "benchmark '%s' not found\n", env.bench_name);
exit(1);
}
printf("Setting up benchmark '%s'...\n", bench->name);
state.producers = calloc(env.producer_cnt, sizeof(*state.producers));
state.consumers = calloc(env.consumer_cnt, sizeof(*state.consumers));
state.results = calloc(env.duration_sec + env.warmup_sec + 2,
sizeof(*state.results));
if (!state.producers || !state.consumers || !state.results)
exit(1);
if (bench->validate)
bench->validate();
if (bench->setup)
bench->setup();
for (i = 0; i < env.consumer_cnt; i++) {
err = pthread_create(&state.consumers[i], NULL,
bench->consumer_thread, (void *)(long)i);
if (err) {
fprintf(stderr, "failed to create consumer thread #%d: %d\n",
i, -errno);
exit(1);
}
if (env.affinity)
set_thread_affinity(state.consumers[i],
next_cpu(&env.cons_cpus));
}
/* unless explicit producer CPU list is specified, continue after
* last consumer CPU
*/
if (!env.prod_cpus.cpus)
env.prod_cpus.next_cpu = env.cons_cpus.next_cpu;
for (i = 0; i < env.producer_cnt; i++) {
err = pthread_create(&state.producers[i], NULL,
bench->producer_thread, (void *)(long)i);
if (err) {
fprintf(stderr, "failed to create producer thread #%d: %d\n",
i, -errno);
exit(1);
}
if (env.affinity)
set_thread_affinity(state.producers[i],
next_cpu(&env.prod_cpus));
}
printf("Benchmark '%s' started.\n", bench->name);
}
static pthread_mutex_t bench_done_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t bench_done = PTHREAD_COND_INITIALIZER;
static void collect_measurements(long delta_ns) {
int iter = state.res_cnt++;
struct bench_res *res = &state.results[iter];
bench->measure(res);
if (bench->report_progress)
bench->report_progress(iter, res, delta_ns);
if (iter == env.duration_sec + env.warmup_sec) {
pthread_mutex_lock(&bench_done_mtx);
pthread_cond_signal(&bench_done);
pthread_mutex_unlock(&bench_done_mtx);
}
}
int main(int argc, char **argv)
{
parse_cmdline_args(argc, argv);
if (env.list) {
int i;
printf("Available benchmarks:\n");
for (i = 0; i < ARRAY_SIZE(benchs); i++) {
printf("- %s\n", benchs[i]->name);
}
return 0;
}
setup_benchmark();
setup_timer();
pthread_mutex_lock(&bench_done_mtx);
pthread_cond_wait(&bench_done, &bench_done_mtx);
pthread_mutex_unlock(&bench_done_mtx);
if (bench->report_final)
/* skip first sample */
bench->report_final(state.results + env.warmup_sec,
state.res_cnt - env.warmup_sec);
return 0;
}

View File

@ -0,0 +1,81 @@
/* SPDX-License-Identifier: GPL-2.0 */
#pragma once
#include <stdlib.h>
#include <stdbool.h>
#include <linux/err.h>
#include <errno.h>
#include <unistd.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>
#include <math.h>
#include <time.h>
#include <sys/syscall.h>
struct cpu_set {
bool *cpus;
int cpus_len;
int next_cpu;
};
struct env {
char *bench_name;
int duration_sec;
int warmup_sec;
bool verbose;
bool list;
bool affinity;
int consumer_cnt;
int producer_cnt;
struct cpu_set prod_cpus;
struct cpu_set cons_cpus;
};
struct bench_res {
long hits;
long drops;
};
struct bench {
const char *name;
void (*validate)();
void (*setup)();
void *(*producer_thread)(void *ctx);
void *(*consumer_thread)(void *ctx);
void (*measure)(struct bench_res* res);
void (*report_progress)(int iter, struct bench_res* res, long delta_ns);
void (*report_final)(struct bench_res res[], int res_cnt);
};
struct counter {
long value;
} __attribute__((aligned(128)));
extern struct env env;
extern const struct bench *bench;
void setup_libbpf();
void hits_drops_report_progress(int iter, struct bench_res *res, long delta_ns);
void hits_drops_report_final(struct bench_res res[], int res_cnt);
static inline __u64 get_time_ns() {
struct timespec t;
clock_gettime(CLOCK_MONOTONIC, &t);
return (u64)t.tv_sec * 1000000000 + t.tv_nsec;
}
static inline void atomic_inc(long *value)
{
(void)__atomic_add_fetch(value, 1, __ATOMIC_RELAXED);
}
static inline void atomic_add(long *value, long n)
{
(void)__atomic_add_fetch(value, n, __ATOMIC_RELAXED);
}
static inline long atomic_swap(long *value, long n)
{
return __atomic_exchange_n(value, n, __ATOMIC_RELAXED);
}

View File

@ -0,0 +1,91 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
#include "bench.h"
/* COUNT-GLOBAL benchmark */
static struct count_global_ctx {
struct counter hits;
} count_global_ctx;
static void *count_global_producer(void *input)
{
struct count_global_ctx *ctx = &count_global_ctx;
while (true) {
atomic_inc(&ctx->hits.value);
}
return NULL;
}
static void *count_global_consumer(void *input)
{
return NULL;
}
static void count_global_measure(struct bench_res *res)
{
struct count_global_ctx *ctx = &count_global_ctx;
res->hits = atomic_swap(&ctx->hits.value, 0);
}
/* COUNT-local benchmark */
static struct count_local_ctx {
struct counter *hits;
} count_local_ctx;
static void count_local_setup()
{
struct count_local_ctx *ctx = &count_local_ctx;
ctx->hits = calloc(env.consumer_cnt, sizeof(*ctx->hits));
if (!ctx->hits)
exit(1);
}
static void *count_local_producer(void *input)
{
struct count_local_ctx *ctx = &count_local_ctx;
int idx = (long)input;
while (true) {
atomic_inc(&ctx->hits[idx].value);
}
return NULL;
}
static void *count_local_consumer(void *input)
{
return NULL;
}
static void count_local_measure(struct bench_res *res)
{
struct count_local_ctx *ctx = &count_local_ctx;
int i;
for (i = 0; i < env.producer_cnt; i++) {
res->hits += atomic_swap(&ctx->hits[i].value, 0);
}
}
const struct bench bench_count_global = {
.name = "count-global",
.producer_thread = count_global_producer,
.consumer_thread = count_global_consumer,
.measure = count_global_measure,
.report_progress = hits_drops_report_progress,
.report_final = hits_drops_report_final,
};
const struct bench bench_count_local = {
.name = "count-local",
.setup = count_local_setup,
.producer_thread = count_local_producer,
.consumer_thread = count_local_consumer,
.measure = count_local_measure,
.report_progress = hits_drops_report_progress,
.report_final = hits_drops_report_final,
};

View File

@ -0,0 +1,195 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
#include <fcntl.h>
#include "bench.h"
#include "test_overhead.skel.h"
/* BPF triggering benchmarks */
static struct ctx {
struct test_overhead *skel;
struct counter hits;
int fd;
} ctx;
static void validate()
{
if (env.producer_cnt != 1) {
fprintf(stderr, "benchmark doesn't support multi-producer!\n");
exit(1);
}
if (env.consumer_cnt != 1) {
fprintf(stderr, "benchmark doesn't support multi-consumer!\n");
exit(1);
}
}
static void *producer(void *input)
{
char buf[] = "test_overhead";
int err;
while (true) {
err = write(ctx.fd, buf, sizeof(buf));
if (err < 0) {
fprintf(stderr, "write failed\n");
exit(1);
}
atomic_inc(&ctx.hits.value);
}
}
static void measure(struct bench_res *res)
{
res->hits = atomic_swap(&ctx.hits.value, 0);
}
static void setup_ctx()
{
setup_libbpf();
ctx.skel = test_overhead__open_and_load();
if (!ctx.skel) {
fprintf(stderr, "failed to open skeleton\n");
exit(1);
}
ctx.fd = open("/proc/self/comm", O_WRONLY|O_TRUNC);
if (ctx.fd < 0) {
fprintf(stderr, "failed to open /proc/self/comm: %d\n", -errno);
exit(1);
}
}
static void attach_bpf(struct bpf_program *prog)
{
struct bpf_link *link;
link = bpf_program__attach(prog);
if (IS_ERR(link)) {
fprintf(stderr, "failed to attach program!\n");
exit(1);
}
}
static void setup_base()
{
setup_ctx();
}
static void setup_kprobe()
{
setup_ctx();
attach_bpf(ctx.skel->progs.prog1);
}
static void setup_kretprobe()
{
setup_ctx();
attach_bpf(ctx.skel->progs.prog2);
}
static void setup_rawtp()
{
setup_ctx();
attach_bpf(ctx.skel->progs.prog3);
}
static void setup_fentry()
{
setup_ctx();
attach_bpf(ctx.skel->progs.prog4);
}
static void setup_fexit()
{
setup_ctx();
attach_bpf(ctx.skel->progs.prog5);
}
static void setup_fmodret()
{
setup_ctx();
attach_bpf(ctx.skel->progs.prog6);
}
static void *consumer(void *input)
{
return NULL;
}
const struct bench bench_rename_base = {
.name = "rename-base",
.validate = validate,
.setup = setup_base,
.producer_thread = producer,
.consumer_thread = consumer,
.measure = measure,
.report_progress = hits_drops_report_progress,
.report_final = hits_drops_report_final,
};
const struct bench bench_rename_kprobe = {
.name = "rename-kprobe",
.validate = validate,
.setup = setup_kprobe,
.producer_thread = producer,
.consumer_thread = consumer,
.measure = measure,
.report_progress = hits_drops_report_progress,
.report_final = hits_drops_report_final,
};
const struct bench bench_rename_kretprobe = {
.name = "rename-kretprobe",
.validate = validate,
.setup = setup_kretprobe,
.producer_thread = producer,
.consumer_thread = consumer,
.measure = measure,
.report_progress = hits_drops_report_progress,
.report_final = hits_drops_report_final,
};
const struct bench bench_rename_rawtp = {
.name = "rename-rawtp",
.validate = validate,
.setup = setup_rawtp,
.producer_thread = producer,
.consumer_thread = consumer,
.measure = measure,
.report_progress = hits_drops_report_progress,
.report_final = hits_drops_report_final,
};
const struct bench bench_rename_fentry = {
.name = "rename-fentry",
.validate = validate,
.setup = setup_fentry,
.producer_thread = producer,
.consumer_thread = consumer,
.measure = measure,
.report_progress = hits_drops_report_progress,
.report_final = hits_drops_report_final,
};
const struct bench bench_rename_fexit = {
.name = "rename-fexit",
.validate = validate,
.setup = setup_fexit,
.producer_thread = producer,
.consumer_thread = consumer,
.measure = measure,
.report_progress = hits_drops_report_progress,
.report_final = hits_drops_report_final,
};
const struct bench bench_rename_fmodret = {
.name = "rename-fmodret",
.validate = validate,
.setup = setup_fmodret,
.producer_thread = producer,
.consumer_thread = consumer,
.measure = measure,
.report_progress = hits_drops_report_progress,
.report_final = hits_drops_report_final,
};

View File

@ -0,0 +1,167 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
#include "bench.h"
#include "trigger_bench.skel.h"
/* BPF triggering benchmarks */
static struct trigger_ctx {
struct trigger_bench *skel;
} ctx;
static struct counter base_hits;
static void trigger_validate()
{
if (env.consumer_cnt != 1) {
fprintf(stderr, "benchmark doesn't support multi-consumer!\n");
exit(1);
}
}
static void *trigger_base_producer(void *input)
{
while (true) {
(void)syscall(__NR_getpgid);
atomic_inc(&base_hits.value);
}
return NULL;
}
static void trigger_base_measure(struct bench_res *res)
{
res->hits = atomic_swap(&base_hits.value, 0);
}
static void *trigger_producer(void *input)
{
while (true)
(void)syscall(__NR_getpgid);
return NULL;
}
static void trigger_measure(struct bench_res *res)
{
res->hits = atomic_swap(&ctx.skel->bss->hits, 0);
}
static void setup_ctx()
{
setup_libbpf();
ctx.skel = trigger_bench__open_and_load();
if (!ctx.skel) {
fprintf(stderr, "failed to open skeleton\n");
exit(1);
}
}
static void attach_bpf(struct bpf_program *prog)
{
struct bpf_link *link;
link = bpf_program__attach(prog);
if (IS_ERR(link)) {
fprintf(stderr, "failed to attach program!\n");
exit(1);
}
}
static void trigger_tp_setup()
{
setup_ctx();
attach_bpf(ctx.skel->progs.bench_trigger_tp);
}
static void trigger_rawtp_setup()
{
setup_ctx();
attach_bpf(ctx.skel->progs.bench_trigger_raw_tp);
}
static void trigger_kprobe_setup()
{
setup_ctx();
attach_bpf(ctx.skel->progs.bench_trigger_kprobe);
}
static void trigger_fentry_setup()
{
setup_ctx();
attach_bpf(ctx.skel->progs.bench_trigger_fentry);
}
static void trigger_fmodret_setup()
{
setup_ctx();
attach_bpf(ctx.skel->progs.bench_trigger_fmodret);
}
static void *trigger_consumer(void *input)
{
return NULL;
}
const struct bench bench_trig_base = {
.name = "trig-base",
.validate = trigger_validate,
.producer_thread = trigger_base_producer,
.consumer_thread = trigger_consumer,
.measure = trigger_base_measure,
.report_progress = hits_drops_report_progress,
.report_final = hits_drops_report_final,
};
const struct bench bench_trig_tp = {
.name = "trig-tp",
.validate = trigger_validate,
.setup = trigger_tp_setup,
.producer_thread = trigger_producer,
.consumer_thread = trigger_consumer,
.measure = trigger_measure,
.report_progress = hits_drops_report_progress,
.report_final = hits_drops_report_final,
};
const struct bench bench_trig_rawtp = {
.name = "trig-rawtp",
.validate = trigger_validate,
.setup = trigger_rawtp_setup,
.producer_thread = trigger_producer,
.consumer_thread = trigger_consumer,
.measure = trigger_measure,
.report_progress = hits_drops_report_progress,
.report_final = hits_drops_report_final,
};
const struct bench bench_trig_kprobe = {
.name = "trig-kprobe",
.validate = trigger_validate,
.setup = trigger_kprobe_setup,
.producer_thread = trigger_producer,
.consumer_thread = trigger_consumer,
.measure = trigger_measure,
.report_progress = hits_drops_report_progress,
.report_final = hits_drops_report_final,
};
const struct bench bench_trig_fentry = {
.name = "trig-fentry",
.validate = trigger_validate,
.setup = trigger_fentry_setup,
.producer_thread = trigger_producer,
.consumer_thread = trigger_consumer,
.measure = trigger_measure,
.report_progress = hits_drops_report_progress,
.report_final = hits_drops_report_final,
};
const struct bench bench_trig_fmodret = {
.name = "trig-fmodret",
.validate = trigger_validate,
.setup = trigger_fmodret_setup,
.producer_thread = trigger_producer,
.consumer_thread = trigger_consumer,
.measure = trigger_measure,
.report_progress = hits_drops_report_progress,
.report_final = hits_drops_report_final,
};

View File

@ -0,0 +1,9 @@
#!/bin/bash
set -eufo pipefail
for i in base kprobe kretprobe rawtp fentry fexit fmodret
do
summary=$(sudo ./bench -w2 -d5 -a rename-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-)
printf "%-10s: %s\n" $i "$summary"
done

View File

@ -0,0 +1,9 @@
#!/bin/bash
set -eufo pipefail
for i in base tp rawtp kprobe fentry fmodret
do
summary=$(sudo ./bench -w2 -d5 -a trig-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-)
printf "%-10s: %s\n" $i "$summary"
done

View File

@ -0,0 +1,158 @@
// SPDX-License-Identifier: GPL-2.0-only
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/epoll.h>
#include <linux/err.h>
#include <linux/in.h>
#include <linux/in6.h>
#include "bpf_util.h"
#include "network_helpers.h"
#define clean_errno() (errno == 0 ? "None" : strerror(errno))
#define log_err(MSG, ...) fprintf(stderr, "(%s:%d: errno: %s) " MSG "\n", \
__FILE__, __LINE__, clean_errno(), ##__VA_ARGS__)
struct ipv4_packet pkt_v4 = {
.eth.h_proto = __bpf_constant_htons(ETH_P_IP),
.iph.ihl = 5,
.iph.protocol = IPPROTO_TCP,
.iph.tot_len = __bpf_constant_htons(MAGIC_BYTES),
.tcp.urg_ptr = 123,
.tcp.doff = 5,
};
struct ipv6_packet pkt_v6 = {
.eth.h_proto = __bpf_constant_htons(ETH_P_IPV6),
.iph.nexthdr = IPPROTO_TCP,
.iph.payload_len = __bpf_constant_htons(MAGIC_BYTES),
.tcp.urg_ptr = 123,
.tcp.doff = 5,
};
int start_server(int family, int type)
{
struct sockaddr_storage addr = {};
socklen_t len;
int fd;
if (family == AF_INET) {
struct sockaddr_in *sin = (void *)&addr;
sin->sin_family = AF_INET;
len = sizeof(*sin);
} else {
struct sockaddr_in6 *sin6 = (void *)&addr;
sin6->sin6_family = AF_INET6;
len = sizeof(*sin6);
}
fd = socket(family, type | SOCK_NONBLOCK, 0);
if (fd < 0) {
log_err("Failed to create server socket");
return -1;
}
if (bind(fd, (const struct sockaddr *)&addr, len) < 0) {
log_err("Failed to bind socket");
close(fd);
return -1;
}
if (type == SOCK_STREAM) {
if (listen(fd, 1) < 0) {
log_err("Failed to listed on socket");
close(fd);
return -1;
}
}
return fd;
}
static const struct timeval timeo_sec = { .tv_sec = 3 };
static const size_t timeo_optlen = sizeof(timeo_sec);
int connect_to_fd(int family, int type, int server_fd)
{
int fd, save_errno;
fd = socket(family, type, 0);
if (fd < 0) {
log_err("Failed to create client socket");
return -1;
}
if (connect_fd_to_fd(fd, server_fd) < 0 && errno != EINPROGRESS) {
save_errno = errno;
close(fd);
errno = save_errno;
return -1;
}
return fd;
}
int connect_fd_to_fd(int client_fd, int server_fd)
{
struct sockaddr_storage addr;
socklen_t len = sizeof(addr);
int save_errno;
if (setsockopt(client_fd, SOL_SOCKET, SO_RCVTIMEO, &timeo_sec,
timeo_optlen)) {
log_err("Failed to set SO_RCVTIMEO");
return -1;
}
if (getsockname(server_fd, (struct sockaddr *)&addr, &len)) {
log_err("Failed to get server addr");
return -1;
}
if (connect(client_fd, (const struct sockaddr *)&addr, len) < 0) {
if (errno != EINPROGRESS) {
save_errno = errno;
log_err("Failed to connect to server");
errno = save_errno;
}
return -1;
}
return 0;
}
int connect_wait(int fd)
{
struct epoll_event ev = {}, events[2];
int timeout_ms = 1000;
int efd, nfd;
efd = epoll_create1(EPOLL_CLOEXEC);
if (efd < 0) {
log_err("Failed to open epoll fd");
return -1;
}
ev.events = EPOLLRDHUP | EPOLLOUT;
ev.data.fd = fd;
if (epoll_ctl(efd, EPOLL_CTL_ADD, fd, &ev) < 0) {
log_err("Failed to register fd=%d on epoll fd=%d", fd, efd);
close(efd);
return -1;
}
nfd = epoll_wait(efd, events, ARRAY_SIZE(events), timeout_ms);
if (nfd < 0)
log_err("Failed to wait for I/O event on epoll fd=%d", efd);
close(efd);
return nfd;
}
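A short sketch (not from the patch) of how tests typically pair these helpers; the kernel picks the listening port because the server binds to port 0:

/* illustrative only: open a TCP server on a kernel-chosen port and connect to it */
static int self_connect_example(void)
{
	int server_fd, client_fd;

	server_fd = start_server(AF_INET, SOCK_STREAM);
	if (server_fd < 0)
		return -1;

	client_fd = connect_to_fd(AF_INET, SOCK_STREAM, server_fd);
	if (client_fd < 0) {
		close(server_fd);
		return -1;
	}

	close(client_fd);
	close(server_fd);
	return 0;
}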

View File

@ -0,0 +1,41 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NETWORK_HELPERS_H
#define __NETWORK_HELPERS_H
#include <sys/socket.h>
#include <sys/types.h>
#include <linux/types.h>
typedef __u16 __sum16;
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <netinet/tcp.h>
#include <bpf/bpf_endian.h>
#define MAGIC_VAL 0x1234
#define NUM_ITER 100000
#define VIP_NUM 5
#define MAGIC_BYTES 123
/* ipv4 test vector */
struct ipv4_packet {
struct ethhdr eth;
struct iphdr iph;
struct tcphdr tcp;
} __packed;
extern struct ipv4_packet pkt_v4;
/* ipv6 test vector */
struct ipv6_packet {
struct ethhdr eth;
struct ipv6hdr iph;
struct tcphdr tcp;
} __packed;
extern struct ipv6_packet pkt_v6;
int start_server(int family, int type);
int connect_to_fd(int family, int type, int server_fd);
int connect_fd_to_fd(int client_fd, int server_fd);
int connect_wait(int client_fd);
#endif

View File

@ -0,0 +1,409 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
#include <test_progs.h>
#include "bpf_iter_ipv6_route.skel.h"
#include "bpf_iter_netlink.skel.h"
#include "bpf_iter_bpf_map.skel.h"
#include "bpf_iter_task.skel.h"
#include "bpf_iter_task_file.skel.h"
#include "bpf_iter_test_kern1.skel.h"
#include "bpf_iter_test_kern2.skel.h"
#include "bpf_iter_test_kern3.skel.h"
#include "bpf_iter_test_kern4.skel.h"
static int duration;
static void test_btf_id_or_null(void)
{
struct bpf_iter_test_kern3 *skel;
skel = bpf_iter_test_kern3__open_and_load();
if (CHECK(skel, "bpf_iter_test_kern3__open_and_load",
"skeleton open_and_load unexpectedly succeeded\n")) {
bpf_iter_test_kern3__destroy(skel);
return;
}
}
static void do_dummy_read(struct bpf_program *prog)
{
struct bpf_link *link;
char buf[16] = {};
int iter_fd, len;
link = bpf_program__attach_iter(prog, NULL);
if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n"))
return;
iter_fd = bpf_iter_create(bpf_link__fd(link));
if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n"))
goto free_link;
/* do not check contents, but ensure read() ends without error */
while ((len = read(iter_fd, buf, sizeof(buf))) > 0)
;
CHECK(len < 0, "read", "read failed: %s\n", strerror(errno));
close(iter_fd);
free_link:
bpf_link__destroy(link);
}
static void test_ipv6_route(void)
{
struct bpf_iter_ipv6_route *skel;
skel = bpf_iter_ipv6_route__open_and_load();
if (CHECK(!skel, "bpf_iter_ipv6_route__open_and_load",
"skeleton open_and_load failed\n"))
return;
do_dummy_read(skel->progs.dump_ipv6_route);
bpf_iter_ipv6_route__destroy(skel);
}
static void test_netlink(void)
{
struct bpf_iter_netlink *skel;
skel = bpf_iter_netlink__open_and_load();
if (CHECK(!skel, "bpf_iter_netlink__open_and_load",
"skeleton open_and_load failed\n"))
return;
do_dummy_read(skel->progs.dump_netlink);
bpf_iter_netlink__destroy(skel);
}
static void test_bpf_map(void)
{
struct bpf_iter_bpf_map *skel;
skel = bpf_iter_bpf_map__open_and_load();
if (CHECK(!skel, "bpf_iter_bpf_map__open_and_load",
"skeleton open_and_load failed\n"))
return;
do_dummy_read(skel->progs.dump_bpf_map);
bpf_iter_bpf_map__destroy(skel);
}
static void test_task(void)
{
struct bpf_iter_task *skel;
skel = bpf_iter_task__open_and_load();
if (CHECK(!skel, "bpf_iter_task__open_and_load",
"skeleton open_and_load failed\n"))
return;
do_dummy_read(skel->progs.dump_task);
bpf_iter_task__destroy(skel);
}
static void test_task_file(void)
{
struct bpf_iter_task_file *skel;
skel = bpf_iter_task_file__open_and_load();
if (CHECK(!skel, "bpf_iter_task_file__open_and_load",
"skeleton open_and_load failed\n"))
return;
do_dummy_read(skel->progs.dump_task_file);
bpf_iter_task_file__destroy(skel);
}
/* The expected string is less than 16 bytes */
static int do_read_with_fd(int iter_fd, const char *expected,
bool read_one_char)
{
int err = -1, len, read_buf_len, start;
char buf[16] = {};
read_buf_len = read_one_char ? 1 : 16;
start = 0;
while ((len = read(iter_fd, buf + start, read_buf_len)) > 0) {
start += len;
if (CHECK(start >= 16, "read", "read len %d\n", len))
return -1;
read_buf_len = read_one_char ? 1 : 16 - start;
}
if (CHECK(len < 0, "read", "read failed: %s\n", strerror(errno)))
return -1;
err = strcmp(buf, expected);
if (CHECK(err, "read", "incorrect read result: buf %s, expected %s\n",
buf, expected))
return -1;
return 0;
}
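/* Exercise an anonymous (not pinned) iterator: auto-attach the skeleton,
* create an iterator fd from its link and verify the expected "abcd" output,
* optionally reading one byte at a time.
*/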
static void test_anon_iter(bool read_one_char)
{
struct bpf_iter_test_kern1 *skel;
struct bpf_link *link;
int iter_fd, err;
skel = bpf_iter_test_kern1__open_and_load();
if (CHECK(!skel, "bpf_iter_test_kern1__open_and_load",
"skeleton open_and_load failed\n"))
return;
err = bpf_iter_test_kern1__attach(skel);
if (CHECK(err, "bpf_iter_test_kern1__attach",
"skeleton attach failed\n")) {
goto out;
}
link = skel->links.dump_task;
iter_fd = bpf_iter_create(bpf_link__fd(link));
if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n"))
goto out;
do_read_with_fd(iter_fd, "abcd", read_one_char);
close(iter_fd);
out:
bpf_iter_test_kern1__destroy(skel);
}
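/* Open a pinned iterator at 'path' and compare its output against 'expected'. */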
static int do_read(const char *path, const char *expected)
{
int err, iter_fd;
iter_fd = open(path, O_RDONLY);
if (CHECK(iter_fd < 0, "open", "open %s failed: %s\n",
path, strerror(errno)))
return -1;
err = do_read_with_fd(iter_fd, expected, false);
close(iter_fd);
return err;
}
static void test_file_iter(void)
{
const char *path = "/sys/fs/bpf/bpf_iter_test1";
struct bpf_iter_test_kern1 *skel1;
struct bpf_iter_test_kern2 *skel2;
struct bpf_link *link;
int err;
skel1 = bpf_iter_test_kern1__open_and_load();
if (CHECK(!skel1, "bpf_iter_test_kern1__open_and_load",
"skeleton open_and_load failed\n"))
return;
link = bpf_program__attach_iter(skel1->progs.dump_task, NULL);
if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n"))
goto out;
/* unlink this path if it exists. */
unlink(path);
err = bpf_link__pin(link, path);
if (CHECK(err, "pin_iter", "pin_iter to %s failed: %d\n", path, err))
goto free_link;
err = do_read(path, "abcd");
if (err)
goto unlink_path;
/* The file-based iterator works fine. Now do a link update of the
* underlying link and `cat` the iterator again; its content should
* change.
*/
skel2 = bpf_iter_test_kern2__open_and_load();
if (CHECK(!skel2, "bpf_iter_test_kern2__open_and_load",
"skeleton open_and_load failed\n"))
goto unlink_path;
err = bpf_link__update_program(link, skel2->progs.dump_task);
if (CHECK(err, "update_prog", "update_prog failed\n"))
goto destroy_skel2;
do_read(path, "ABCD");
destroy_skel2:
bpf_iter_test_kern2__destroy(skel2);
unlink_path:
unlink(path);
free_link:
bpf_link__destroy(link);
out:
bpf_iter_test_kern1__destroy(skel1);
}
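/* Exercise seq_file buffer handling: with test_e2big_overflow a single map's
* output exceeds the one-page buffer, so read() is expected to fail with
* E2BIG; otherwise the full output of both maps should be read back and the
* per-map access counters verified.
*/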
static void test_overflow(bool test_e2big_overflow, bool ret1)
{
__u32 map_info_len, total_read_len, expected_read_len;
int err, iter_fd, map1_fd, map2_fd, len;
struct bpf_map_info map_info = {};
struct bpf_iter_test_kern4 *skel;
struct bpf_link *link;
__u32 page_size;
char *buf;
skel = bpf_iter_test_kern4__open();
if (CHECK(!skel, "bpf_iter_test_kern4__open",
"skeleton open failed\n"))
return;
/* Create two maps: the bpf program will only do bpf_seq_write
* for these two maps. The goal is that one map's output almost
* fills the seq_file buffer and the other then triggers an
* overflow that needs a restart.
*/
map1_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 8, 1, 0);
if (CHECK(map1_fd < 0, "bpf_create_map",
"map_creation failed: %s\n", strerror(errno)))
goto out;
map2_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 8, 1, 0);
if (CHECK(map2_fd < 0, "bpf_create_map",
"map_creation failed: %s\n", strerror(errno)))
goto free_map1;
/* The bpf_seq_printf kernel buffer is one page, so one map's
* bpf_seq_write output will mostly fill it, and the other map's
* will partially fill it, then trigger an overflow and require a
* bpf_seq_read restart.
*/
page_size = sysconf(_SC_PAGE_SIZE);
if (test_e2big_overflow) {
skel->rodata->print_len = (page_size + 8) / 8;
expected_read_len = 2 * (page_size + 8);
} else if (!ret1) {
skel->rodata->print_len = (page_size - 8) / 8;
expected_read_len = 2 * (page_size - 8);
} else {
skel->rodata->print_len = 1;
expected_read_len = 2 * 8;
}
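/* In all three cases expected_read_len == 2 * print_len * 8, i.e. each map
* is expected to produce print_len 8-byte records. For example, with a
* 4096-byte page and test_e2big_overflow, print_len = 513 and a single map's
* output is 4104 bytes, which cannot fit in the one-page seq_file buffer.
*/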
skel->rodata->ret1 = ret1;
if (CHECK(bpf_iter_test_kern4__load(skel),
"bpf_iter_test_kern4__load", "skeleton load failed\n"))
goto free_map2;
/* set up the filtering map ids in the bpf program */
map_info_len = sizeof(map_info);
err = bpf_obj_get_info_by_fd(map1_fd, &map_info, &map_info_len);
if (CHECK(err, "get_map_info", "get map info failed: %s\n",
strerror(errno)))
goto free_map2;
skel->bss->map1_id = map_info.id;
err = bpf_obj_get_info_by_fd(map2_fd, &map_info, &map_info_len);
if (CHECK(err, "get_map_info", "get map info failed: %s\n",
strerror(errno)))
goto free_map2;
skel->bss->map2_id = map_info.id;
link = bpf_program__attach_iter(skel->progs.dump_bpf_map, NULL);
if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n"))
goto free_map2;
iter_fd = bpf_iter_create(bpf_link__fd(link));
if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n"))
goto free_link;
buf = malloc(expected_read_len);
if (!buf)
goto close_iter;
/* do read */
total_read_len = 0;
if (test_e2big_overflow) {
while ((len = read(iter_fd, buf, expected_read_len)) > 0)
total_read_len += len;
CHECK(len != -1 || errno != E2BIG, "read",
"expected ret -1, errno E2BIG, but get ret %d, error %s\n",
len, strerror(errno));
goto free_buf;
} else if (!ret1) {
while ((len = read(iter_fd, buf, expected_read_len)) > 0)
total_read_len += len;
if (CHECK(len < 0, "read", "read failed: %s\n",
strerror(errno)))
goto free_buf;
} else {
do {
len = read(iter_fd, buf, expected_read_len);
if (len > 0)
total_read_len += len;
} while (len > 0 || len == -EAGAIN);
if (CHECK(len < 0, "read", "read failed: %s\n",
strerror(errno)))
goto free_buf;
}
if (CHECK(total_read_len != expected_read_len, "read",
"total len %u, expected len %u\n", total_read_len,
expected_read_len))
goto free_buf;
if (CHECK(skel->bss->map1_accessed != 1, "map1_accessed",
"expected 1 actual %d\n", skel->bss->map1_accessed))
goto free_buf;
if (CHECK(skel->bss->map2_accessed != 2, "map2_accessed",
"expected 2 actual %d\n", skel->bss->map2_accessed))
goto free_buf;
CHECK(skel->bss->map2_seqnum1 != skel->bss->map2_seqnum2,
"map2_seqnum", "two different seqnum %lld %lld\n",
skel->bss->map2_seqnum1, skel->bss->map2_seqnum2);
free_buf:
free(buf);
close_iter:
close(iter_fd);
free_link:
bpf_link__destroy(link);
free_map2:
close(map2_fd);
free_map1:
close(map1_fd);
out:
bpf_iter_test_kern4__destroy(skel);
}
void test_bpf_iter(void)
{
if (test__start_subtest("btf_id_or_null"))
test_btf_id_or_null();
if (test__start_subtest("ipv6_route"))
test_ipv6_route();
if (test__start_subtest("netlink"))
test_netlink();
if (test__start_subtest("bpf_map"))
test_bpf_map();
if (test__start_subtest("task"))
test_task();
if (test__start_subtest("task_file"))
test_task_file();
if (test__start_subtest("anon"))
test_anon_iter(false);
if (test__start_subtest("anon-read-one-char"))
test_anon_iter(true);
if (test__start_subtest("file"))
test_file_iter();
if (test__start_subtest("overflow"))
test_overflow(false, false);
if (test__start_subtest("overflow-e2big"))
test_overflow(true, false);
if (test__start_subtest("prog-ret-1"))
test_overflow(false, true);
}

View File

@ -0,0 +1,95 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (c) 2020 Facebook
#include <test_progs.h>
#include "network_helpers.h"
#include "cgroup_skb_sk_lookup_kern.skel.h"
static void run_lookup_test(__u16 *g_serv_port, int out_sk)
{
int serv_sk = -1, in_sk = -1, serv_in_sk = -1, err;
struct sockaddr_in6 addr = {};
socklen_t addr_len = sizeof(addr);
__u32 duration = 0;
serv_sk = start_server(AF_INET6, SOCK_STREAM);
if (CHECK(serv_sk < 0, "start_server", "failed to start server\n"))
return;
err = getsockname(serv_sk, (struct sockaddr *)&addr, &addr_len);
if (CHECK(err, "getsockname", "errno %d\n", errno))
goto cleanup;
*g_serv_port = addr.sin6_port;
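/* The server port is exported to the BPF program through the skeleton's bss
* so it can identify traffic destined for the test server.
*/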
/* A client outside of the test cgroup should fail to connect (time out). */
err = connect_fd_to_fd(out_sk, serv_sk);
if (CHECK(!err || errno != EINPROGRESS, "connect_fd_to_fd",
"unexpected result err %d errno %d\n", err, errno))
goto cleanup;
err = connect_wait(out_sk);
if (CHECK(err, "connect_wait", "unexpected result %d\n", err))
goto cleanup;
/* Client inside test cgroup should connect just fine. */
in_sk = connect_to_fd(AF_INET6, SOCK_STREAM, serv_sk);
if (CHECK(in_sk < 0, "connect_to_fd", "errno %d\n", errno))
goto cleanup;
serv_in_sk = accept(serv_sk, NULL, NULL);
if (CHECK(serv_in_sk < 0, "accept", "errno %d\n", errno))
goto cleanup;
cleanup:
close(serv_in_sk);
close(in_sk);
close(serv_sk);
}
static void run_cgroup_bpf_test(const char *cg_path, int out_sk)
{
struct cgroup_skb_sk_lookup_kern *skel;
struct bpf_link *link;
__u32 duration = 0;
int cgfd = -1;
skel = cgroup_skb_sk_lookup_kern__open_and_load();
if (CHECK(!skel, "skel_open_load", "open_load failed\n"))
return;
cgfd = test__join_cgroup(cg_path);
if (CHECK(cgfd < 0, "cgroup_join", "cgroup setup failed\n"))
goto cleanup;
link = bpf_program__attach_cgroup(skel->progs.ingress_lookup, cgfd);
if (CHECK(IS_ERR(link), "cgroup_attach", "err: %ld\n", PTR_ERR(link)))
goto cleanup;
run_lookup_test(&skel->bss->g_serv_port, out_sk);
bpf_link__destroy(link);
cleanup:
close(cgfd);
cgroup_skb_sk_lookup_kern__destroy(skel);
}
void test_cgroup_skb_sk_lookup(void)
{
const char *cg_path = "/foo";
int out_sk;
/* Create a socket before joining the testing cgroup so that its cgroup id
* differs from that of the testing cgroup. Moving the selftests process to
* the testing cgroup won't change the cgroup id of an already created
* socket.
*/
out_sk = socket(AF_INET6, SOCK_STREAM | SOCK_NONBLOCK, 0);
if (CHECK_FAIL(out_sk < 0))
return;
run_cgroup_bpf_test(cg_path, out_sk);
close(out_sk);
}

View File

@ -0,0 +1,115 @@
// SPDX-License-Identifier: GPL-2.0
#include <test_progs.h>
#include "cgroup_helpers.h"
#include "network_helpers.h"
static int verify_port(int family, int fd, int expected)
{
struct sockaddr_storage addr;
socklen_t len = sizeof(addr);
__u16 port;
if (getsockname(fd, (struct sockaddr *)&addr, &len)) {
log_err("Failed to get server addr");
return -1;
}
if (family == AF_INET)
port = ((struct sockaddr_in *)&addr)->sin_port;
else
port = ((struct sockaddr_in6 *)&addr)->sin6_port;
if (ntohs(port) != expected) {
log_err("Unexpected port %d, expected %d", ntohs(port),
expected);
return -1;
}
return 0;
}
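/* Load the connect_force_port program for the given address family, attach it
* to the cgroup, connect to server_fd and verify via getsockname() that the
* BPF program forced the client socket's local port to the expected value.
*/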
static int run_test(int cgroup_fd, int server_fd, int family, int type)
{
struct bpf_prog_load_attr attr = {
.prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
};
struct bpf_object *obj;
int expected_port;
int prog_fd;
int err;
int fd;
if (family == AF_INET) {
attr.file = "./connect_force_port4.o";
attr.expected_attach_type = BPF_CGROUP_INET4_CONNECT;
expected_port = 22222;
} else {
attr.file = "./connect_force_port6.o";
attr.expected_attach_type = BPF_CGROUP_INET6_CONNECT;
expected_port = 22223;
}
err = bpf_prog_load_xattr(&attr, &obj, &prog_fd);
if (err) {
log_err("Failed to load BPF object");
return -1;
}
err = bpf_prog_attach(prog_fd, cgroup_fd, attr.expected_attach_type,
0);
if (err) {
log_err("Failed to attach BPF program");
goto close_bpf_object;
}
fd = connect_to_fd(family, type, server_fd);
if (fd < 0) {
err = -1;
goto close_bpf_object;
}
err = verify_port(family, fd, expected_port);
close(fd);
close_bpf_object:
bpf_object__close(obj);
return err;
}
void test_connect_force_port(void)
{
int server_fd, cgroup_fd;
cgroup_fd = test__join_cgroup("/connect_force_port");
if (CHECK_FAIL(cgroup_fd < 0))
return;
server_fd = start_server(AF_INET, SOCK_STREAM);
if (CHECK_FAIL(server_fd < 0))
goto close_cgroup_fd;
CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET, SOCK_STREAM));
close(server_fd);
server_fd = start_server(AF_INET6, SOCK_STREAM);
if (CHECK_FAIL(server_fd < 0))
goto close_cgroup_fd;
CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET6, SOCK_STREAM));
close(server_fd);
server_fd = start_server(AF_INET, SOCK_DGRAM);
if (CHECK_FAIL(server_fd < 0))
goto close_cgroup_fd;
CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET, SOCK_DGRAM));
close(server_fd);
server_fd = start_server(AF_INET6, SOCK_DGRAM);
if (CHECK_FAIL(server_fd < 0))
goto close_cgroup_fd;
CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET6, SOCK_DGRAM));
close(server_fd);
close_cgroup_fd:
close(cgroup_fd);
}

View File

@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2019 Facebook */
#include <test_progs.h>
#include <network_helpers.h>
static void test_fexit_bpf2bpf_common(const char *obj_file,
const char *target_obj_file,

View File

@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include <test_progs.h>
#include <network_helpers.h>
#include <error.h>
#include <linux/if.h>
#include <linux/if_tun.h>

Some files were not shown because too many files have changed in this diff.