From d96608794889d5a2419fb4077fa5d5bbffea07bc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 23 Aug 2024 14:00:53 +0200 Subject: [PATCH 01/12] netkit: Disable netpoll support Follow-up to 45160cebd6ac ("net: veth: Disable netpoll support") to also disable netpoll for netkit interfaces. Same conditions apply here as well. Signed-off-by: Daniel Borkmann Cc: Breno Leitao Cc: Nikolay Aleksandrov Acked-by: Nikolay Aleksandrov Reviewed-by: Breno Leitao Link: https://lore.kernel.org/r/eab2d69ba2f4c260aef62e4ff0d803e9f60c2c5d.1724414250.git.daniel@iogearbox.net Signed-off-by: Martin KaFai Lau --- drivers/net/netkit.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c index 16789cd446e9..0681cf86284d 100644 --- a/drivers/net/netkit.c +++ b/drivers/net/netkit.c @@ -255,6 +255,7 @@ static void netkit_setup(struct net_device *dev) dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; dev->priv_flags |= IFF_PHONY_HEADROOM; dev->priv_flags |= IFF_NO_QUEUE; + dev->priv_flags |= IFF_DISABLE_NETPOLL; dev->ethtool_ops = &netkit_ethtool_ops; dev->netdev_ops = &netkit_netdev_ops; From 731733c6234855b26d468aa2f89a3051c170e63a Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Thu, 29 Aug 2024 16:45:51 +0100 Subject: [PATCH 02/12] bpf, sockmap: Correct spelling skmsg.c Correct spelling in skmsg.c. As reported by codespell. Signed-off-by: Simon Horman Signed-off-by: Daniel Borkmann Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20240829-sockmap-spell-v1-1-a614d76564cc@kernel.org --- net/core/skmsg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index bbf40b999713..b1dcbd3be89e 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -293,7 +293,7 @@ out: /* If we trim data a full sg elem before curr pointer update * copybreak and current so that any future copy operations * start at new copy location. - * However trimed data that has not yet been used in a copy op + * However trimmed data that has not yet been used in a copy op * does not require an update. */ if (!msg->sg.size) { From 5d1622831064abf2633a1bc9a6446e83a669f58d Mon Sep 17 00:00:00 2001 From: Yaxin Chen Date: Fri, 23 Aug 2024 15:48:43 -0700 Subject: [PATCH 03/12] tcp_bpf: Remove an unused parameter for bpf_tcp_ingress() Parameter flags is not used in bpf_tcp_ingress(). Signed-off-by: Yaxin Chen Signed-off-by: Daniel Borkmann Reviewed-by: Cong Wang Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20240823224843.1985277-1-yaxin.chen1@bytedance.com --- net/ipv4/tcp_bpf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 53b0d62fd2c2..57a1614c55f9 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -30,7 +30,7 @@ void tcp_eat_skb(struct sock *sk, struct sk_buff *skb) } static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, - struct sk_msg *msg, u32 apply_bytes, int flags) + struct sk_msg *msg, u32 apply_bytes) { bool apply = apply_bytes; struct scatterlist *sge; @@ -167,7 +167,7 @@ int tcp_bpf_sendmsg_redir(struct sock *sk, bool ingress, if (unlikely(!psock)) return -EPIPE; - ret = ingress ? bpf_tcp_ingress(sk, psock, msg, bytes, flags) : + ret = ingress ? 
bpf_tcp_ingress(sk, psock, msg, bytes) : tcp_bpf_push_locked(sk, msg, bytes, flags, false); sk_psock_put(sk, psock); return ret; From 6b083650a37318112fb60c65fbb6070584f53d93 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 4 Sep 2024 18:28:08 +0200 Subject: [PATCH 04/12] xsk: Bump xsk_queue::queue_empty_descs in xp_can_alloc() We have a STAT_FILL_EMPTY test case in xskxceiver that tries to process traffic with the fill queue empty, which currently fails for the zero-copy ice driver after it started to use the xsk_buff_can_alloc() API. That is because xsk_queue::queue_empty_descs is currently only increased from the alloc APIs, and right now, if the driver sees that xsk_buff_pool will be unable to provide the requested count of buffers, it bails out early, skipping calls to xsk_buff_alloc{_batch}(). The mentioned statistic should be handled in xsk_buff_can_alloc() from the very beginning, so let's add this logic now. Do it by open coding xskq_cons_has_entries() and bumping queue_empty_descs in the middle when the fill queue currently has no entries. Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20240904162808.249160-1-maciej.fijalkowski@intel.com --- net/xdp/xsk_buff_pool.c | 10 +++++++++- net/xdp/xsk_queue.h | 5 ----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index c0e0204b9630..dad7849c1b75 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -656,9 +656,17 @@ EXPORT_SYMBOL(xp_alloc_batch); bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count) { + u32 req_count, avail_count; + if (pool->free_list_cnt >= count) return true; - return xskq_cons_has_entries(pool->fq, count - pool->free_list_cnt); + + req_count = count - pool->free_list_cnt; + avail_count = xskq_cons_nb_entries(pool->fq, req_count); + if (!avail_count) + pool->fq->queue_empty_descs++; + + return avail_count >= req_count; } EXPORT_SYMBOL(xp_can_alloc); diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 6f2d1621c992..406b20dfee8d 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -306,11 +306,6 @@ static inline u32 xskq_cons_nb_entries(struct xsk_queue *q, u32 max) return entries >= max ? max : entries; } -static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt) -{ - return xskq_cons_nb_entries(q, cnt) >= cnt; -} - static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr) { if (q->cached_prod == q->cached_cons) From d41905b3bb890a32c87b65228b241daad8837fb0 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Tue, 10 Sep 2024 14:41:29 +0200 Subject: [PATCH 05/12] selftests/xsk: Read current MAX_SKB_FRAGS from sysctl knob Currently, xskxceiver assumes that the MAX_SKB_FRAGS value is always 17, which is not true: since the introduction of BIG TCP, it can take any value between 17 and 45 via CONFIG_MAX_SKB_FRAGS. Adjust the TOO_MANY_FRAGS test case to read the currently configured MAX_SKB_FRAGS value from /proc/sys/net/core/max_skb_frags. If the running system does not provide that sysctl file, run the test with a default value.
Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20240910124129.289874-1-maciej.fijalkowski@intel.com --- tools/testing/selftests/bpf/xskxceiver.c | 43 +++++++++++++++++++++--- tools/testing/selftests/bpf/xskxceiver.h | 1 - 2 files changed, 38 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c index 8144fd145237..1ee0ef114f9d 100644 --- a/tools/testing/selftests/bpf/xskxceiver.c +++ b/tools/testing/selftests/bpf/xskxceiver.c @@ -324,6 +324,25 @@ out: return zc_avail; } +#define MAX_SKB_FRAGS_PATH "/proc/sys/net/core/max_skb_frags" +static unsigned int get_max_skb_frags(void) +{ + unsigned int max_skb_frags = 0; + FILE *file; + + file = fopen(MAX_SKB_FRAGS_PATH, "r"); + if (!file) { + ksft_print_msg("Error opening %s\n", MAX_SKB_FRAGS_PATH); + return 0; + } + + if (fscanf(file, "%u", &max_skb_frags) != 1) + ksft_print_msg("Error reading %s\n", MAX_SKB_FRAGS_PATH); + + fclose(file); + return max_skb_frags; +} + static struct option long_options[] = { {"interface", required_argument, 0, 'i'}, {"busy-poll", no_argument, 0, 'b'}, @@ -2244,13 +2263,24 @@ static int testapp_poll_rxq_tmout(struct test_spec *test) static int testapp_too_many_frags(struct test_spec *test) { - struct pkt pkts[2 * XSK_DESC__MAX_SKB_FRAGS + 2] = {}; + struct pkt *pkts; u32 max_frags, i; + int ret; - if (test->mode == TEST_MODE_ZC) + if (test->mode == TEST_MODE_ZC) { max_frags = test->ifobj_tx->xdp_zc_max_segs; - else - max_frags = XSK_DESC__MAX_SKB_FRAGS; + } else { + max_frags = get_max_skb_frags(); + if (!max_frags) { + ksft_print_msg("Couldn't retrieve MAX_SKB_FRAGS from system, using default (17) value\n"); + max_frags = 17; + } + max_frags += 1; + } + + pkts = calloc(2 * max_frags + 2, sizeof(struct pkt)); + if (!pkts) + return TEST_FAILURE; test->mtu = MAX_ETH_JUMBO_SIZE; @@ -2280,7 +2310,10 @@ static int testapp_too_many_frags(struct test_spec *test) pkts[2 * max_frags + 1].valid = true; pkt_stream_generate_custom(test, pkts, 2 * max_frags + 2); - return testapp_validate_traffic(test); + ret = testapp_validate_traffic(test); + + free(pkts); + return ret; } static int xsk_load_xdp_programs(struct ifobject *ifobj) diff --git a/tools/testing/selftests/bpf/xskxceiver.h b/tools/testing/selftests/bpf/xskxceiver.h index 885c948c5d83..e46e823f6a1a 100644 --- a/tools/testing/selftests/bpf/xskxceiver.h +++ b/tools/testing/selftests/bpf/xskxceiver.h @@ -55,7 +55,6 @@ #define XSK_UMEM__LARGE_FRAME_SIZE (3 * 1024) #define XSK_UMEM__MAX_FRAME_SIZE (4 * 1024) #define XSK_DESC__INVALID_OPTION (0xffff) -#define XSK_DESC__MAX_SKB_FRAGS 18 #define HUGEPAGE_SIZE (2 * 1024 * 1024) #define PKT_DUMP_NB_TO_PRINT 16 #define RUN_ALL_TESTS UINT_MAX From 23dc9867329c72b48e5039ac93fbf50d9099cdb3 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Thu, 5 Sep 2024 19:22:44 -0600 Subject: [PATCH 06/12] bpf, cpumap: Move xdp:xdp_cpumap_kthread tracepoint before rcv cpumap takes RX processing out of softirq and onto a separate kthread. Since the kthread needs to be scheduled in order to run (versus softirq which does not), we can theoretically experience extra latency if the system is under load and the scheduler is being unfair to us. Moving the tracepoint to before passing the skb list up the stack allows users to more accurately measure enqueue/dequeue latency introduced by cpumap via xdp:xdp_cpumap_enqueue and xdp:xdp_cpumap_kthread tracepoints. 
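As an editorial illustration of that measurement (not part of this patch): a pair of tp_btf programs can timestamp xdp_cpumap_enqueue and diff against it in xdp_cpumap_kthread. This is a sketch only; the argument lists are assumed to match the TP_PROTOs in include/trace/events/xdp.h, and the map sizing and bpf_printk output are placeholders.

/* Hypothetical cpumap latency probe (sketch, not from this series). */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1024);	/* one slot per destination CPU */
	__type(key, __u32);
	__type(value, __u64);
} enq_ts SEC(".maps");

SEC("tp_btf/xdp_cpumap_enqueue")
int BPF_PROG(on_enqueue, int map_id, unsigned int processed,
	     unsigned int drops, int to_cpu)
{
	__u32 cpu = to_cpu;
	__u64 now = bpf_ktime_get_ns();

	/* remember when the latest batch was queued for this CPU */
	bpf_map_update_elem(&enq_ts, &cpu, &now, BPF_ANY);
	return 0;
}

SEC("tp_btf/xdp_cpumap_kthread")
int BPF_PROG(on_kthread, int map_id, unsigned int processed,
	     unsigned int drops, int sched, struct xdp_cpumap_stats *stats)
{
	__u32 cpu = bpf_get_smp_processor_id();
	__u64 *ts = bpf_map_lookup_elem(&enq_ts, &cpu);

	/* after this patch the kthread tracepoint fires before
	 * netif_receive_skb_list(), so the delta approximates the
	 * enqueue/dequeue latency alone
	 */
	if (ts)
		bpf_printk("cpumap delay on cpu %u: %llu ns",
			   cpu, bpf_ktime_get_ns() - *ts);
	return 0;
}

char _license[] SEC("license") = "GPL";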
Commit f9419f7bd7a5 ("bpf: cpumap add tracepoints"), which added the tracepoints, states that the intent behind them was for general observability and for a feedback loop to see if the queues are being overwhelmed. This change does not mess with either of those use cases but rather adds a third one. Signed-off-by: Daniel Xu Signed-off-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/47615d5b5e302e4bd30220473779e98b492d47cd.1725585718.git.dxu@dxuuu.xyz --- kernel/bpf/cpumap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index fbdf5a1aabfe..a2f46785ac3b 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -354,12 +354,14 @@ static int cpu_map_kthread_run(void *data) list_add_tail(&skb->list, &list); } - netif_receive_skb_list(&list); - /* Feedback loop via tracepoint */ + /* Feedback loop via tracepoint. + * NB: keep before recv to allow measuring enqueue/dequeue latency. + */ trace_xdp_cpumap_kthread(rcpu->map_id, n, kmem_alloc_drops, sched, &stats); + netif_receive_skb_list(&list); local_bh_enable(); /* resched point, may call do_softirq() */ } __set_current_state(TASK_RUNNING); From 8aeaed21befc90f27f4fca6dd190850d97d2e9e3 Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Wed, 11 Sep 2024 11:37:15 +0800 Subject: [PATCH 07/12] bpf: Support __nullable argument suffix for tp_btf Pointers passed to tp_btf were trusted to be valid, but some tracepoints do take a NULL pointer as input, such as trace_tcp_send_reset(). Then the invalid memory access cannot be detected by the verifier. This patch fixes it by adding the suffix "__nullable" to the unreliable argument. The suffix is shown in BTF, and PTR_MAYBE_NULL will be added to nullable arguments. Then users must check the pointer before using it. A problem here is that we use "btf_trace_##call" to search for the func_proto. As it is a typedef, argument names as well as the suffix are not recorded. To solve this, I use bpf_raw_event_map to find "__bpf_trace_##template" from "btf_trace_##call", and then we can see the suffix.
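To illustrate the resulting contract (an editorial sketch, not part of the patch; it anticipates the tcp_send_reset change and the selftest later in this series): once a tracepoint parameter carries the __nullable suffix, a tp_btf program has to NULL-check it before dereferencing, otherwise the verifier rejects the load with an "invalid mem access 'trusted_ptr_or_null_'" style error.

/* Sketch against tcp_send_reset, whose skb argument becomes
 * "skb__nullable" later in this series.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

SEC("tp_btf/tcp_send_reset")
int BPF_PROG(on_send_reset, const struct sock *sk,
	     const struct sk_buff *skb, enum sk_rst_reason reason)
{
	if (!skb)	/* mandatory: skb is PTR_MAYBE_NULL here */
		return 0;
	bpf_printk("tcp rst, skb len %u", skb->len);
	return 0;
}

char _license[] SEC("license") = "GPL";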
Suggested-by: Alexei Starovoitov Signed-off-by: Philo Lu Link: https://lore.kernel.org/r/20240911033719.91468-2-lulie@linux.alibaba.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/btf.c | 3 +++ kernel/bpf/verifier.c | 36 ++++++++++++++++++++++++++++++++---- 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 520f49f422fe..7e03a98ccb7c 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6523,6 +6523,9 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, if (prog_args_trusted(prog)) info->reg_type |= PTR_TRUSTED; + if (btf_param_match_suffix(btf, &args[arg], "__nullable")) + info->reg_type |= PTR_MAYBE_NULL; + if (tgt_prog) { enum bpf_prog_type tgt_type; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d8520095ca03..39d5710c68ad 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -28,6 +28,8 @@ #include #include #include +#include +#include #include "disasm.h" @@ -21154,11 +21156,13 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, { bool prog_extension = prog->type == BPF_PROG_TYPE_EXT; bool prog_tracing = prog->type == BPF_PROG_TYPE_TRACING; + char trace_symbol[KSYM_SYMBOL_LEN]; const char prefix[] = "btf_trace_"; + struct bpf_raw_event_map *btp; int ret = 0, subprog = -1, i; const struct btf_type *t; bool conservative = true; - const char *tname; + const char *tname, *fname; struct btf *btf; long addr = 0; struct module *mod = NULL; @@ -21289,10 +21293,34 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, return -EINVAL; } tname += sizeof(prefix) - 1; - t = btf_type_by_id(btf, t->type); - if (!btf_type_is_ptr(t)) - /* should never happen in valid vmlinux build */ + + /* The func_proto of "btf_trace_##tname" is generated from typedef without argument + * names. Thus using bpf_raw_event_map to get argument names. 
+ */ + btp = bpf_get_raw_tracepoint(tname); + if (!btp) return -EINVAL; + fname = kallsyms_lookup((unsigned long)btp->bpf_func, NULL, NULL, NULL, + trace_symbol); + bpf_put_raw_tracepoint(btp); + + if (fname) + ret = btf_find_by_name_kind(btf, fname, BTF_KIND_FUNC); + + if (!fname || ret < 0) { + bpf_log(log, "Cannot find btf of tracepoint template, fall back to %s%s.\n", + prefix, tname); + t = btf_type_by_id(btf, t->type); + if (!btf_type_is_ptr(t)) + /* should never happen in valid vmlinux build */ + return -EINVAL; + } else { + t = btf_type_by_id(btf, ret); + if (!btf_type_is_func(t)) + /* should never happen in valid vmlinux build */ + return -EINVAL; + } + t = btf_type_by_id(btf, t->type); if (!btf_type_is_func_proto(t)) /* should never happen in valid vmlinux build */ From 2060f07f861a237345922023e9347a204c0795af Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Wed, 11 Sep 2024 11:37:16 +0800 Subject: [PATCH 08/12] selftests/bpf: Add test for __nullable suffix in tp_btf Add a tracepoint with __nullable suffix in bpf_testmod, and add cases for it: $ ./test_progs -t "tp_btf_nullable" #406/1 tp_btf_nullable/handle_tp_btf_nullable_bare1:OK #406/2 tp_btf_nullable/handle_tp_btf_nullable_bare2:OK #406 tp_btf_nullable:OK Summary: 1/2 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Philo Lu Link: https://lore.kernel.org/r/20240911033719.91468-3-lulie@linux.alibaba.com Signed-off-by: Martin KaFai Lau --- .../bpf/bpf_testmod/bpf_testmod-events.h | 6 +++++ .../selftests/bpf/bpf_testmod/bpf_testmod.c | 2 ++ .../bpf/prog_tests/tp_btf_nullable.c | 14 +++++++++++ .../bpf/progs/test_tp_btf_nullable.c | 24 +++++++++++++++++++ 4 files changed, 46 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/tp_btf_nullable.c create mode 100644 tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod-events.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod-events.h index 11ee801e75e7..6c3b4d4f173a 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod-events.h +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod-events.h @@ -34,6 +34,12 @@ DECLARE_TRACE(bpf_testmod_test_write_bare, TP_ARGS(task, ctx) ); +/* Used in bpf_testmod_test_read() to test __nullable suffix */ +DECLARE_TRACE(bpf_testmod_test_nullable_bare, + TP_PROTO(struct bpf_testmod_test_read_ctx *ctx__nullable), + TP_ARGS(ctx__nullable) +); + #undef BPF_TESTMOD_DECLARE_TRACE #ifdef DECLARE_TRACE_WRITABLE #define BPF_TESTMOD_DECLARE_TRACE(call, proto, args, size) \ diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index fd28c1157bd3..a32771da4293 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -356,6 +356,8 @@ bpf_testmod_test_read(struct file *file, struct kobject *kobj, if (bpf_testmod_loop_test(101) > 100) trace_bpf_testmod_test_read(current, &ctx); + trace_bpf_testmod_test_nullable_bare(NULL); + /* Magic number to enable writable tp */ if (len == 64) { struct bpf_testmod_test_writable_ctx writable = { diff --git a/tools/testing/selftests/bpf/prog_tests/tp_btf_nullable.c b/tools/testing/selftests/bpf/prog_tests/tp_btf_nullable.c new file mode 100644 index 000000000000..accc42e01f8a --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/tp_btf_nullable.c @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include "test_tp_btf_nullable.skel.h" + +void test_tp_btf_nullable(void) +{ + 
if (!env.has_testmod) { + test__skip(); + return; + } + + RUN_TESTS(test_tp_btf_nullable); +} diff --git a/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c b/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c new file mode 100644 index 000000000000..bba3e37f749b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include +#include +#include "../bpf_testmod/bpf_testmod.h" +#include "bpf_misc.h" + +SEC("tp_btf/bpf_testmod_test_nullable_bare") +__failure __msg("R1 invalid mem access 'trusted_ptr_or_null_'") +int BPF_PROG(handle_tp_btf_nullable_bare1, struct bpf_testmod_test_read_ctx *nullable_ctx) +{ + return nullable_ctx->len; +} + +SEC("tp_btf/bpf_testmod_test_nullable_bare") +int BPF_PROG(handle_tp_btf_nullable_bare2, struct bpf_testmod_test_read_ctx *nullable_ctx) +{ + if (nullable_ctx) + return nullable_ctx->len; + return 0; +} + +char _license[] SEC("license") = "GPL"; From edd3f6f7588c713477e1299c38c84dcd91a7f148 Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Wed, 11 Sep 2024 11:37:17 +0800 Subject: [PATCH 09/12] tcp: Use skb__nullable in trace_tcp_send_reset Replace skb with skb__nullable as the argument name. The suffix tells the BPF verifier through BTF that the arg could be NULL and should be checked in the tp_btf prog. For now, this is the only nullable argument in the tcp tracepoints. Signed-off-by: Philo Lu Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20240911033719.91468-4-lulie@linux.alibaba.com Signed-off-by: Martin KaFai Lau --- include/trace/events/tcp.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 1c8bd8e186b8..a27c4b619dff 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -91,10 +91,10 @@ DEFINE_RST_REASON(FN, FN) TRACE_EVENT(tcp_send_reset, TP_PROTO(const struct sock *sk, - const struct sk_buff *skb, + const struct sk_buff *skb__nullable, const enum sk_rst_reason reason), - TP_ARGS(sk, skb, reason), + TP_ARGS(sk, skb__nullable, reason), TP_STRUCT__entry( __field(const void *, skbaddr) @@ -106,7 +106,7 @@ TRACE_EVENT(tcp_send_reset, ), TP_fast_assign( - __entry->skbaddr = skb; + __entry->skbaddr = skb__nullable; __entry->skaddr = sk; /* Zero means unknown state. */ __entry->state = sk ? sk->sk_state : 0; @@ -118,13 +118,13 @@ TRACE_EVENT(tcp_send_reset, const struct inet_sock *inet = inet_sk(sk); TP_STORE_ADDR_PORTS(__entry, inet, sk); - } else if (skb) { - const struct tcphdr *th = (const struct tcphdr *)skb->data; + } else if (skb__nullable) { + const struct tcphdr *th = (const struct tcphdr *)skb__nullable->data; /* * We should reverse the 4-tuple of skb, so later * it can print the right flow direction of rst. */ - TP_STORE_ADDR_PORTS_SKB(skb, th, entry->daddr, entry->saddr); + TP_STORE_ADDR_PORTS_SKB(skb__nullable, th, entry->daddr, entry->saddr); } __entry->reason = reason; ), From ffc83860d8c09705d8e83474b8c6ec4d1d3dca41 Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Wed, 11 Sep 2024 11:37:18 +0800 Subject: [PATCH 10/12] bpf: Allow bpf_dynptr_from_skb() for tp_btf Make tp_btf able to use bpf_dynptr_from_skb(), which is useful for skb parsing, especially for non-linear paged skb data. This is achieved by adding the KF_TRUSTED_ARGS flag to bpf_dynptr_from_skb and registering it for TRACING progs. With KF_TRUSTED_ARGS, args from fentry/fexit are excluded, so that unsafe progs like fexit/__kfree_skb are not allowed.
We also need the skb dynptr to be read-only in tp_btf. Because may_access_direct_pkt_data() returns false by default when checking bpf_dynptr_from_skb, there is no need to add BPF_PROG_TYPE_TRACING to it explicitly. Suggested-by: Martin KaFai Lau Signed-off-by: Philo Lu Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240911033719.91468-5-lulie@linux.alibaba.com Signed-off-by: Martin KaFai Lau --- net/core/filter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index f09d875cc053..c6012a3a14b2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -12063,7 +12063,7 @@ int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags, } BTF_KFUNCS_START(bpf_kfunc_check_set_skb) -BTF_ID_FLAGS(func, bpf_dynptr_from_skb) +BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_kfunc_check_set_skb) BTF_KFUNCS_START(bpf_kfunc_check_set_xdp) @@ -12112,6 +12112,7 @@ static int __init bpf_kfunc_init(void) ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_XMIT, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, &bpf_kfunc_set_sock_addr); From 83dff601715bdc086dc1fc470ee3aaff42215e65 Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Wed, 11 Sep 2024 11:37:19 +0800 Subject: [PATCH 11/12] selftests/bpf: Expand skb dynptr selftests for tp_btf Add 3 test cases for skb dynptr used in tp_btf: - test_dynptr_skb_tp_btf: use skb dynptr in tp_btf and make sure it is read-only. - skb_invalid_ctx_fentry/skb_invalid_ctx_fexit: bpf_dynptr_from_skb should fail in fentry/fexit. In test_dynptr_skb_tp_btf, to trigger the tracepoint in kfree_skb, test_pkt_access is used for its test_run, as in kfree_skb.c. Because the test process is different from others, a new setup type is defined, i.e., SETUP_SKB_PROG_TP. The result is like: $ ./test_progs -t 'dynptr/test_dynptr_skb_tp_btf' #84/14 dynptr/test_dynptr_skb_tp_btf:OK #84 dynptr:OK #127 kfunc_dynptr_param:OK Summary: 2/1 PASSED, 0 SKIPPED, 0 FAILED $ ./test_progs -t 'dynptr/skb_invalid_ctx_f' #84/85 dynptr/skb_invalid_ctx_fentry:OK #84/86 dynptr/skb_invalid_ctx_fexit:OK #84 dynptr:OK #127 kfunc_dynptr_param:OK Summary: 2/2 PASSED, 0 SKIPPED, 0 FAILED Also fix two coding style nits (change spaces to tabs). 
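Beyond the write-rejection tests, a usage illustration (editorial sketch, assuming the kfunc declarations from the selftests' bpf_kfuncs.h): the read-only skb dynptr lets a tp_btf program pull data out of a possibly non-linear skb, e.g. the Ethernet header of a dropped packet:

/* Sketch only; mirrors the setup of test_dynptr_skb_tp_btf below. */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include "bpf_kfuncs.h"

SEC("tp_btf/kfree_skb")
int BPF_PROG(parse_dropped_skb, void *skb, void *location)
{
	struct bpf_dynptr ptr;
	struct ethhdr eth;

	if (bpf_dynptr_from_skb(skb, 0, &ptr))
		return 0;

	/* reads work even on paged skb data; writes would return
	 * -EINVAL because tp_btf skbs are read-only
	 */
	if (bpf_dynptr_read(&eth, sizeof(eth), &ptr, 0, 0))
		return 0;

	bpf_printk("dropped skb, h_proto 0x%x", eth.h_proto);
	return 0;
}

char _license[] SEC("license") = "GPL";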
Signed-off-by: Philo Lu Link: https://lore.kernel.org/r/20240911033719.91468-6-lulie@linux.alibaba.com Signed-off-by: Martin KaFai Lau --- .../testing/selftests/bpf/prog_tests/dynptr.c | 37 ++++++++++++++++++- .../testing/selftests/bpf/progs/dynptr_fail.c | 25 +++++++++++++ .../selftests/bpf/progs/dynptr_success.c | 23 ++++++++++++ 3 files changed, 83 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/dynptr.c b/tools/testing/selftests/bpf/prog_tests/dynptr.c index 7cfac53c0d58..b614a5272dfd 100644 --- a/tools/testing/selftests/bpf/prog_tests/dynptr.c +++ b/tools/testing/selftests/bpf/prog_tests/dynptr.c @@ -9,6 +9,7 @@ enum test_setup_type { SETUP_SYSCALL_SLEEP, SETUP_SKB_PROG, + SETUP_SKB_PROG_TP, }; static struct { @@ -28,6 +29,7 @@ static struct { {"test_dynptr_clone", SETUP_SKB_PROG}, {"test_dynptr_skb_no_buff", SETUP_SKB_PROG}, {"test_dynptr_skb_strcmp", SETUP_SKB_PROG}, + {"test_dynptr_skb_tp_btf", SETUP_SKB_PROG_TP}, }; static void verify_success(const char *prog_name, enum test_setup_type setup_type) @@ -35,7 +37,7 @@ static void verify_success(const char *prog_name, enum test_setup_type setup_typ struct dynptr_success *skel; struct bpf_program *prog; struct bpf_link *link; - int err; + int err; skel = dynptr_success__open(); if (!ASSERT_OK_PTR(skel, "dynptr_success__open")) @@ -47,7 +49,7 @@ static void verify_success(const char *prog_name, enum test_setup_type setup_typ if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name")) goto cleanup; - bpf_program__set_autoload(prog, true); + bpf_program__set_autoload(prog, true); err = dynptr_success__load(skel); if (!ASSERT_OK(err, "dynptr_success__load")) @@ -87,6 +89,37 @@ static void verify_success(const char *prog_name, enum test_setup_type setup_typ break; } + case SETUP_SKB_PROG_TP: + { + struct __sk_buff skb = {}; + struct bpf_object *obj; + int aux_prog_fd; + + /* Just use its test_run to trigger kfree_skb tracepoint */ + err = bpf_prog_test_load("./test_pkt_access.bpf.o", BPF_PROG_TYPE_SCHED_CLS, + &obj, &aux_prog_fd); + if (!ASSERT_OK(err, "prog_load sched cls")) + goto cleanup; + + LIBBPF_OPTS(bpf_test_run_opts, topts, + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + .ctx_in = &skb, + .ctx_size_in = sizeof(skb), + ); + + link = bpf_program__attach(prog); + if (!ASSERT_OK_PTR(link, "bpf_program__attach")) + goto cleanup; + + err = bpf_prog_test_run_opts(aux_prog_fd, &topts); + bpf_link__destroy(link); + + if (!ASSERT_OK(err, "test_run")) + goto cleanup; + + break; + } } ASSERT_EQ(skel->bss->err, 0, "err"); diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c index e35bc1eac52a..c3bc186af21e 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c +++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include "bpf_misc.h" #include "bpf_kfuncs.h" @@ -1254,6 +1255,30 @@ int skb_invalid_ctx(void *ctx) return 0; } +SEC("fentry/skb_tx_error") +__failure __msg("must be referenced or trusted") +int BPF_PROG(skb_invalid_ctx_fentry, void *skb) +{ + struct bpf_dynptr ptr; + + /* this should fail */ + bpf_dynptr_from_skb(skb, 0, &ptr); + + return 0; +} + +SEC("fexit/skb_tx_error") +__failure __msg("must be referenced or trusted") +int BPF_PROG(skb_invalid_ctx_fexit, void *skb) +{ + struct bpf_dynptr ptr; + + /* this should fail */ + bpf_dynptr_from_skb(skb, 0, &ptr); + + return 0; +} + /* Reject writes to dynptr slot for uninit arg */ SEC("?raw_tp") __failure 
__msg("potential write to dynptr at off=-16") diff --git a/tools/testing/selftests/bpf/progs/dynptr_success.c b/tools/testing/selftests/bpf/progs/dynptr_success.c index 5985920d162e..bfcc85686cf0 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_success.c +++ b/tools/testing/selftests/bpf/progs/dynptr_success.c @@ -5,6 +5,7 @@ #include #include #include +#include #include "bpf_misc.h" #include "bpf_kfuncs.h" #include "errno.h" @@ -544,3 +545,25 @@ int test_dynptr_skb_strcmp(struct __sk_buff *skb) return 1; } + +SEC("tp_btf/kfree_skb") +int BPF_PROG(test_dynptr_skb_tp_btf, void *skb, void *location) +{ + __u8 write_data[2] = {1, 2}; + struct bpf_dynptr ptr; + int ret; + + if (bpf_dynptr_from_skb(skb, 0, &ptr)) { + err = 1; + return 1; + } + + /* since tp_btf skbs are read only, writes should fail */ + ret = bpf_dynptr_write(&ptr, 0, write_data, sizeof(write_data), 0); + if (ret != -EINVAL) { + err = 2; + return 1; + } + + return 1; +} From b1339be951ad31947ae19bc25cb08769bf255100 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 6 Sep 2024 15:44:49 +0000 Subject: [PATCH 12/12] sock_map: Add a cond_resched() in sock_hash_free() Several syzbot soft lockup reports all have sock_hash_free() in common. If a map with a large number of buckets is destroyed, we need to periodically yield the cpu. Fixes: 75e68e5bf2c7 ("bpf, sockhash: Synchronize delete from bucket list on map free") Reported-by: syzbot Signed-off-by: Eric Dumazet Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20240906154449.3742932-1-edumazet@google.com --- net/core/sock_map.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index d3dbb92153f2..724b6856fcc3 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1183,6 +1183,7 @@ static void sock_hash_free(struct bpf_map *map) sock_put(elem->sk); sock_hash_free_elem(htab, elem); } + cond_resched(); } /* wait for psock readers accessing its map link */