From 67059b61597c004e23b074547b40604222bee3a0 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Wed, 1 Nov 2023 09:33:51 +0800 Subject: [PATCH 1/6] netfilter: nft_set_rbtree: Remove unused variable nft_net MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The code that uses nft_net has been removed, and the nft_pernet function is merely obtaining a reference to shared data through the net pointer. The content of the net pointer is not modified or changed, so both of them should be removed. silence the warning: net/netfilter/nft_set_rbtree.c:627:26: warning: variable ‘nft_net’ set but not used Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=7103 Signed-off-by: Yang Li Reviewed-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_rbtree.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 6f1186abd47b..baa3fea4fe65 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -624,14 +624,12 @@ static void nft_rbtree_gc(struct nft_set *set) { struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe, *rbe_end = NULL; - struct nftables_pernet *nft_net; struct rb_node *node, *next; struct nft_trans_gc *gc; struct net *net; set = nft_set_container_of(priv); net = read_pnet(&set->net); - nft_net = nft_pernet(net); gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL); if (!gc) From a44af08e3d4d7566eeea98d7a29fe06e7b9de944 Mon Sep 17 00:00:00 2001 From: Linkui Xiao Date: Wed, 1 Nov 2023 11:20:18 +0800 Subject: [PATCH 2/6] netfilter: nf_conntrack_bridge: initialize err to 0 K2CI reported a problem: consume_skb(skb); return err; [nf_br_ip_fragment() error] uninitialized symbol 'err'. err is not initialized, because returning 0 is expected, initialize err to 0. Fixes: 3c171f496ef5 ("netfilter: bridge: add connection tracking system") Reported-by: k2ci Signed-off-by: Linkui Xiao Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/nf_conntrack_bridge.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c index b5c406a6e765..abb090f94ed2 100644 --- a/net/bridge/netfilter/nf_conntrack_bridge.c +++ b/net/bridge/netfilter/nf_conntrack_bridge.c @@ -37,7 +37,7 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk, ktime_t tstamp = skb->tstamp; struct ip_frag_state state; struct iphdr *iph; - int err; + int err = 0; /* for offloaded checksums cleanup checksum before fragmentation */ if (skb->ip_summed == CHECKSUM_PARTIAL && From c301f0981fdd3fd1ffac6836b423c4d7a8e0eb63 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 3 Nov 2023 09:42:51 +0300 Subject: [PATCH 3/6] netfilter: nf_tables: fix pointer math issue in nft_byteorder_eval() The problem is in nft_byteorder_eval() where we are iterating through a loop and writing to dst[0], dst[1], dst[2] and so on... On each iteration we are writing 8 bytes. But dst[] is an array of u32 so each element only has space for 4 bytes. That means that every iteration overwrites part of the previous element. I spotted this bug while reviewing commit caf3ef7468f7 ("netfilter: nf_tables: prevent OOB access in nft_byteorder_eval") which is a related issue. I think that the reason we have not detected this bug in testing is that most of time we only write one element. Fixes: ce1e7989d989 ("netfilter: nft_byteorder: provide 64bit le/be conversion") Signed-off-by: Dan Carpenter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 ++-- net/netfilter/nft_byteorder.c | 5 +++-- net/netfilter/nft_meta.c | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 3bbd13ab1ecf..b157c5cafd14 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -178,9 +178,9 @@ static inline __be32 nft_reg_load_be32(const u32 *sreg) return *(__force __be32 *)sreg; } -static inline void nft_reg_store64(u32 *dreg, u64 val) +static inline void nft_reg_store64(u64 *dreg, u64 val) { - put_unaligned(val, (u64 *)dreg); + put_unaligned(val, dreg); } static inline u64 nft_reg_load64(const u32 *sreg) diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c index e596d1a842f7..f6e791a68101 100644 --- a/net/netfilter/nft_byteorder.c +++ b/net/netfilter/nft_byteorder.c @@ -38,13 +38,14 @@ void nft_byteorder_eval(const struct nft_expr *expr, switch (priv->size) { case 8: { + u64 *dst64 = (void *)dst; u64 src64; switch (priv->op) { case NFT_BYTEORDER_NTOH: for (i = 0; i < priv->len / 8; i++) { src64 = nft_reg_load64(&src[i]); - nft_reg_store64(&dst[i], + nft_reg_store64(&dst64[i], be64_to_cpu((__force __be64)src64)); } break; @@ -52,7 +53,7 @@ void nft_byteorder_eval(const struct nft_expr *expr, for (i = 0; i < priv->len / 8; i++) { src64 = (__force __u64) cpu_to_be64(nft_reg_load64(&src[i])); - nft_reg_store64(&dst[i], src64); + nft_reg_store64(&dst64[i], src64); } break; } diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index f7da7c43333b..ba0d3683a45d 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -63,7 +63,7 @@ nft_meta_get_eval_time(enum nft_meta_keys key, { switch (key) { case NFT_META_TIME_NS: - nft_reg_store64(dest, ktime_get_real_ns()); + nft_reg_store64((u64 *)dest, ktime_get_real_ns()); break; case NFT_META_TIME_DAY: nft_reg_store8(dest, nft_meta_weekday()); From a7d5a955bfa854ac6b0c53aaf933394b4e6139e4 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 13 Nov 2023 20:34:56 +0100 Subject: [PATCH 4/6] netfilter: nf_tables: bogus ENOENT when destroying element which does not exist destroy element command bogusly reports ENOENT in case a set element does not exist. ENOENT errors are skipped, however, err is still set and propagated to userspace. # nft destroy element ip raw BLACKLIST { 1.2.3.4 } Error: Could not process rule: No such file or directory destroy element ip raw BLACKLIST { 1.2.3.4 } ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Fixes: f80a612dd77c ("netfilter: nf_tables: add support to destroy operation") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a761ee6796f6..debea1c67701 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -7263,10 +7263,11 @@ static int nf_tables_delsetelem(struct sk_buff *skb, if (err < 0) { NL_SET_BAD_ATTR(extack, attr); - break; + return err; } } - return err; + + return 0; } /* From 28628fa952fefc7f2072ce6e8016968cc452b1ba Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Mon, 13 Nov 2023 21:13:23 +0100 Subject: [PATCH 5/6] netfilter: ipset: fix race condition between swap/destroy and kernel side add/del/test Linkui Xiao reported that there's a race condition when ipset swap and destroy is called, which can lead to crash in add/del/test element operations. Swap then destroy are usual operations to replace a set with another one in a production system. The issue can in some cases be reproduced with the script: ipset create hash_ip1 hash:net family inet hashsize 1024 maxelem 1048576 ipset add hash_ip1 172.20.0.0/16 ipset add hash_ip1 192.168.0.0/16 iptables -A INPUT -m set --match-set hash_ip1 src -j ACCEPT while [ 1 ] do # ... Ongoing traffic... ipset create hash_ip2 hash:net family inet hashsize 1024 maxelem 1048576 ipset add hash_ip2 172.20.0.0/16 ipset swap hash_ip1 hash_ip2 ipset destroy hash_ip2 sleep 0.05 done In the race case the possible order of the operations are CPU0 CPU1 ip_set_test ipset swap hash_ip1 hash_ip2 ipset destroy hash_ip2 hash_net_kadt Swap replaces hash_ip1 with hash_ip2 and then destroy removes hash_ip2 which is the original hash_ip1. ip_set_test was called on hash_ip1 and because destroy removed it, hash_net_kadt crashes. The fix is to force ip_set_swap() to wait for all readers to finish accessing the old set pointers by calling synchronize_rcu(). The first version of the patch was written by Linkui Xiao . v2: synchronize_rcu() is moved into ip_set_swap() in order not to burden ip_set_destroy() unnecessarily when all sets are destroyed. v3: Florian Westphal pointed out that all netfilter hooks run with rcu_read_lock() held and em_ipset.c wraps the entire ip_set_test() in rcu read lock/unlock pair. So there's no need to extend the rcu read locked area in ipset itself. Closes: https://lore.kernel.org/all/69e7963b-e7f8-3ad0-210-7b86eebf7f78@netfilter.org/ Reported by: Linkui Xiao Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_core.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 35d2f9c9ada0..4c133e06be1d 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -61,6 +61,8 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); ip_set_dereference((inst)->ip_set_list)[id] #define ip_set_ref_netlink(inst,id) \ rcu_dereference_raw((inst)->ip_set_list)[id] +#define ip_set_dereference_nfnl(p) \ + rcu_dereference_check(p, lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET)) /* The set types are implemented in modules and registered set types * can be found in ip_set_type_list. Adding/deleting types is @@ -708,15 +710,10 @@ __ip_set_put_netlink(struct ip_set *set) static struct ip_set * ip_set_rcu_get(struct net *net, ip_set_id_t index) { - struct ip_set *set; struct ip_set_net *inst = ip_set_pernet(net); - rcu_read_lock(); - /* ip_set_list itself needs to be protected */ - set = rcu_dereference(inst->ip_set_list)[index]; - rcu_read_unlock(); - - return set; + /* ip_set_list and the set pointer need to be protected */ + return ip_set_dereference_nfnl(inst->ip_set_list)[index]; } static inline void @@ -1397,6 +1394,9 @@ static int ip_set_swap(struct sk_buff *skb, const struct nfnl_info *info, ip_set(inst, to_id) = from; write_unlock_bh(&ip_set_ref_lock); + /* Make sure all readers of the old set pointers are completed. */ + synchronize_rcu(); + return 0; } From 8837ba3e58ea1e3d09ae36db80b1e80853aada95 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 13 Nov 2023 21:13:31 +0100 Subject: [PATCH 6/6] netfilter: nf_tables: split async and sync catchall in two functions list_for_each_entry_safe() does not work for the async case which runs under RCU, therefore, split GC logic for catchall in two functions instead, one for each of the sync and async GC variants. The catchall sync GC variant never sees a _DEAD bit set on ever, thus, this handling is removed in such case, moreover, allocate GC sync batch via GFP_KERNEL. Fixes: 93995bf4af2c ("netfilter: nf_tables: remove catchall element in GC sync path") Reported-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 55 +++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 25 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index debea1c67701..c0a42989b982 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -9680,16 +9680,14 @@ void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans) call_rcu(&trans->rcu, nft_trans_gc_trans_free); } -static struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc, - unsigned int gc_seq, - bool sync) +struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc, + unsigned int gc_seq) { - struct nft_set_elem_catchall *catchall, *next; + struct nft_set_elem_catchall *catchall; const struct nft_set *set = gc->set; - struct nft_elem_priv *elem_priv; struct nft_set_ext *ext; - list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { + list_for_each_entry_rcu(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); if (!nft_set_elem_expired(ext)) @@ -9699,35 +9697,42 @@ static struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc, nft_set_elem_dead(ext); dead_elem: - if (sync) - gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); - else - gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); - + gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); if (!gc) return NULL; - elem_priv = catchall->elem; - if (sync) { - nft_setelem_data_deactivate(gc->net, gc->set, elem_priv); - nft_setelem_catchall_destroy(catchall); - } - - nft_trans_gc_elem_add(gc, elem_priv); + nft_trans_gc_elem_add(gc, catchall->elem); } return gc; } -struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc, - unsigned int gc_seq) -{ - return nft_trans_gc_catchall(gc, gc_seq, false); -} - struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc) { - return nft_trans_gc_catchall(gc, 0, true); + struct nft_set_elem_catchall *catchall, *next; + const struct nft_set *set = gc->set; + struct nft_elem_priv *elem_priv; + struct nft_set_ext *ext; + + WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net)); + + list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + + if (!nft_set_elem_expired(ext)) + continue; + + gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); + if (!gc) + return NULL; + + elem_priv = catchall->elem; + nft_setelem_data_deactivate(gc->net, gc->set, elem_priv); + nft_setelem_catchall_destroy(catchall); + nft_trans_gc_elem_add(gc, elem_priv); + } + + return gc; } static void nf_tables_module_autoload_cleanup(struct net *net)