linux/drivers/net/veth.c

1980 lines
46 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0-only
/*
* drivers/net/veth.c
*
* Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc
*
* Author: Pavel Emelianov <xemul@openvz.org>
* Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com>
*
*/
#include <linux/netdevice.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. Signed-off-by: Tejun Heo <tj@kernel.org> Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 08:04:11 +00:00
#include <linux/slab.h>
#include <linux/ethtool.h>
#include <linux/etherdevice.h>
#include <linux/u64_stats_sync.h>
#include <net/rtnetlink.h>
#include <net/dst.h>
#include <net/xfrm.h>
#include <net/xdp.h>
#include <linux/veth.h>
#include <linux/module.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/ptr_ring.h>
#include <linux/bpf_trace.h>
#include <linux/net_tstamp.h>
#include <linux/skbuff_ref.h>
#include <net/page_pool/helpers.h>
#define DRV_NAME "veth"
#define DRV_VERSION "1.0"
#define VETH_XDP_FLAG BIT(0)
#define VETH_RING_SIZE 256
#define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN)
#define VETH_XDP_TX_BULK_SIZE 16
#define VETH_XDP_BATCH 16
struct veth_stats {
u64 rx_drops;
/* xdp */
u64 xdp_packets;
u64 xdp_bytes;
u64 xdp_redirect;
u64 xdp_drops;
u64 xdp_tx;
u64 xdp_tx_err;
u64 peer_tq_xdp_xmit;
u64 peer_tq_xdp_xmit_err;
};
struct veth_rq_stats {
struct veth_stats vs;
struct u64_stats_sync syncp;
};
struct veth_rq {
struct napi_struct xdp_napi;
struct napi_struct __rcu *napi; /* points to xdp_napi when the latter is initialized */
struct net_device *dev;
struct bpf_prog __rcu *xdp_prog;
struct xdp_mem_info xdp_mem;
struct veth_rq_stats stats;
bool rx_notify_masked;
struct ptr_ring xdp_ring;
struct xdp_rxq_info xdp_rxq;
struct page_pool *page_pool;
};
struct veth_priv {
struct net_device __rcu *peer;
atomic64_t dropped;
struct bpf_prog *_xdp_prog;
struct veth_rq *rq;
unsigned int requested_headroom;
};
struct veth_xdp_tx_bq {
struct xdp_frame *q[VETH_XDP_TX_BULK_SIZE];
unsigned int count;
};
/*
* ethtool interface
*/
struct veth_q_stat_desc {
char desc[ETH_GSTRING_LEN];
size_t offset;
};
#define VETH_RQ_STAT(m) offsetof(struct veth_stats, m)
static const struct veth_q_stat_desc veth_rq_stats_desc[] = {
{ "xdp_packets", VETH_RQ_STAT(xdp_packets) },
{ "xdp_bytes", VETH_RQ_STAT(xdp_bytes) },
{ "drops", VETH_RQ_STAT(rx_drops) },
{ "xdp_redirect", VETH_RQ_STAT(xdp_redirect) },
{ "xdp_drops", VETH_RQ_STAT(xdp_drops) },
{ "xdp_tx", VETH_RQ_STAT(xdp_tx) },
{ "xdp_tx_errors", VETH_RQ_STAT(xdp_tx_err) },
};
#define VETH_RQ_STATS_LEN ARRAY_SIZE(veth_rq_stats_desc)
static const struct veth_q_stat_desc veth_tq_stats_desc[] = {
{ "xdp_xmit", VETH_RQ_STAT(peer_tq_xdp_xmit) },
{ "xdp_xmit_errors", VETH_RQ_STAT(peer_tq_xdp_xmit_err) },
};
#define VETH_TQ_STATS_LEN ARRAY_SIZE(veth_tq_stats_desc)
static struct {
const char string[ETH_GSTRING_LEN];
} ethtool_stats_keys[] = {
{ "peer_ifindex" },
};
struct veth_xdp_buff {
struct xdp_buff xdp;
struct sk_buff *skb;
};
static int veth_get_link_ksettings(struct net_device *dev,
struct ethtool_link_ksettings *cmd)
{
cmd->base.speed = SPEED_10000;
cmd->base.duplex = DUPLEX_FULL;
cmd->base.port = PORT_TP;
cmd->base.autoneg = AUTONEG_DISABLE;
return 0;
}
static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
{
strscpy(info->driver, DRV_NAME, sizeof(info->driver));
strscpy(info->version, DRV_VERSION, sizeof(info->version));
}
static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf)
{
u8 *p = buf;
int i, j;
switch(stringset) {
case ETH_SS_STATS:
memcpy(p, &ethtool_stats_keys, sizeof(ethtool_stats_keys));
p += sizeof(ethtool_stats_keys);
for (i = 0; i < dev->real_num_rx_queues; i++)
for (j = 0; j < VETH_RQ_STATS_LEN; j++)
ethtool_sprintf(&p, "rx_queue_%u_%.18s",
i, veth_rq_stats_desc[j].desc);
for (i = 0; i < dev->real_num_tx_queues; i++)
for (j = 0; j < VETH_TQ_STATS_LEN; j++)
ethtool_sprintf(&p, "tx_queue_%u_%.18s",
i, veth_tq_stats_desc[j].desc);
page_pool_ethtool_stats_get_strings(p);
break;
}
}
static int veth_get_sset_count(struct net_device *dev, int sset)
{
switch (sset) {
case ETH_SS_STATS:
return ARRAY_SIZE(ethtool_stats_keys) +
VETH_RQ_STATS_LEN * dev->real_num_rx_queues +
VETH_TQ_STATS_LEN * dev->real_num_tx_queues +
page_pool_ethtool_stats_get_count();
default:
return -EOPNOTSUPP;
}
}
static void veth_get_page_pool_stats(struct net_device *dev, u64 *data)
{
#ifdef CONFIG_PAGE_POOL_STATS
struct veth_priv *priv = netdev_priv(dev);
struct page_pool_stats pp_stats = {};
int i;
for (i = 0; i < dev->real_num_rx_queues; i++) {
if (!priv->rq[i].page_pool)
continue;
page_pool_get_stats(priv->rq[i].page_pool, &pp_stats);
}
page_pool_ethtool_stats_get(data, &pp_stats);
#endif /* CONFIG_PAGE_POOL_STATS */
}
static void veth_get_ethtool_stats(struct net_device *dev,
struct ethtool_stats *stats, u64 *data)
{
struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
veth: avoid a NULL deref in veth_stats_one commit 2681128f0ced8a (veth: extend device features) added a NULL deref in veth_stats_one(), as veth_get_stats64() was not testing if the peer device was setup or not. At init time, we call dev_get_stats() before veth pair is fully setup. [ 178.854758] [<ffffffffa00f5677>] veth_get_stats64+0x47/0x70 [veth] [ 178.861013] [<ffffffff814f0a2d>] dev_get_stats+0x6d/0x130 [ 178.866486] [<ffffffff81504efc>] rtnl_fill_ifinfo+0x47c/0x930 [ 178.872299] [<ffffffff81505b93>] rtmsg_ifinfo+0x83/0x100 [ 178.877678] [<ffffffff81505cc6>] rtnl_configure_link+0x76/0xa0 [ 178.883580] [<ffffffffa00f52fa>] veth_newlink+0x16a/0x350 [veth] [ 178.889654] [<ffffffff815061cc>] rtnl_newlink+0x4dc/0x5e0 [ 178.895128] [<ffffffff81505e1e>] ? rtnl_newlink+0x12e/0x5e0 [ 178.900769] [<ffffffff8150587d>] rtnetlink_rcv_msg+0x11d/0x310 [ 178.906669] [<ffffffff81505760>] ? __rtnl_unlock+0x20/0x20 [ 178.912225] [<ffffffff81521f89>] netlink_rcv_skb+0xa9/0xd0 [ 178.917779] [<ffffffff81502d55>] rtnetlink_rcv+0x25/0x40 [ 178.923159] [<ffffffff815218d1>] netlink_unicast+0x1b1/0x230 [ 178.928887] [<ffffffff81521c4e>] netlink_sendmsg+0x2fe/0x3b0 [ 178.934615] [<ffffffff814dbe22>] sock_sendmsg+0xd2/0xf0 So we must check if peer was setup in veth_get_stats64() As pointed out by Ben Hutchings, priv->peer is missing proper synchronization. Adding RCU protection is a safe and well documented way to make sure we don't access about to be freed or already freed data. Reported-by: Tom Parkin <tparkin@katalix.com> Signed-off-by: Eric Dumazet <edumazet@google.com> CC: Ben Hutchings <bhutchings@solarflare.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-01-04 15:42:40 +00:00
struct net_device *peer = rtnl_dereference(priv->peer);
int i, j, idx, pp_idx;
veth: avoid a NULL deref in veth_stats_one commit 2681128f0ced8a (veth: extend device features) added a NULL deref in veth_stats_one(), as veth_get_stats64() was not testing if the peer device was setup or not. At init time, we call dev_get_stats() before veth pair is fully setup. [ 178.854758] [<ffffffffa00f5677>] veth_get_stats64+0x47/0x70 [veth] [ 178.861013] [<ffffffff814f0a2d>] dev_get_stats+0x6d/0x130 [ 178.866486] [<ffffffff81504efc>] rtnl_fill_ifinfo+0x47c/0x930 [ 178.872299] [<ffffffff81505b93>] rtmsg_ifinfo+0x83/0x100 [ 178.877678] [<ffffffff81505cc6>] rtnl_configure_link+0x76/0xa0 [ 178.883580] [<ffffffffa00f52fa>] veth_newlink+0x16a/0x350 [veth] [ 178.889654] [<ffffffff815061cc>] rtnl_newlink+0x4dc/0x5e0 [ 178.895128] [<ffffffff81505e1e>] ? rtnl_newlink+0x12e/0x5e0 [ 178.900769] [<ffffffff8150587d>] rtnetlink_rcv_msg+0x11d/0x310 [ 178.906669] [<ffffffff81505760>] ? __rtnl_unlock+0x20/0x20 [ 178.912225] [<ffffffff81521f89>] netlink_rcv_skb+0xa9/0xd0 [ 178.917779] [<ffffffff81502d55>] rtnetlink_rcv+0x25/0x40 [ 178.923159] [<ffffffff815218d1>] netlink_unicast+0x1b1/0x230 [ 178.928887] [<ffffffff81521c4e>] netlink_sendmsg+0x2fe/0x3b0 [ 178.934615] [<ffffffff814dbe22>] sock_sendmsg+0xd2/0xf0 So we must check if peer was setup in veth_get_stats64() As pointed out by Ben Hutchings, priv->peer is missing proper synchronization. Adding RCU protection is a safe and well documented way to make sure we don't access about to be freed or already freed data. Reported-by: Tom Parkin <tparkin@katalix.com> Signed-off-by: Eric Dumazet <edumazet@google.com> CC: Ben Hutchings <bhutchings@solarflare.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-01-04 15:42:40 +00:00
data[0] = peer ? peer->ifindex : 0;
idx = 1;
for (i = 0; i < dev->real_num_rx_queues; i++) {
const struct veth_rq_stats *rq_stats = &priv->rq[i].stats;
const void *stats_base = (void *)&rq_stats->vs;
unsigned int start;
size_t offset;
do {
start = u64_stats_fetch_begin(&rq_stats->syncp);
for (j = 0; j < VETH_RQ_STATS_LEN; j++) {
offset = veth_rq_stats_desc[j].offset;
data[idx + j] = *(u64 *)(stats_base + offset);
}
} while (u64_stats_fetch_retry(&rq_stats->syncp, start));
idx += VETH_RQ_STATS_LEN;
}
pp_idx = idx;
if (!peer)
goto page_pool_stats;
rcv_priv = netdev_priv(peer);
for (i = 0; i < peer->real_num_rx_queues; i++) {
const struct veth_rq_stats *rq_stats = &rcv_priv->rq[i].stats;
const void *base = (void *)&rq_stats->vs;
unsigned int start, tx_idx = idx;
size_t offset;
tx_idx += (i % dev->real_num_tx_queues) * VETH_TQ_STATS_LEN;
do {
start = u64_stats_fetch_begin(&rq_stats->syncp);
for (j = 0; j < VETH_TQ_STATS_LEN; j++) {
offset = veth_tq_stats_desc[j].offset;
data[tx_idx + j] += *(u64 *)(base + offset);
}
} while (u64_stats_fetch_retry(&rq_stats->syncp, start));
}
pp_idx = idx + dev->real_num_tx_queues * VETH_TQ_STATS_LEN;
page_pool_stats:
veth_get_page_pool_stats(dev, &data[pp_idx]);
}
static void veth_get_channels(struct net_device *dev,
struct ethtool_channels *channels)
{
channels->tx_count = dev->real_num_tx_queues;
channels->rx_count = dev->real_num_rx_queues;
channels->max_tx = dev->num_tx_queues;
channels->max_rx = dev->num_rx_queues;
}
static int veth_set_channels(struct net_device *dev,
struct ethtool_channels *ch);
static const struct ethtool_ops veth_ethtool_ops = {
.get_drvinfo = veth_get_drvinfo,
.get_link = ethtool_op_get_link,
.get_strings = veth_get_strings,
.get_sset_count = veth_get_sset_count,
.get_ethtool_stats = veth_get_ethtool_stats,
.get_link_ksettings = veth_get_link_ksettings,
.get_ts_info = ethtool_op_get_ts_info,
.get_channels = veth_get_channels,
.set_channels = veth_set_channels,
};
/* general routines */
static bool veth_is_xdp_frame(void *ptr)
{
return (unsigned long)ptr & VETH_XDP_FLAG;
}
static struct xdp_frame *veth_ptr_to_xdp(void *ptr)
{
return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG);
}
static void *veth_xdp_to_ptr(struct xdp_frame *xdp)
{
return (void *)((unsigned long)xdp | VETH_XDP_FLAG);
}
static void veth_ptr_free(void *ptr)
{
if (veth_is_xdp_frame(ptr))
xdp_return_frame(veth_ptr_to_xdp(ptr));
else
kfree_skb(ptr);
}
static void __veth_xdp_flush(struct veth_rq *rq)
{
/* Write ptr_ring before reading rx_notify_masked */
smp_mb();
veth: fix races around rq->rx_notify_masked veth being NETIF_F_LLTX enabled, we need to be more careful whenever we read/write rq->rx_notify_masked. BUG: KCSAN: data-race in veth_xmit / veth_xmit write to 0xffff888133d9a9f8 of 1 bytes by task 23552 on cpu 0: __veth_xdp_flush drivers/net/veth.c:269 [inline] veth_xmit+0x307/0x470 drivers/net/veth.c:350 __netdev_start_xmit include/linux/netdevice.h:4683 [inline] netdev_start_xmit include/linux/netdevice.h:4697 [inline] xmit_one+0x105/0x2f0 net/core/dev.c:3473 dev_hard_start_xmit net/core/dev.c:3489 [inline] __dev_queue_xmit+0x86d/0xf90 net/core/dev.c:4116 dev_queue_xmit+0x13/0x20 net/core/dev.c:4149 br_dev_queue_push_xmit+0x3ce/0x430 net/bridge/br_forward.c:53 NF_HOOK include/linux/netfilter.h:307 [inline] br_forward_finish net/bridge/br_forward.c:66 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] __br_forward+0x2e4/0x400 net/bridge/br_forward.c:115 br_flood+0x521/0x5c0 net/bridge/br_forward.c:242 br_dev_xmit+0x8b6/0x960 __netdev_start_xmit include/linux/netdevice.h:4683 [inline] netdev_start_xmit include/linux/netdevice.h:4697 [inline] xmit_one+0x105/0x2f0 net/core/dev.c:3473 dev_hard_start_xmit net/core/dev.c:3489 [inline] __dev_queue_xmit+0x86d/0xf90 net/core/dev.c:4116 dev_queue_xmit+0x13/0x20 net/core/dev.c:4149 neigh_hh_output include/net/neighbour.h:525 [inline] neigh_output include/net/neighbour.h:539 [inline] ip_finish_output2+0x6f8/0xb70 net/ipv4/ip_output.c:228 ip_finish_output+0xfb/0x240 net/ipv4/ip_output.c:316 NF_HOOK_COND include/linux/netfilter.h:296 [inline] ip_output+0xf3/0x1a0 net/ipv4/ip_output.c:430 dst_output include/net/dst.h:451 [inline] ip_local_out net/ipv4/ip_output.c:126 [inline] ip_send_skb+0x6e/0xe0 net/ipv4/ip_output.c:1570 udp_send_skb+0x641/0x880 net/ipv4/udp.c:967 udp_sendmsg+0x12ea/0x14c0 net/ipv4/udp.c:1254 inet_sendmsg+0x5f/0x80 net/ipv4/af_inet.c:819 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg net/socket.c:725 [inline] ____sys_sendmsg+0x39a/0x510 net/socket.c:2413 ___sys_sendmsg net/socket.c:2467 [inline] __sys_sendmmsg+0x267/0x4c0 net/socket.c:2553 __do_sys_sendmmsg net/socket.c:2582 [inline] __se_sys_sendmmsg net/socket.c:2579 [inline] __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2579 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae read to 0xffff888133d9a9f8 of 1 bytes by task 23563 on cpu 1: __veth_xdp_flush drivers/net/veth.c:268 [inline] veth_xmit+0x2d6/0x470 drivers/net/veth.c:350 __netdev_start_xmit include/linux/netdevice.h:4683 [inline] netdev_start_xmit include/linux/netdevice.h:4697 [inline] xmit_one+0x105/0x2f0 net/core/dev.c:3473 dev_hard_start_xmit net/core/dev.c:3489 [inline] __dev_queue_xmit+0x86d/0xf90 net/core/dev.c:4116 dev_queue_xmit+0x13/0x20 net/core/dev.c:4149 br_dev_queue_push_xmit+0x3ce/0x430 net/bridge/br_forward.c:53 NF_HOOK include/linux/netfilter.h:307 [inline] br_forward_finish net/bridge/br_forward.c:66 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] __br_forward+0x2e4/0x400 net/bridge/br_forward.c:115 br_flood+0x521/0x5c0 net/bridge/br_forward.c:242 br_dev_xmit+0x8b6/0x960 __netdev_start_xmit include/linux/netdevice.h:4683 [inline] netdev_start_xmit include/linux/netdevice.h:4697 [inline] xmit_one+0x105/0x2f0 net/core/dev.c:3473 dev_hard_start_xmit net/core/dev.c:3489 [inline] __dev_queue_xmit+0x86d/0xf90 net/core/dev.c:4116 dev_queue_xmit+0x13/0x20 net/core/dev.c:4149 neigh_hh_output include/net/neighbour.h:525 [inline] neigh_output include/net/neighbour.h:539 [inline] ip_finish_output2+0x6f8/0xb70 net/ipv4/ip_output.c:228 ip_finish_output+0xfb/0x240 net/ipv4/ip_output.c:316 NF_HOOK_COND include/linux/netfilter.h:296 [inline] ip_output+0xf3/0x1a0 net/ipv4/ip_output.c:430 dst_output include/net/dst.h:451 [inline] ip_local_out net/ipv4/ip_output.c:126 [inline] ip_send_skb+0x6e/0xe0 net/ipv4/ip_output.c:1570 udp_send_skb+0x641/0x880 net/ipv4/udp.c:967 udp_sendmsg+0x12ea/0x14c0 net/ipv4/udp.c:1254 inet_sendmsg+0x5f/0x80 net/ipv4/af_inet.c:819 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg net/socket.c:725 [inline] ____sys_sendmsg+0x39a/0x510 net/socket.c:2413 ___sys_sendmsg net/socket.c:2467 [inline] __sys_sendmmsg+0x267/0x4c0 net/socket.c:2553 __do_sys_sendmmsg net/socket.c:2582 [inline] __se_sys_sendmmsg net/socket.c:2579 [inline] __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2579 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x00 -> 0x01 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 23563 Comm: syz-executor.5 Not tainted 5.17.0-rc2-syzkaller-00064-gc36c04c2e132 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 948d4f214fde ("veth: Add driver XDP") Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> Reported-by: syzbot <syzkaller@googlegroups.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-08 23:28:22 +00:00
if (!READ_ONCE(rq->rx_notify_masked) &&
napi_schedule_prep(&rq->xdp_napi)) {
WRITE_ONCE(rq->rx_notify_masked, true);
__napi_schedule(&rq->xdp_napi);
}
}
static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb)
{
if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) {
dev_kfree_skb_any(skb);
return NET_RX_DROP;
}
return NET_RX_SUCCESS;
}
static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb,
struct veth_rq *rq, bool xdp)
{
return __dev_forward_skb(dev, skb) ?: xdp ?
veth_xdp_rx(rq, skb) :
net: dev: Makes sure netif_rx() can be invoked in any context. Dave suggested a while ago (eleven years by now) "Let's make netif_rx() work in all contexts and get rid of netif_rx_ni()". Eric agreed and pointed out that modern devices should use netif_receive_skb() to avoid the overhead. In the meantime someone added another variant, netif_rx_any_context(), which behaves as suggested. netif_rx() must be invoked with disabled bottom halves to ensure that pending softirqs, which were raised within the function, are handled. netif_rx_ni() can be invoked only from process context (bottom halves must be enabled) because the function handles pending softirqs without checking if bottom halves were disabled or not. netif_rx_any_context() invokes on the former functions by checking in_interrupts(). netif_rx() could be taught to handle both cases (disabled and enabled bottom halves) by simply disabling bottom halves while invoking netif_rx_internal(). The local_bh_enable() invocation will then invoke pending softirqs only if the BH-disable counter drops to zero. Eric is concerned about the overhead of BH-disable+enable especially in regard to the loopback driver. As critical as this driver is, it will receive a shortcut to avoid the additional overhead which is not needed. Add a local_bh_disable() section in netif_rx() to ensure softirqs are handled if needed. Provide __netif_rx() which does not disable BH and has a lockdep assert to ensure that interrupts are disabled. Use this shortcut in the loopback driver and in drivers/net/*.c. Make netif_rx_ni() and netif_rx_any_context() invoke netif_rx() so they can be removed once they are no more users left. Link: https://lkml.kernel.org/r/20100415.020246.218622820.davem@davemloft.net Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> Reviewed-by: Eric Dumazet <edumazet@google.com> Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-11 23:38:38 +00:00
__netif_rx(skb);
}
/* return true if the specified skb has chances of GRO aggregation
* Don't strive for accuracy, but try to avoid GRO overhead in the most
* common scenarios.
* When XDP is enabled, all traffic is considered eligible, as the xmit
* device has TSO off.
* When TSO is enabled on the xmit device, we are likely interested only
* in UDP aggregation, explicitly check for that if the skb is suspected
* - the sock_wfree destructor is used by UDP, ICMP and XDP sockets -
* to belong to locally generated UDP traffic.
*/
static bool veth_skb_is_eligible_for_gro(const struct net_device *dev,
const struct net_device *rcv,
const struct sk_buff *skb)
{
return !(dev->features & NETIF_F_ALL_TSO) ||
(skb->destructor == sock_wfree &&
rcv->features & (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD));
}
static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
struct veth_rq *rq = NULL;
int ret = NETDEV_TX_OK;
veth: avoid a NULL deref in veth_stats_one commit 2681128f0ced8a (veth: extend device features) added a NULL deref in veth_stats_one(), as veth_get_stats64() was not testing if the peer device was setup or not. At init time, we call dev_get_stats() before veth pair is fully setup. [ 178.854758] [<ffffffffa00f5677>] veth_get_stats64+0x47/0x70 [veth] [ 178.861013] [<ffffffff814f0a2d>] dev_get_stats+0x6d/0x130 [ 178.866486] [<ffffffff81504efc>] rtnl_fill_ifinfo+0x47c/0x930 [ 178.872299] [<ffffffff81505b93>] rtmsg_ifinfo+0x83/0x100 [ 178.877678] [<ffffffff81505cc6>] rtnl_configure_link+0x76/0xa0 [ 178.883580] [<ffffffffa00f52fa>] veth_newlink+0x16a/0x350 [veth] [ 178.889654] [<ffffffff815061cc>] rtnl_newlink+0x4dc/0x5e0 [ 178.895128] [<ffffffff81505e1e>] ? rtnl_newlink+0x12e/0x5e0 [ 178.900769] [<ffffffff8150587d>] rtnetlink_rcv_msg+0x11d/0x310 [ 178.906669] [<ffffffff81505760>] ? __rtnl_unlock+0x20/0x20 [ 178.912225] [<ffffffff81521f89>] netlink_rcv_skb+0xa9/0xd0 [ 178.917779] [<ffffffff81502d55>] rtnetlink_rcv+0x25/0x40 [ 178.923159] [<ffffffff815218d1>] netlink_unicast+0x1b1/0x230 [ 178.928887] [<ffffffff81521c4e>] netlink_sendmsg+0x2fe/0x3b0 [ 178.934615] [<ffffffff814dbe22>] sock_sendmsg+0xd2/0xf0 So we must check if peer was setup in veth_get_stats64() As pointed out by Ben Hutchings, priv->peer is missing proper synchronization. Adding RCU protection is a safe and well documented way to make sure we don't access about to be freed or already freed data. Reported-by: Tom Parkin <tparkin@katalix.com> Signed-off-by: Eric Dumazet <edumazet@google.com> CC: Ben Hutchings <bhutchings@solarflare.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-01-04 15:42:40 +00:00
struct net_device *rcv;
int length = skb->len;
bool use_napi = false;
int rxq;
veth: avoid a NULL deref in veth_stats_one commit 2681128f0ced8a (veth: extend device features) added a NULL deref in veth_stats_one(), as veth_get_stats64() was not testing if the peer device was setup or not. At init time, we call dev_get_stats() before veth pair is fully setup. [ 178.854758] [<ffffffffa00f5677>] veth_get_stats64+0x47/0x70 [veth] [ 178.861013] [<ffffffff814f0a2d>] dev_get_stats+0x6d/0x130 [ 178.866486] [<ffffffff81504efc>] rtnl_fill_ifinfo+0x47c/0x930 [ 178.872299] [<ffffffff81505b93>] rtmsg_ifinfo+0x83/0x100 [ 178.877678] [<ffffffff81505cc6>] rtnl_configure_link+0x76/0xa0 [ 178.883580] [<ffffffffa00f52fa>] veth_newlink+0x16a/0x350 [veth] [ 178.889654] [<ffffffff815061cc>] rtnl_newlink+0x4dc/0x5e0 [ 178.895128] [<ffffffff81505e1e>] ? rtnl_newlink+0x12e/0x5e0 [ 178.900769] [<ffffffff8150587d>] rtnetlink_rcv_msg+0x11d/0x310 [ 178.906669] [<ffffffff81505760>] ? __rtnl_unlock+0x20/0x20 [ 178.912225] [<ffffffff81521f89>] netlink_rcv_skb+0xa9/0xd0 [ 178.917779] [<ffffffff81502d55>] rtnetlink_rcv+0x25/0x40 [ 178.923159] [<ffffffff815218d1>] netlink_unicast+0x1b1/0x230 [ 178.928887] [<ffffffff81521c4e>] netlink_sendmsg+0x2fe/0x3b0 [ 178.934615] [<ffffffff814dbe22>] sock_sendmsg+0xd2/0xf0 So we must check if peer was setup in veth_get_stats64() As pointed out by Ben Hutchings, priv->peer is missing proper synchronization. Adding RCU protection is a safe and well documented way to make sure we don't access about to be freed or already freed data. Reported-by: Tom Parkin <tparkin@katalix.com> Signed-off-by: Eric Dumazet <edumazet@google.com> CC: Ben Hutchings <bhutchings@solarflare.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-01-04 15:42:40 +00:00
rcu_read_lock();
rcv = rcu_dereference(priv->peer);
2022-04-06 14:18:54 +00:00
if (unlikely(!rcv) || !pskb_may_pull(skb, ETH_HLEN)) {
veth: avoid a NULL deref in veth_stats_one commit 2681128f0ced8a (veth: extend device features) added a NULL deref in veth_stats_one(), as veth_get_stats64() was not testing if the peer device was setup or not. At init time, we call dev_get_stats() before veth pair is fully setup. [ 178.854758] [<ffffffffa00f5677>] veth_get_stats64+0x47/0x70 [veth] [ 178.861013] [<ffffffff814f0a2d>] dev_get_stats+0x6d/0x130 [ 178.866486] [<ffffffff81504efc>] rtnl_fill_ifinfo+0x47c/0x930 [ 178.872299] [<ffffffff81505b93>] rtmsg_ifinfo+0x83/0x100 [ 178.877678] [<ffffffff81505cc6>] rtnl_configure_link+0x76/0xa0 [ 178.883580] [<ffffffffa00f52fa>] veth_newlink+0x16a/0x350 [veth] [ 178.889654] [<ffffffff815061cc>] rtnl_newlink+0x4dc/0x5e0 [ 178.895128] [<ffffffff81505e1e>] ? rtnl_newlink+0x12e/0x5e0 [ 178.900769] [<ffffffff8150587d>] rtnetlink_rcv_msg+0x11d/0x310 [ 178.906669] [<ffffffff81505760>] ? __rtnl_unlock+0x20/0x20 [ 178.912225] [<ffffffff81521f89>] netlink_rcv_skb+0xa9/0xd0 [ 178.917779] [<ffffffff81502d55>] rtnetlink_rcv+0x25/0x40 [ 178.923159] [<ffffffff815218d1>] netlink_unicast+0x1b1/0x230 [ 178.928887] [<ffffffff81521c4e>] netlink_sendmsg+0x2fe/0x3b0 [ 178.934615] [<ffffffff814dbe22>] sock_sendmsg+0xd2/0xf0 So we must check if peer was setup in veth_get_stats64() As pointed out by Ben Hutchings, priv->peer is missing proper synchronization. Adding RCU protection is a safe and well documented way to make sure we don't access about to be freed or already freed data. Reported-by: Tom Parkin <tparkin@katalix.com> Signed-off-by: Eric Dumazet <edumazet@google.com> CC: Ben Hutchings <bhutchings@solarflare.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-01-04 15:42:40 +00:00
kfree_skb(skb);
goto drop;
}
rcv_priv = netdev_priv(rcv);
rxq = skb_get_queue_mapping(skb);
if (rxq < rcv->real_num_rx_queues) {
rq = &rcv_priv->rq[rxq];
/* The napi pointer is available when an XDP program is
* attached or when GRO is enabled
* Don't bother with napi/GRO if the skb can't be aggregated
*/
use_napi = rcu_access_pointer(rq->napi) &&
veth_skb_is_eligible_for_gro(dev, rcv, skb);
}
skb_tx_timestamp(skb);
if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) {
if (!use_napi)
2023-11-14 00:42:16 +00:00
dev_sw_netstats_tx_add(dev, 1, length);
else
__veth_xdp_flush(rq);
} else {
veth: avoid a NULL deref in veth_stats_one commit 2681128f0ced8a (veth: extend device features) added a NULL deref in veth_stats_one(), as veth_get_stats64() was not testing if the peer device was setup or not. At init time, we call dev_get_stats() before veth pair is fully setup. [ 178.854758] [<ffffffffa00f5677>] veth_get_stats64+0x47/0x70 [veth] [ 178.861013] [<ffffffff814f0a2d>] dev_get_stats+0x6d/0x130 [ 178.866486] [<ffffffff81504efc>] rtnl_fill_ifinfo+0x47c/0x930 [ 178.872299] [<ffffffff81505b93>] rtmsg_ifinfo+0x83/0x100 [ 178.877678] [<ffffffff81505cc6>] rtnl_configure_link+0x76/0xa0 [ 178.883580] [<ffffffffa00f52fa>] veth_newlink+0x16a/0x350 [veth] [ 178.889654] [<ffffffff815061cc>] rtnl_newlink+0x4dc/0x5e0 [ 178.895128] [<ffffffff81505e1e>] ? rtnl_newlink+0x12e/0x5e0 [ 178.900769] [<ffffffff8150587d>] rtnetlink_rcv_msg+0x11d/0x310 [ 178.906669] [<ffffffff81505760>] ? __rtnl_unlock+0x20/0x20 [ 178.912225] [<ffffffff81521f89>] netlink_rcv_skb+0xa9/0xd0 [ 178.917779] [<ffffffff81502d55>] rtnetlink_rcv+0x25/0x40 [ 178.923159] [<ffffffff815218d1>] netlink_unicast+0x1b1/0x230 [ 178.928887] [<ffffffff81521c4e>] netlink_sendmsg+0x2fe/0x3b0 [ 178.934615] [<ffffffff814dbe22>] sock_sendmsg+0xd2/0xf0 So we must check if peer was setup in veth_get_stats64() As pointed out by Ben Hutchings, priv->peer is missing proper synchronization. Adding RCU protection is a safe and well documented way to make sure we don't access about to be freed or already freed data. Reported-by: Tom Parkin <tparkin@katalix.com> Signed-off-by: Eric Dumazet <edumazet@google.com> CC: Ben Hutchings <bhutchings@solarflare.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-01-04 15:42:40 +00:00
drop:
atomic64_inc(&priv->dropped);
ret = NET_XMIT_DROP;
}
veth: avoid a NULL deref in veth_stats_one commit 2681128f0ced8a (veth: extend device features) added a NULL deref in veth_stats_one(), as veth_get_stats64() was not testing if the peer device was setup or not. At init time, we call dev_get_stats() before veth pair is fully setup. [ 178.854758] [<ffffffffa00f5677>] veth_get_stats64+0x47/0x70 [veth] [ 178.861013] [<ffffffff814f0a2d>] dev_get_stats+0x6d/0x130 [ 178.866486] [<ffffffff81504efc>] rtnl_fill_ifinfo+0x47c/0x930 [ 178.872299] [<ffffffff81505b93>] rtmsg_ifinfo+0x83/0x100 [ 178.877678] [<ffffffff81505cc6>] rtnl_configure_link+0x76/0xa0 [ 178.883580] [<ffffffffa00f52fa>] veth_newlink+0x16a/0x350 [veth] [ 178.889654] [<ffffffff815061cc>] rtnl_newlink+0x4dc/0x5e0 [ 178.895128] [<ffffffff81505e1e>] ? rtnl_newlink+0x12e/0x5e0 [ 178.900769] [<ffffffff8150587d>] rtnetlink_rcv_msg+0x11d/0x310 [ 178.906669] [<ffffffff81505760>] ? __rtnl_unlock+0x20/0x20 [ 178.912225] [<ffffffff81521f89>] netlink_rcv_skb+0xa9/0xd0 [ 178.917779] [<ffffffff81502d55>] rtnetlink_rcv+0x25/0x40 [ 178.923159] [<ffffffff815218d1>] netlink_unicast+0x1b1/0x230 [ 178.928887] [<ffffffff81521c4e>] netlink_sendmsg+0x2fe/0x3b0 [ 178.934615] [<ffffffff814dbe22>] sock_sendmsg+0xd2/0xf0 So we must check if peer was setup in veth_get_stats64() As pointed out by Ben Hutchings, priv->peer is missing proper synchronization. Adding RCU protection is a safe and well documented way to make sure we don't access about to be freed or already freed data. Reported-by: Tom Parkin <tparkin@katalix.com> Signed-off-by: Eric Dumazet <edumazet@google.com> CC: Ben Hutchings <bhutchings@solarflare.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-01-04 15:42:40 +00:00
rcu_read_unlock();
return ret;
}
static void veth_stats_rx(struct veth_stats *result, struct net_device *dev)
{
struct veth_priv *priv = netdev_priv(dev);
int i;
result->peer_tq_xdp_xmit_err = 0;
result->xdp_packets = 0;
result->xdp_tx_err = 0;
result->xdp_bytes = 0;
result->rx_drops = 0;
for (i = 0; i < dev->num_rx_queues; i++) {
u64 packets, bytes, drops, xdp_tx_err, peer_tq_xdp_xmit_err;
struct veth_rq_stats *stats = &priv->rq[i].stats;
unsigned int start;
do {
start = u64_stats_fetch_begin(&stats->syncp);
peer_tq_xdp_xmit_err = stats->vs.peer_tq_xdp_xmit_err;
xdp_tx_err = stats->vs.xdp_tx_err;
packets = stats->vs.xdp_packets;
bytes = stats->vs.xdp_bytes;
drops = stats->vs.rx_drops;
} while (u64_stats_fetch_retry(&stats->syncp, start));
result->peer_tq_xdp_xmit_err += peer_tq_xdp_xmit_err;
result->xdp_tx_err += xdp_tx_err;
result->xdp_packets += packets;
result->xdp_bytes += bytes;
result->rx_drops += drops;
}
}
static void veth_get_stats64(struct net_device *dev,
struct rtnl_link_stats64 *tot)
{
struct veth_priv *priv = netdev_priv(dev);
veth: avoid a NULL deref in veth_stats_one commit 2681128f0ced8a (veth: extend device features) added a NULL deref in veth_stats_one(), as veth_get_stats64() was not testing if the peer device was setup or not. At init time, we call dev_get_stats() before veth pair is fully setup. [ 178.854758] [<ffffffffa00f5677>] veth_get_stats64+0x47/0x70 [veth] [ 178.861013] [<ffffffff814f0a2d>] dev_get_stats+0x6d/0x130 [ 178.866486] [<ffffffff81504efc>] rtnl_fill_ifinfo+0x47c/0x930 [ 178.872299] [<ffffffff81505b93>] rtmsg_ifinfo+0x83/0x100 [ 178.877678] [<ffffffff81505cc6>] rtnl_configure_link+0x76/0xa0 [ 178.883580] [<ffffffffa00f52fa>] veth_newlink+0x16a/0x350 [veth] [ 178.889654] [<ffffffff815061cc>] rtnl_newlink+0x4dc/0x5e0 [ 178.895128] [<ffffffff81505e1e>] ? rtnl_newlink+0x12e/0x5e0 [ 178.900769] [<ffffffff8150587d>] rtnetlink_rcv_msg+0x11d/0x310 [ 178.906669] [<ffffffff81505760>] ? __rtnl_unlock+0x20/0x20 [ 178.912225] [<ffffffff81521f89>] netlink_rcv_skb+0xa9/0xd0 [ 178.917779] [<ffffffff81502d55>] rtnetlink_rcv+0x25/0x40 [ 178.923159] [<ffffffff815218d1>] netlink_unicast+0x1b1/0x230 [ 178.928887] [<ffffffff81521c4e>] netlink_sendmsg+0x2fe/0x3b0 [ 178.934615] [<ffffffff814dbe22>] sock_sendmsg+0xd2/0xf0 So we must check if peer was setup in veth_get_stats64() As pointed out by Ben Hutchings, priv->peer is missing proper synchronization. Adding RCU protection is a safe and well documented way to make sure we don't access about to be freed or already freed data. Reported-by: Tom Parkin <tparkin@katalix.com> Signed-off-by: Eric Dumazet <edumazet@google.com> CC: Ben Hutchings <bhutchings@solarflare.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-01-04 15:42:40 +00:00
struct net_device *peer;
struct veth_stats rx;
2023-11-14 00:42:16 +00:00
tot->tx_dropped = atomic64_read(&priv->dropped);
dev_fetch_sw_netstats(tot, dev->tstats);
veth_stats_rx(&rx, dev);
tot->tx_dropped += rx.xdp_tx_err;
tot->rx_dropped = rx.rx_drops + rx.peer_tq_xdp_xmit_err;
2023-11-14 00:42:16 +00:00
tot->rx_bytes += rx.xdp_bytes;
tot->rx_packets += rx.xdp_packets;
veth: avoid a NULL deref in veth_stats_one commit 2681128f0ced8a (veth: extend device features) added a NULL deref in veth_stats_one(), as veth_get_stats64() was not testing if the peer device was setup or not. At init time, we call dev_get_stats() before veth pair is fully setup. [ 178.854758] [<ffffffffa00f5677>] veth_get_stats64+0x47/0x70 [veth] [ 178.861013] [<ffffffff814f0a2d>] dev_get_stats+0x6d/0x130 [ 178.866486] [<ffffffff81504efc>] rtnl_fill_ifinfo+0x47c/0x930 [ 178.872299] [<ffffffff81505b93>] rtmsg_ifinfo+0x83/0x100 [ 178.877678] [<ffffffff81505cc6>] rtnl_configure_link+0x76/0xa0 [ 178.883580] [<ffffffffa00f52fa>] veth_newlink+0x16a/0x350 [veth] [ 178.889654] [<ffffffff815061cc>] rtnl_newlink+0x4dc/0x5e0 [ 178.895128] [<ffffffff81505e1e>] ? rtnl_newlink+0x12e/0x5e0 [ 178.900769] [<ffffffff8150587d>] rtnetlink_rcv_msg+0x11d/0x310 [ 178.906669] [<ffffffff81505760>] ? __rtnl_unlock+0x20/0x20 [ 178.912225] [<ffffffff81521f89>] netlink_rcv_skb+0xa9/0xd0 [ 178.917779] [<ffffffff81502d55>] rtnetlink_rcv+0x25/0x40 [ 178.923159] [<ffffffff815218d1>] netlink_unicast+0x1b1/0x230 [ 178.928887] [<ffffffff81521c4e>] netlink_sendmsg+0x2fe/0x3b0 [ 178.934615] [<ffffffff814dbe22>] sock_sendmsg+0xd2/0xf0 So we must check if peer was setup in veth_get_stats64() As pointed out by Ben Hutchings, priv->peer is missing proper synchronization. Adding RCU protection is a safe and well documented way to make sure we don't access about to be freed or already freed data. Reported-by: Tom Parkin <tparkin@katalix.com> Signed-off-by: Eric Dumazet <edumazet@google.com> CC: Ben Hutchings <bhutchings@solarflare.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-01-04 15:42:40 +00:00
rcu_read_lock();
peer = rcu_dereference(priv->peer);
if (peer) {
2023-11-14 00:42:16 +00:00
struct rtnl_link_stats64 tot_peer = {};
dev_fetch_sw_netstats(&tot_peer, peer->tstats);
tot->rx_bytes += tot_peer.tx_bytes;
tot->rx_packets += tot_peer.tx_packets;
veth_stats_rx(&rx, peer);
tot->tx_dropped += rx.peer_tq_xdp_xmit_err;
tot->rx_dropped += rx.xdp_tx_err;
tot->tx_bytes += rx.xdp_bytes;
tot->tx_packets += rx.xdp_packets;
veth: avoid a NULL deref in veth_stats_one commit 2681128f0ced8a (veth: extend device features) added a NULL deref in veth_stats_one(), as veth_get_stats64() was not testing if the peer device was setup or not. At init time, we call dev_get_stats() before veth pair is fully setup. [ 178.854758] [<ffffffffa00f5677>] veth_get_stats64+0x47/0x70 [veth] [ 178.861013] [<ffffffff814f0a2d>] dev_get_stats+0x6d/0x130 [ 178.866486] [<ffffffff81504efc>] rtnl_fill_ifinfo+0x47c/0x930 [ 178.872299] [<ffffffff81505b93>] rtmsg_ifinfo+0x83/0x100 [ 178.877678] [<ffffffff81505cc6>] rtnl_configure_link+0x76/0xa0 [ 178.883580] [<ffffffffa00f52fa>] veth_newlink+0x16a/0x350 [veth] [ 178.889654] [<ffffffff815061cc>] rtnl_newlink+0x4dc/0x5e0 [ 178.895128] [<ffffffff81505e1e>] ? rtnl_newlink+0x12e/0x5e0 [ 178.900769] [<ffffffff8150587d>] rtnetlink_rcv_msg+0x11d/0x310 [ 178.906669] [<ffffffff81505760>] ? __rtnl_unlock+0x20/0x20 [ 178.912225] [<ffffffff81521f89>] netlink_rcv_skb+0xa9/0xd0 [ 178.917779] [<ffffffff81502d55>] rtnetlink_rcv+0x25/0x40 [ 178.923159] [<ffffffff815218d1>] netlink_unicast+0x1b1/0x230 [ 178.928887] [<ffffffff81521c4e>] netlink_sendmsg+0x2fe/0x3b0 [ 178.934615] [<ffffffff814dbe22>] sock_sendmsg+0xd2/0xf0 So we must check if peer was setup in veth_get_stats64() As pointed out by Ben Hutchings, priv->peer is missing proper synchronization. Adding RCU protection is a safe and well documented way to make sure we don't access about to be freed or already freed data. Reported-by: Tom Parkin <tparkin@katalix.com> Signed-off-by: Eric Dumazet <edumazet@google.com> CC: Ben Hutchings <bhutchings@solarflare.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-01-04 15:42:40 +00:00
}
rcu_read_unlock();
}
/* fake multicast ability */
static void veth_set_multicast_list(struct net_device *dev)
{
}
static int veth_select_rxq(struct net_device *dev)
{
return smp_processor_id() % dev->real_num_rx_queues;
}
bpf: Add redirect_peer helper Add an efficient ingress to ingress netns switch that can be used out of tc BPF programs in order to redirect traffic from host ns ingress into a container veth device ingress without having to go via CPU backlog queue [0]. For local containers this can also be utilized and path via CPU backlog queue only needs to be taken once, not twice. On a high level this borrows from ipvlan which does similar switch in __netif_receive_skb_core() and then iterates via another_round. This helps to reduce latency for mentioned use cases. Pod to remote pod with redirect(), TCP_RR [1]: # percpu_netperf 10.217.1.33 RT_LATENCY: 122.450 (per CPU: 122.666 122.401 122.333 122.401 ) MEAN_LATENCY: 121.210 (per CPU: 121.100 121.260 121.320 121.160 ) STDDEV_LATENCY: 120.040 (per CPU: 119.420 119.910 125.460 115.370 ) MIN_LATENCY: 46.500 (per CPU: 47.000 47.000 47.000 45.000 ) P50_LATENCY: 118.500 (per CPU: 118.000 119.000 118.000 119.000 ) P90_LATENCY: 127.500 (per CPU: 127.000 128.000 127.000 128.000 ) P99_LATENCY: 130.750 (per CPU: 131.000 131.000 129.000 132.000 ) TRANSACTION_RATE: 32666.400 (per CPU: 8152.200 8169.842 8174.439 8169.897 ) Pod to remote pod with redirect_peer(), TCP_RR: # percpu_netperf 10.217.1.33 RT_LATENCY: 44.449 (per CPU: 43.767 43.127 45.279 45.622 ) MEAN_LATENCY: 45.065 (per CPU: 44.030 45.530 45.190 45.510 ) STDDEV_LATENCY: 84.823 (per CPU: 66.770 97.290 84.380 90.850 ) MIN_LATENCY: 33.500 (per CPU: 33.000 33.000 34.000 34.000 ) P50_LATENCY: 43.250 (per CPU: 43.000 43.000 43.000 44.000 ) P90_LATENCY: 46.750 (per CPU: 46.000 47.000 47.000 47.000 ) P99_LATENCY: 52.750 (per CPU: 51.000 54.000 53.000 53.000 ) TRANSACTION_RATE: 90039.500 (per CPU: 22848.186 23187.089 22085.077 21919.130 ) [0] https://linuxplumbersconf.org/event/7/contributions/674/attachments/568/1002/plumbers_2020_cilium_load_balancer.pdf [1] https://github.com/borkmann/netperf_scripts/blob/master/percpu_netperf Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Signed-off-by: Alexei Starovoitov <ast@kernel.org> Link: https://lore.kernel.org/bpf/20201010234006.7075-3-daniel@iogearbox.net
2020-10-10 23:40:02 +00:00
static struct net_device *veth_peer_dev(struct net_device *dev)
{
struct veth_priv *priv = netdev_priv(dev);
/* Callers must be under RCU read side. */
return rcu_dereference(priv->peer);
}
static int veth_xdp_xmit(struct net_device *dev, int n,
struct xdp_frame **frames,
u32 flags, bool ndo_xmit)
{
struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
int i, ret = -ENXIO, nxmit = 0;
struct net_device *rcv;
unsigned int max_len;
struct veth_rq *rq;
if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
return -EINVAL;
rcu_read_lock();
rcv = rcu_dereference(priv->peer);
if (unlikely(!rcv))
goto out;
rcv_priv = netdev_priv(rcv);
rq = &rcv_priv->rq[veth_select_rxq(rcv)];
/* The napi pointer is set if NAPI is enabled, which ensures that
* xdp_ring is initialized on receive side and the peer device is up.
*/
if (!rcu_access_pointer(rq->napi))
goto out;
max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN;
spin_lock(&rq->xdp_ring.producer_lock);
for (i = 0; i < n; i++) {
struct xdp_frame *frame = frames[i];
void *ptr = veth_xdp_to_ptr(frame);
if (unlikely(xdp_get_frame_len(frame) > max_len ||
__ptr_ring_produce(&rq->xdp_ring, ptr)))
break;
nxmit++;
}
spin_unlock(&rq->xdp_ring.producer_lock);
if (flags & XDP_XMIT_FLUSH)
__veth_xdp_flush(rq);
ret = nxmit;
if (ndo_xmit) {
u64_stats_update_begin(&rq->stats.syncp);
rq->stats.vs.peer_tq_xdp_xmit += nxmit;
rq->stats.vs.peer_tq_xdp_xmit_err += n - nxmit;
u64_stats_update_end(&rq->stats.syncp);
}
out:
bpf, xdp: Remove no longer required rcu_read_{un}lock() Now that we depend on rcu_call() and synchronize_rcu() to also wait for preempt_disabled region to complete the rcu read critical section in __dev_map_flush() is no longer required. Except in a few special cases in drivers that need it for other reasons. These originally ensured the map reference was safe while a map was also being free'd. And additionally that bpf program updates via ndo_bpf did not happen while flush updates were in flight. But flush by new rules can only be called from preempt-disabled NAPI context. The synchronize_rcu from the map free path and the rcu_call from the delete path will ensure the reference there is safe. So lets remove the rcu_read_lock and rcu_read_unlock pair to avoid any confusion around how this is being protected. If the rcu_read_lock was required it would mean errors in the above logic and the original patch would also be wrong. Now that we have done above we put the rcu_read_lock in the driver code where it is needed in a driver dependent way. I think this helps readability of the code so we know where and why we are taking read locks. Most drivers will not need rcu_read_locks here and further XDP drivers already have rcu_read_locks in their code paths for reading xdp programs on RX side so this makes it symmetric where we don't have half of rcu critical sections define in driver and the other half in devmap. Signed-off-by: John Fastabend <john.fastabend@gmail.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Jesper Dangaard Brouer <brouer@redhat.com> Link: https://lore.kernel.org/bpf/1580084042-11598-4-git-send-email-john.fastabend@gmail.com
2020-01-27 00:14:02 +00:00
rcu_read_unlock();
return ret;
}
static int veth_ndo_xdp_xmit(struct net_device *dev, int n,
struct xdp_frame **frames, u32 flags)
{
int err;
err = veth_xdp_xmit(dev, n, frames, flags, true);
if (err < 0) {
struct veth_priv *priv = netdev_priv(dev);
atomic64_add(n, &priv->dropped);
}
return err;
}
static void veth_xdp_flush_bq(struct veth_rq *rq, struct veth_xdp_tx_bq *bq)
{
int sent, i, err = 0, drops;
sent = veth_xdp_xmit(rq->dev, bq->count, bq->q, 0, false);
if (sent < 0) {
err = sent;
sent = 0;
}
for (i = sent; unlikely(i < bq->count); i++)
xdp_return_frame(bq->q[i]);
drops = bq->count - sent;
trace_xdp_bulk_tx(rq->dev, sent, drops, err);
u64_stats_update_begin(&rq->stats.syncp);
rq->stats.vs.xdp_tx += sent;
rq->stats.vs.xdp_tx_err += drops;
u64_stats_update_end(&rq->stats.syncp);
bq->count = 0;
}
static void veth_xdp_flush(struct veth_rq *rq, struct veth_xdp_tx_bq *bq)
{
struct veth_priv *rcv_priv, *priv = netdev_priv(rq->dev);
struct net_device *rcv;
struct veth_rq *rcv_rq;
rcu_read_lock();
veth_xdp_flush_bq(rq, bq);
rcv = rcu_dereference(priv->peer);
if (unlikely(!rcv))
goto out;
rcv_priv = netdev_priv(rcv);
rcv_rq = &rcv_priv->rq[veth_select_rxq(rcv)];
/* xdp_ring is initialized on receive side? */
if (unlikely(!rcu_access_pointer(rcv_rq->xdp_prog)))
goto out;
__veth_xdp_flush(rcv_rq);
out:
rcu_read_unlock();
}
static int veth_xdp_tx(struct veth_rq *rq, struct xdp_buff *xdp,
struct veth_xdp_tx_bq *bq)
{
struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp);
if (unlikely(!frame))
return -EOVERFLOW;
if (unlikely(bq->count == VETH_XDP_TX_BULK_SIZE))
veth_xdp_flush_bq(rq, bq);
bq->q[bq->count++] = frame;
return 0;
}
static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq,
struct xdp_frame *frame,
struct veth_xdp_tx_bq *bq,
struct veth_stats *stats)
{
struct xdp_frame orig_frame;
struct bpf_prog *xdp_prog;
rcu_read_lock();
xdp_prog = rcu_dereference(rq->xdp_prog);
if (likely(xdp_prog)) {
struct veth_xdp_buff vxbuf;
struct xdp_buff *xdp = &vxbuf.xdp;
u32 act;
xdp_convert_frame_to_buff(frame, xdp);
xdp->rxq = &rq->xdp_rxq;
vxbuf.skb = NULL;
act = bpf_prog_run_xdp(xdp_prog, xdp);
switch (act) {
case XDP_PASS:
if (xdp_update_frame_from_buff(xdp, frame))
goto err_xdp;
break;
case XDP_TX:
orig_frame = *frame;
xdp->rxq->mem = frame->mem;
if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) {
trace_xdp_exception(rq->dev, xdp_prog, act);
frame = &orig_frame;
stats->rx_drops++;
goto err_xdp;
}
stats->xdp_tx++;
rcu_read_unlock();
goto xdp_xmit;
case XDP_REDIRECT:
orig_frame = *frame;
xdp->rxq->mem = frame->mem;
if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) {
frame = &orig_frame;
stats->rx_drops++;
goto err_xdp;
}
stats->xdp_redirect++;
rcu_read_unlock();
goto xdp_xmit;
default:
bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act);
fallthrough;
case XDP_ABORTED:
trace_xdp_exception(rq->dev, xdp_prog, act);
fallthrough;
case XDP_DROP:
stats->xdp_drops++;
goto err_xdp;
}
}
rcu_read_unlock();
return frame;
err_xdp:
rcu_read_unlock();
xdp_return_frame(frame);
xdp_xmit:
return NULL;
}
/* frames array contains VETH_XDP_BATCH at most */
static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames,
int n_xdpf, struct veth_xdp_tx_bq *bq,
struct veth_stats *stats)
{
void *skbs[VETH_XDP_BATCH];
int i;
if (xdp_alloc_skb_bulk(skbs, n_xdpf,
GFP_ATOMIC | __GFP_ZERO) < 0) {
for (i = 0; i < n_xdpf; i++)
xdp_return_frame(frames[i]);
stats->rx_drops += n_xdpf;
return;
}
for (i = 0; i < n_xdpf; i++) {
struct sk_buff *skb = skbs[i];
skb = __xdp_build_skb_from_frame(frames[i], skb,
rq->dev);
if (!skb) {
xdp_return_frame(frames[i]);
stats->rx_drops++;
continue;
}
napi_gro_receive(&rq->xdp_napi, skb);
}
}
static void veth_xdp_get(struct xdp_buff *xdp)
{
struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
int i;
get_page(virt_to_page(xdp->data));
if (likely(!xdp_buff_has_frags(xdp)))
return;
veth: Orphan skb before GRO GRO expects skbs not to be owned by sockets, but when XDP is enabled veth passed skbs owned by sockets. It caused corrupted sk_wmem_alloc. Paolo Abeni reported the following splat: [ 362.098904] refcount_t overflow at skb_set_owner_w+0x5e/0xa0 in iperf3[1644], uid/euid: 0/0 [ 362.108239] WARNING: CPU: 0 PID: 1644 at kernel/panic.c:648 refcount_error_report+0xa0/0xa4 [ 362.117547] Modules linked in: tcp_diag inet_diag veth intel_rapl sb_edac x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel intel_cstate intel_uncore intel_rapl_perf ipmi_ssif iTCO_wdt sg ipmi_si iTCO_vendor_support ipmi_devintf mxm_wmi ipmi_msghandler pcspkr dcdbas mei_me wmi mei lpc_ich acpi_power_meter pcc_cpufreq xfs libcrc32c sd_mod mgag200 drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ixgbe igb ttm ahci mdio libahci ptp crc32c_intel drm pps_core libata i2c_algo_bit dca dm_mirror dm_region_hash dm_log dm_mod [ 362.176622] CPU: 0 PID: 1644 Comm: iperf3 Not tainted 4.19.0-rc2.vanilla+ #2025 [ 362.184777] Hardware name: Dell Inc. PowerEdge R730/072T6D, BIOS 2.1.7 06/16/2016 [ 362.193124] RIP: 0010:refcount_error_report+0xa0/0xa4 [ 362.198758] Code: 08 00 00 48 8b 95 80 00 00 00 49 8d 8c 24 80 0a 00 00 41 89 c1 44 89 2c 24 48 89 de 48 c7 c7 18 4d e7 9d 31 c0 e8 30 fa ff ff <0f> 0b eb 88 0f 1f 44 00 00 55 48 89 e5 41 56 41 55 41 54 49 89 fc [ 362.219711] RSP: 0018:ffff9ee6ff603c20 EFLAGS: 00010282 [ 362.225538] RAX: 0000000000000000 RBX: ffffffff9de83e10 RCX: 0000000000000000 [ 362.233497] RDX: 0000000000000001 RSI: ffff9ee6ff6167d8 RDI: ffff9ee6ff6167d8 [ 362.241457] RBP: ffff9ee6ff603d78 R08: 0000000000000490 R09: 0000000000000004 [ 362.249416] R10: 0000000000000000 R11: ffff9ee6ff603990 R12: ffff9ee664b94500 [ 362.257377] R13: 0000000000000000 R14: 0000000000000004 R15: ffffffff9de615f9 [ 362.265337] FS: 00007f1d22d28740(0000) GS:ffff9ee6ff600000(0000) knlGS:0000000000000000 [ 362.274363] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 362.280773] CR2: 00007f1d222f35d0 CR3: 0000001fddfec003 CR4: 00000000001606f0 [ 362.288733] Call Trace: [ 362.291459] <IRQ> [ 362.293702] ex_handler_refcount+0x4e/0x80 [ 362.298269] fixup_exception+0x35/0x40 [ 362.302451] do_trap+0x109/0x150 [ 362.306048] do_error_trap+0xd5/0x130 [ 362.315766] invalid_op+0x14/0x20 [ 362.319460] RIP: 0010:skb_set_owner_w+0x5e/0xa0 [ 362.324512] Code: ef ff ff 74 49 48 c7 43 60 20 7b 4a 9d 8b 85 f4 01 00 00 85 c0 75 16 8b 83 e0 00 00 00 f0 01 85 44 01 00 00 0f 88 d8 23 16 00 <5b> 5d c3 80 8b 91 00 00 00 01 8b 85 f4 01 00 00 89 83 a4 00 00 00 [ 362.345465] RSP: 0018:ffff9ee6ff603e20 EFLAGS: 00010a86 [ 362.351291] RAX: 0000000000001100 RBX: ffff9ee65deec700 RCX: ffff9ee65e829244 [ 362.359250] RDX: 0000000000000100 RSI: ffff9ee65e829100 RDI: ffff9ee65deec700 [ 362.367210] RBP: ffff9ee65e829100 R08: 000000000002a380 R09: 0000000000000000 [ 362.375169] R10: 0000000000000002 R11: fffff1a4bf77bb00 R12: ffffc0754661d000 [ 362.383130] R13: ffff9ee65deec200 R14: ffff9ee65f597000 R15: 00000000000000aa [ 362.391092] veth_xdp_rcv+0x4e4/0x890 [veth] [ 362.399357] veth_poll+0x4d/0x17a [veth] [ 362.403731] net_rx_action+0x2af/0x3f0 [ 362.407912] __do_softirq+0xdd/0x29e [ 362.411897] do_softirq_own_stack+0x2a/0x40 [ 362.416561] </IRQ> [ 362.418899] do_softirq+0x4b/0x70 [ 362.422594] __local_bh_enable_ip+0x50/0x60 [ 362.427258] ip_finish_output2+0x16a/0x390 [ 362.431824] ip_output+0x71/0xe0 [ 362.440670] __tcp_transmit_skb+0x583/0xab0 [ 362.445333] tcp_write_xmit+0x247/0xfb0 [ 362.449609] __tcp_push_pending_frames+0x2d/0xd0 [ 362.454760] tcp_sendmsg_locked+0x857/0xd30 [ 362.459424] tcp_sendmsg+0x27/0x40 [ 362.463216] sock_sendmsg+0x36/0x50 [ 362.467104] sock_write_iter+0x87/0x100 [ 362.471382] __vfs_write+0x112/0x1a0 [ 362.475369] vfs_write+0xad/0x1a0 [ 362.479062] ksys_write+0x52/0xc0 [ 362.482759] do_syscall_64+0x5b/0x180 [ 362.486841] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 362.492473] RIP: 0033:0x7f1d22293238 [ 362.496458] Code: 89 02 48 c7 c0 ff ff ff ff eb b3 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 8d 05 c5 54 2d 00 8b 00 85 c0 75 17 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 58 c3 0f 1f 80 00 00 00 00 41 54 49 89 d4 55 [ 362.517409] RSP: 002b:00007ffebaef8008 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 [ 362.525855] RAX: ffffffffffffffda RBX: 0000000000002800 RCX: 00007f1d22293238 [ 362.533816] RDX: 0000000000002800 RSI: 00007f1d22d36000 RDI: 0000000000000005 [ 362.541775] RBP: 00007f1d22d36000 R08: 00000002db777a30 R09: 0000562b70712b20 [ 362.549734] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000005 [ 362.557693] R13: 0000000000002800 R14: 00007ffebaef8060 R15: 0000562b70712260 In order to avoid this, orphan the skb before entering GRO. Fixes: 948d4f214fde ("veth: Add driver XDP") Reported-by: Paolo Abeni <pabeni@redhat.com> Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> Tested-by: Paolo Abeni <pabeni@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-09-14 04:33:44 +00:00
for (i = 0; i < sinfo->nr_frags; i++)
__skb_frag_ref(&sinfo->frags[i]);
}
static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq,
struct xdp_buff *xdp,
struct sk_buff **pskb)
{
struct sk_buff *skb = *pskb;
u32 frame_sz;
if (skb_shared(skb) || skb_head_is_locked(skb) ||
veth: Fix use after free in XDP_REDIRECT Commit 718a18a0c8a6 ("veth: Rework veth_xdp_rcv_skb in order to accept non-linear skb") introduced a bug where it tried to use pskb_expand_head() if the headroom was less than XDP_PACKET_HEADROOM. This however uses kmalloc to expand the head, which will later allow consume_skb() to free the skb while is it still in use by AF_XDP. Previously if the headroom was less than XDP_PACKET_HEADROOM we continued on to allocate a new skb from pages so this restores that behavior. BUG: KASAN: use-after-free in __xsk_rcv+0x18d/0x2c0 Read of size 78 at addr ffff888976250154 by task napi/iconduit-g/148640 CPU: 5 PID: 148640 Comm: napi/iconduit-g Kdump: loaded Tainted: G O 6.1.4-cloudflare-kasan-2023.1.2 #1 Hardware name: Quanta Computer Inc. QuantaPlex T41S-2U/S2S-MB, BIOS S2S_3B10.03 06/21/2018 Call Trace: <TASK> dump_stack_lvl+0x34/0x48 print_report+0x170/0x473 ? __xsk_rcv+0x18d/0x2c0 kasan_report+0xad/0x130 ? __xsk_rcv+0x18d/0x2c0 kasan_check_range+0x149/0x1a0 memcpy+0x20/0x60 __xsk_rcv+0x18d/0x2c0 __xsk_map_redirect+0x1f3/0x490 ? veth_xdp_rcv_skb+0x89c/0x1ba0 [veth] xdp_do_redirect+0x5ca/0xd60 veth_xdp_rcv_skb+0x935/0x1ba0 [veth] ? __netif_receive_skb_list_core+0x671/0x920 ? veth_xdp+0x670/0x670 [veth] veth_xdp_rcv+0x304/0xa20 [veth] ? do_xdp_generic+0x150/0x150 ? veth_xdp_rcv_one+0xde0/0xde0 [veth] ? _raw_spin_lock_bh+0xe0/0xe0 ? newidle_balance+0x887/0xe30 ? __perf_event_task_sched_in+0xdb/0x800 veth_poll+0x139/0x571 [veth] ? veth_xdp_rcv+0xa20/0xa20 [veth] ? _raw_spin_unlock+0x39/0x70 ? finish_task_switch.isra.0+0x17e/0x7d0 ? __switch_to+0x5cf/0x1070 ? __schedule+0x95b/0x2640 ? io_schedule_timeout+0x160/0x160 __napi_poll+0xa1/0x440 napi_threaded_poll+0x3d1/0x460 ? __napi_poll+0x440/0x440 ? __kthread_parkme+0xc6/0x1f0 ? __napi_poll+0x440/0x440 kthread+0x2a2/0x340 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x22/0x30 </TASK> Freed by task 148640: kasan_save_stack+0x23/0x50 kasan_set_track+0x21/0x30 kasan_save_free_info+0x2a/0x40 ____kasan_slab_free+0x169/0x1d0 slab_free_freelist_hook+0xd2/0x190 __kmem_cache_free+0x1a1/0x2f0 skb_release_data+0x449/0x600 consume_skb+0x9f/0x1c0 veth_xdp_rcv_skb+0x89c/0x1ba0 [veth] veth_xdp_rcv+0x304/0xa20 [veth] veth_poll+0x139/0x571 [veth] __napi_poll+0xa1/0x440 napi_threaded_poll+0x3d1/0x460 kthread+0x2a2/0x340 ret_from_fork+0x22/0x30 The buggy address belongs to the object at ffff888976250000 which belongs to the cache kmalloc-2k of size 2048 The buggy address is located 340 bytes inside of 2048-byte region [ffff888976250000, ffff888976250800) The buggy address belongs to the physical page: page:00000000ae18262a refcount:2 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x976250 head:00000000ae18262a order:3 compound_mapcount:0 compound_pincount:0 flags: 0x2ffff800010200(slab|head|node=0|zone=2|lastcpupid=0x1ffff) raw: 002ffff800010200 0000000000000000 dead000000000122 ffff88810004cf00 raw: 0000000000000000 0000000080080008 00000002ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff888976250000: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff888976250080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb > ffff888976250100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff888976250180: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff888976250200: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Fixes: 718a18a0c8a6 ("veth: Rework veth_xdp_rcv_skb in order to accept non-linear skb") Signed-off-by: Shawn Bohrer <sbohrer@cloudflare.com> Acked-by: Lorenzo Bianconi <lorenzo@kernel.org> Acked-by: Toshiaki Makita <toshiaki.makita1@gmail.com> Acked-by: Toke Høiland-Jørgensen <toke@kernel.org> Link: https://lore.kernel.org/r/20230314153351.2201328-1-sbohrer@cloudflare.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2023-03-14 15:33:51 +00:00
skb_shinfo(skb)->nr_frags ||
skb_headroom(skb) < XDP_PACKET_HEADROOM) {
if (skb_pp_cow_data(rq->page_pool, pskb, XDP_PACKET_HEADROOM))
goto drop;
skb = *pskb;
}
/* SKB "head" area always have tailroom for skb_shared_info */
frame_sz = skb_end_pointer(skb) - skb->head;
frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
xdp_init_buff(xdp, frame_sz, &rq->xdp_rxq);
xdp_prepare_buff(xdp, skb->head, skb_headroom(skb),
skb_headlen(skb), true);
if (skb_is_nonlinear(skb)) {
skb_shinfo(skb)->xdp_frags_size = skb->data_len;
xdp_buff_set_frags_flag(xdp);
} else {
xdp_buff_clear_frags_flag(xdp);
}
*pskb = skb;
return 0;
drop:
consume_skb(skb);
*pskb = NULL;
return -ENOMEM;
}
static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
struct sk_buff *skb,
struct veth_xdp_tx_bq *bq,
struct veth_stats *stats)
{
void *orig_data, *orig_data_end;
struct bpf_prog *xdp_prog;
struct veth_xdp_buff vxbuf;
struct xdp_buff *xdp = &vxbuf.xdp;
u32 act, metalen;
int off;
skb_prepare_for_gro(skb);
rcu_read_lock();
xdp_prog = rcu_dereference(rq->xdp_prog);
if (unlikely(!xdp_prog)) {
rcu_read_unlock();
goto out;
}
__skb_push(skb, skb->data - skb_mac_header(skb));
if (veth_convert_skb_to_xdp_buff(rq, xdp, &skb))
goto drop;
vxbuf.skb = skb;
orig_data = xdp->data;
orig_data_end = xdp->data_end;
act = bpf_prog_run_xdp(xdp_prog, xdp);
switch (act) {
case XDP_PASS:
break;
case XDP_TX:
veth_xdp_get(xdp);
consume_skb(skb);
xdp->rxq->mem = rq->xdp_mem;
if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) {
trace_xdp_exception(rq->dev, xdp_prog, act);
stats->rx_drops++;
goto err_xdp;
}
stats->xdp_tx++;
rcu_read_unlock();
goto xdp_xmit;
case XDP_REDIRECT:
veth_xdp_get(xdp);
consume_skb(skb);
xdp->rxq->mem = rq->xdp_mem;
if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) {
stats->rx_drops++;
goto err_xdp;
}
stats->xdp_redirect++;
rcu_read_unlock();
goto xdp_xmit;
default:
bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act);
fallthrough;
case XDP_ABORTED:
trace_xdp_exception(rq->dev, xdp_prog, act);
fallthrough;
case XDP_DROP:
stats->xdp_drops++;
goto xdp_drop;
}
rcu_read_unlock();
/* check if bpf_xdp_adjust_head was used */
off = orig_data - xdp->data;
if (off > 0)
__skb_push(skb, off);
else if (off < 0)
__skb_pull(skb, -off);
skb_reset_mac_header(skb);
/* check if bpf_xdp_adjust_tail was used */
off = xdp->data_end - orig_data_end;
if (off != 0)
__skb_put(skb, off); /* positive on grow, negative on shrink */
/* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers
* (e.g. bpf_xdp_adjust_tail), we need to update data_len here.
*/
if (xdp_buff_has_frags(xdp))
skb->data_len = skb_shinfo(skb)->xdp_frags_size;
else
skb->data_len = 0;
skb->protocol = eth_type_trans(skb, rq->dev);
metalen = xdp->data - xdp->data_meta;
if (metalen)
skb_metadata_set(skb, metalen);
out:
return skb;
drop:
stats->rx_drops++;
xdp_drop:
rcu_read_unlock();
kfree_skb(skb);
return NULL;
err_xdp:
rcu_read_unlock();
xdp_return_buff(xdp);
xdp_xmit:
return NULL;
}
static int veth_xdp_rcv(struct veth_rq *rq, int budget,
struct veth_xdp_tx_bq *bq,
struct veth_stats *stats)
{
int i, done = 0, n_xdpf = 0;
void *xdpf[VETH_XDP_BATCH];
for (i = 0; i < budget; i++) {
void *ptr = __ptr_ring_consume(&rq->xdp_ring);
if (!ptr)
break;
if (veth_is_xdp_frame(ptr)) {
/* ndo_xdp_xmit */
struct xdp_frame *frame = veth_ptr_to_xdp(ptr);
stats->xdp_bytes += xdp_get_frame_len(frame);
frame = veth_xdp_rcv_one(rq, frame, bq, stats);
if (frame) {
/* XDP_PASS */
xdpf[n_xdpf++] = frame;
if (n_xdpf == VETH_XDP_BATCH) {
veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf,
bq, stats);
n_xdpf = 0;
}
}
} else {
/* ndo_start_xmit */
struct sk_buff *skb = ptr;
stats->xdp_bytes += skb->len;
skb = veth_xdp_rcv_skb(rq, skb, bq, stats);
veth: ensure skb entering GRO are not cloned. After commit d3256efd8e8b ("veth: allow enabling NAPI even without XDP"), if GRO is enabled on a veth device and TSO is disabled on the peer device, TCP skbs will go through the NAPI callback. If there is no XDP program attached, the veth code does not perform any share check, and shared/cloned skbs could enter the GRO engine. Ignat reported a BUG triggered later-on due to the above condition: [ 53.970529][ C1] kernel BUG at net/core/skbuff.c:3574! [ 53.981755][ C1] invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI [ 53.982634][ C1] CPU: 1 PID: 19 Comm: ksoftirqd/1 Not tainted 5.16.0-rc5+ #25 [ 53.982634][ C1] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 [ 53.982634][ C1] RIP: 0010:skb_shift+0x13ef/0x23b0 [ 53.982634][ C1] Code: ea 03 0f b6 04 02 48 89 fa 83 e2 07 38 d0 7f 08 84 c0 0f 85 41 0c 00 00 41 80 7f 02 00 4d 8d b5 d0 00 00 00 0f 85 74 f5 ff ff <0f> 0b 4d 8d 77 20 be 04 00 00 00 4c 89 44 24 78 4c 89 f7 4c 89 8c [ 53.982634][ C1] RSP: 0018:ffff8881008f7008 EFLAGS: 00010246 [ 53.982634][ C1] RAX: 0000000000000000 RBX: ffff8881180b4c80 RCX: 0000000000000000 [ 53.982634][ C1] RDX: 0000000000000002 RSI: ffff8881180b4d3c RDI: ffff88810bc9cac2 [ 53.982634][ C1] RBP: ffff8881008f70b8 R08: ffff8881180b4cf4 R09: ffff8881180b4cf0 [ 53.982634][ C1] R10: ffffed1022999e5c R11: 0000000000000002 R12: 0000000000000590 [ 53.982634][ C1] R13: ffff88810f940c80 R14: ffff88810f940d50 R15: ffff88810bc9cac0 [ 53.982634][ C1] FS: 0000000000000000(0000) GS:ffff888235880000(0000) knlGS:0000000000000000 [ 53.982634][ C1] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 53.982634][ C1] CR2: 00007ff5f9b86680 CR3: 0000000108ce8004 CR4: 0000000000170ee0 [ 53.982634][ C1] Call Trace: [ 53.982634][ C1] <TASK> [ 53.982634][ C1] tcp_sacktag_walk+0xaba/0x18e0 [ 53.982634][ C1] tcp_sacktag_write_queue+0xe7b/0x3460 [ 53.982634][ C1] tcp_ack+0x2666/0x54b0 [ 53.982634][ C1] tcp_rcv_established+0x4d9/0x20f0 [ 53.982634][ C1] tcp_v4_do_rcv+0x551/0x810 [ 53.982634][ C1] tcp_v4_rcv+0x22ed/0x2ed0 [ 53.982634][ C1] ip_protocol_deliver_rcu+0x96/0xaf0 [ 53.982634][ C1] ip_local_deliver_finish+0x1e0/0x2f0 [ 53.982634][ C1] ip_sublist_rcv_finish+0x211/0x440 [ 53.982634][ C1] ip_list_rcv_finish.constprop.0+0x424/0x660 [ 53.982634][ C1] ip_list_rcv+0x2c8/0x410 [ 53.982634][ C1] __netif_receive_skb_list_core+0x65c/0x910 [ 53.982634][ C1] netif_receive_skb_list_internal+0x5f9/0xcb0 [ 53.982634][ C1] napi_complete_done+0x188/0x6e0 [ 53.982634][ C1] gro_cell_poll+0x10c/0x1d0 [ 53.982634][ C1] __napi_poll+0xa1/0x530 [ 53.982634][ C1] net_rx_action+0x567/0x1270 [ 53.982634][ C1] __do_softirq+0x28a/0x9ba [ 53.982634][ C1] run_ksoftirqd+0x32/0x60 [ 53.982634][ C1] smpboot_thread_fn+0x559/0x8c0 [ 53.982634][ C1] kthread+0x3b9/0x490 [ 53.982634][ C1] ret_from_fork+0x22/0x30 [ 53.982634][ C1] </TASK> Address the issue by skipping the GRO stage for shared or cloned skbs. To reduce the chance of OoO, try to unclone the skbs before giving up. v1 -> v2: - use avoid skb_copy and fallback to netif_receive_skb - Eric Reported-by: Ignat Korchagin <ignat@cloudflare.com> Fixes: d3256efd8e8b ("veth: allow enabling NAPI even without XDP") Signed-off-by: Paolo Abeni <pabeni@redhat.com> Tested-by: Ignat Korchagin <ignat@cloudflare.com> Reviewed-by: Eric Dumazet <edumazet@google.com> Link: https://lore.kernel.org/r/b5f61c5602aab01bac8d711d8d1bfab0a4817db7.1640197544.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-12-22 18:39:52 +00:00
if (skb) {
if (skb_shared(skb) || skb_unclone(skb, GFP_ATOMIC))
netif_receive_skb(skb);
else
napi_gro_receive(&rq->xdp_napi, skb);
}
}
done++;
}
if (n_xdpf)
veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, bq, stats);
u64_stats_update_begin(&rq->stats.syncp);
rq->stats.vs.xdp_redirect += stats->xdp_redirect;
rq->stats.vs.xdp_bytes += stats->xdp_bytes;
rq->stats.vs.xdp_drops += stats->xdp_drops;
rq->stats.vs.rx_drops += stats->rx_drops;
rq->stats.vs.xdp_packets += done;
u64_stats_update_end(&rq->stats.syncp);
return done;
}
static int veth_poll(struct napi_struct *napi, int budget)
{
struct veth_rq *rq =
container_of(napi, struct veth_rq, xdp_napi);
struct veth_stats stats = {};
struct veth_xdp_tx_bq bq;
int done;
bq.count = 0;
xdp_set_return_frame_no_direct();
done = veth_xdp_rcv(rq, budget, &bq, &stats);
veth: Fix race with AF_XDP exposing old or uninitialized descriptors When AF_XDP is used on on a veth interface the RX ring is updated in two steps. veth_xdp_rcv() removes packet descriptors from the FILL ring fills them and places them in the RX ring updating the cached_prod pointer. Later xdp_do_flush() syncs the RX ring prod pointer with the cached_prod pointer allowing user-space to see the recently filled in descriptors. The rings are intended to be SPSC, however the existing order in veth_poll allows the xdp_do_flush() to run concurrently with another CPU creating a race condition that allows user-space to see old or uninitialized descriptors in the RX ring. This bug has been observed in production systems. To summarize, we are expecting this ordering: CPU 0 __xsk_rcv_zc() CPU 0 __xsk_map_flush() CPU 2 __xsk_rcv_zc() CPU 2 __xsk_map_flush() But we are seeing this order: CPU 0 __xsk_rcv_zc() CPU 2 __xsk_rcv_zc() CPU 0 __xsk_map_flush() CPU 2 __xsk_map_flush() This occurs because we rely on NAPI to ensure that only one napi_poll handler is running at a time for the given veth receive queue. napi_schedule_prep() will prevent multiple instances from getting scheduled. However calling napi_complete_done() signals that this napi_poll is complete and allows subsequent calls to napi_schedule_prep() and __napi_schedule() to succeed in scheduling a concurrent napi_poll before the xdp_do_flush() has been called. For the veth driver a concurrent call to napi_schedule_prep() and __napi_schedule() can occur on a different CPU because the veth xmit path can additionally schedule a napi_poll creating the race. The fix as suggested by Magnus Karlsson, is to simply move the xdp_do_flush() call before napi_complete_done(). This syncs the producer ring pointers before another instance of napi_poll can be scheduled on another CPU. It will also slightly improve performance by moving the flush closer to when the descriptors were placed in the RX ring. Fixes: d1396004dd86 ("veth: Add XDP TX and REDIRECT") Suggested-by: Magnus Karlsson <magnus.karlsson@gmail.com> Signed-off-by: Shawn Bohrer <sbohrer@cloudflare.com> Link: https://lore.kernel.org/r/20221220185903.1105011-1-sbohrer@cloudflare.com Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2022-12-20 18:59:03 +00:00
if (stats.xdp_redirect > 0)
xdp_do_flush();
if (done < budget && napi_complete_done(napi, done)) {
/* Write rx_notify_masked before reading ptr_ring */
smp_store_mb(rq->rx_notify_masked, false);
if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) {
veth: fix races around rq->rx_notify_masked veth being NETIF_F_LLTX enabled, we need to be more careful whenever we read/write rq->rx_notify_masked. BUG: KCSAN: data-race in veth_xmit / veth_xmit write to 0xffff888133d9a9f8 of 1 bytes by task 23552 on cpu 0: __veth_xdp_flush drivers/net/veth.c:269 [inline] veth_xmit+0x307/0x470 drivers/net/veth.c:350 __netdev_start_xmit include/linux/netdevice.h:4683 [inline] netdev_start_xmit include/linux/netdevice.h:4697 [inline] xmit_one+0x105/0x2f0 net/core/dev.c:3473 dev_hard_start_xmit net/core/dev.c:3489 [inline] __dev_queue_xmit+0x86d/0xf90 net/core/dev.c:4116 dev_queue_xmit+0x13/0x20 net/core/dev.c:4149 br_dev_queue_push_xmit+0x3ce/0x430 net/bridge/br_forward.c:53 NF_HOOK include/linux/netfilter.h:307 [inline] br_forward_finish net/bridge/br_forward.c:66 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] __br_forward+0x2e4/0x400 net/bridge/br_forward.c:115 br_flood+0x521/0x5c0 net/bridge/br_forward.c:242 br_dev_xmit+0x8b6/0x960 __netdev_start_xmit include/linux/netdevice.h:4683 [inline] netdev_start_xmit include/linux/netdevice.h:4697 [inline] xmit_one+0x105/0x2f0 net/core/dev.c:3473 dev_hard_start_xmit net/core/dev.c:3489 [inline] __dev_queue_xmit+0x86d/0xf90 net/core/dev.c:4116 dev_queue_xmit+0x13/0x20 net/core/dev.c:4149 neigh_hh_output include/net/neighbour.h:525 [inline] neigh_output include/net/neighbour.h:539 [inline] ip_finish_output2+0x6f8/0xb70 net/ipv4/ip_output.c:228 ip_finish_output+0xfb/0x240 net/ipv4/ip_output.c:316 NF_HOOK_COND include/linux/netfilter.h:296 [inline] ip_output+0xf3/0x1a0 net/ipv4/ip_output.c:430 dst_output include/net/dst.h:451 [inline] ip_local_out net/ipv4/ip_output.c:126 [inline] ip_send_skb+0x6e/0xe0 net/ipv4/ip_output.c:1570 udp_send_skb+0x641/0x880 net/ipv4/udp.c:967 udp_sendmsg+0x12ea/0x14c0 net/ipv4/udp.c:1254 inet_sendmsg+0x5f/0x80 net/ipv4/af_inet.c:819 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg net/socket.c:725 [inline] ____sys_sendmsg+0x39a/0x510 net/socket.c:2413 ___sys_sendmsg net/socket.c:2467 [inline] __sys_sendmmsg+0x267/0x4c0 net/socket.c:2553 __do_sys_sendmmsg net/socket.c:2582 [inline] __se_sys_sendmmsg net/socket.c:2579 [inline] __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2579 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae read to 0xffff888133d9a9f8 of 1 bytes by task 23563 on cpu 1: __veth_xdp_flush drivers/net/veth.c:268 [inline] veth_xmit+0x2d6/0x470 drivers/net/veth.c:350 __netdev_start_xmit include/linux/netdevice.h:4683 [inline] netdev_start_xmit include/linux/netdevice.h:4697 [inline] xmit_one+0x105/0x2f0 net/core/dev.c:3473 dev_hard_start_xmit net/core/dev.c:3489 [inline] __dev_queue_xmit+0x86d/0xf90 net/core/dev.c:4116 dev_queue_xmit+0x13/0x20 net/core/dev.c:4149 br_dev_queue_push_xmit+0x3ce/0x430 net/bridge/br_forward.c:53 NF_HOOK include/linux/netfilter.h:307 [inline] br_forward_finish net/bridge/br_forward.c:66 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] __br_forward+0x2e4/0x400 net/bridge/br_forward.c:115 br_flood+0x521/0x5c0 net/bridge/br_forward.c:242 br_dev_xmit+0x8b6/0x960 __netdev_start_xmit include/linux/netdevice.h:4683 [inline] netdev_start_xmit include/linux/netdevice.h:4697 [inline] xmit_one+0x105/0x2f0 net/core/dev.c:3473 dev_hard_start_xmit net/core/dev.c:3489 [inline] __dev_queue_xmit+0x86d/0xf90 net/core/dev.c:4116 dev_queue_xmit+0x13/0x20 net/core/dev.c:4149 neigh_hh_output include/net/neighbour.h:525 [inline] neigh_output include/net/neighbour.h:539 [inline] ip_finish_output2+0x6f8/0xb70 net/ipv4/ip_output.c:228 ip_finish_output+0xfb/0x240 net/ipv4/ip_output.c:316 NF_HOOK_COND include/linux/netfilter.h:296 [inline] ip_output+0xf3/0x1a0 net/ipv4/ip_output.c:430 dst_output include/net/dst.h:451 [inline] ip_local_out net/ipv4/ip_output.c:126 [inline] ip_send_skb+0x6e/0xe0 net/ipv4/ip_output.c:1570 udp_send_skb+0x641/0x880 net/ipv4/udp.c:967 udp_sendmsg+0x12ea/0x14c0 net/ipv4/udp.c:1254 inet_sendmsg+0x5f/0x80 net/ipv4/af_inet.c:819 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg net/socket.c:725 [inline] ____sys_sendmsg+0x39a/0x510 net/socket.c:2413 ___sys_sendmsg net/socket.c:2467 [inline] __sys_sendmmsg+0x267/0x4c0 net/socket.c:2553 __do_sys_sendmmsg net/socket.c:2582 [inline] __se_sys_sendmmsg net/socket.c:2579 [inline] __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2579 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x00 -> 0x01 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 23563 Comm: syz-executor.5 Not tainted 5.17.0-rc2-syzkaller-00064-gc36c04c2e132 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 948d4f214fde ("veth: Add driver XDP") Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> Reported-by: syzbot <syzkaller@googlegroups.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-08 23:28:22 +00:00
if (napi_schedule_prep(&rq->xdp_napi)) {
WRITE_ONCE(rq->rx_notify_masked, true);
__napi_schedule(&rq->xdp_napi);
}
}
}
if (stats.xdp_tx > 0)
veth_xdp_flush(rq, &bq);
xdp_clear_return_frame_no_direct();
return done;
}
static int veth_create_page_pool(struct veth_rq *rq)
{
struct page_pool_params pp_params = {
.order = 0,
.pool_size = VETH_RING_SIZE,
.nid = NUMA_NO_NODE,
.dev = &rq->dev->dev,
};
rq->page_pool = page_pool_create(&pp_params);
if (IS_ERR(rq->page_pool)) {
int err = PTR_ERR(rq->page_pool);
rq->page_pool = NULL;
return err;
}
return 0;
}
static int __veth_napi_enable_range(struct net_device *dev, int start, int end)
{
struct veth_priv *priv = netdev_priv(dev);
int err, i;
for (i = start; i < end; i++) {
err = veth_create_page_pool(&priv->rq[i]);
if (err)
goto err_page_pool;
}
for (i = start; i < end; i++) {
struct veth_rq *rq = &priv->rq[i];
err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL);
if (err)
goto err_xdp_ring;
}
for (i = start; i < end; i++) {
struct veth_rq *rq = &priv->rq[i];
napi_enable(&rq->xdp_napi);
rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi);
}
return 0;
err_xdp_ring:
for (i--; i >= start; i--)
ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free);
i = end;
err_page_pool:
for (i--; i >= start; i--) {
page_pool_destroy(priv->rq[i].page_pool);
priv->rq[i].page_pool = NULL;
}
return err;
}
static int __veth_napi_enable(struct net_device *dev)
{
return __veth_napi_enable_range(dev, 0, dev->real_num_rx_queues);
}
static void veth_napi_del_range(struct net_device *dev, int start, int end)
{
struct veth_priv *priv = netdev_priv(dev);
int i;
for (i = start; i < end; i++) {
struct veth_rq *rq = &priv->rq[i];
rcu_assign_pointer(priv->rq[i].napi, NULL);
napi_disable(&rq->xdp_napi);
__netif_napi_del(&rq->xdp_napi);
}
synchronize_net();
for (i = start; i < end; i++) {
struct veth_rq *rq = &priv->rq[i];
rq->rx_notify_masked = false;
ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free);
}
for (i = start; i < end; i++) {
page_pool_destroy(priv->rq[i].page_pool);
priv->rq[i].page_pool = NULL;
}
}
static void veth_napi_del(struct net_device *dev)
{
veth_napi_del_range(dev, 0, dev->real_num_rx_queues);
}
static bool veth_gro_requested(const struct net_device *dev)
{
return !!(dev->wanted_features & NETIF_F_GRO);
}
static int veth_enable_xdp_range(struct net_device *dev, int start, int end,
bool napi_already_on)
{
struct veth_priv *priv = netdev_priv(dev);
int err, i;
for (i = start; i < end; i++) {
struct veth_rq *rq = &priv->rq[i];
if (!napi_already_on)
netif_napi_add(dev, &rq->xdp_napi, veth_poll);
err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id);
if (err < 0)
goto err_rxq_reg;
err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
MEM_TYPE_PAGE_SHARED,
NULL);
if (err < 0)
goto err_reg_mem;
/* Save original mem info as it can be overwritten */
rq->xdp_mem = rq->xdp_rxq.mem;
}
return 0;
err_reg_mem:
xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);
err_rxq_reg:
for (i--; i >= start; i--) {
struct veth_rq *rq = &priv->rq[i];
xdp_rxq_info_unreg(&rq->xdp_rxq);
if (!napi_already_on)
netif_napi_del(&rq->xdp_napi);
}
return err;
}
static void veth_disable_xdp_range(struct net_device *dev, int start, int end,
bool delete_napi)
{
struct veth_priv *priv = netdev_priv(dev);
int i;
for (i = start; i < end; i++) {
struct veth_rq *rq = &priv->rq[i];
rq->xdp_rxq.mem = rq->xdp_mem;
xdp_rxq_info_unreg(&rq->xdp_rxq);
if (delete_napi)
netif_napi_del(&rq->xdp_napi);
}
}
static int veth_enable_xdp(struct net_device *dev)
{
bool napi_already_on = veth_gro_requested(dev) && (dev->flags & IFF_UP);
struct veth_priv *priv = netdev_priv(dev);
int err, i;
if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) {
err = veth_enable_xdp_range(dev, 0, dev->real_num_rx_queues, napi_already_on);
if (err)
return err;
if (!napi_already_on) {
err = __veth_napi_enable(dev);
if (err) {
veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, true);
return err;
}
}
}
for (i = 0; i < dev->real_num_rx_queues; i++) {
rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog);
rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi);
}
return 0;
}
static void veth_disable_xdp(struct net_device *dev)
{
struct veth_priv *priv = netdev_priv(dev);
int i;
for (i = 0; i < dev->real_num_rx_queues; i++)
rcu_assign_pointer(priv->rq[i].xdp_prog, NULL);
net: veth: clear GRO when clearing XDP even when down veth sets NETIF_F_GRO automatically when XDP is enabled, because both features use the same NAPI machinery. The logic to clear NETIF_F_GRO sits in veth_disable_xdp() which is called both on ndo_stop and when XDP is turned off. To avoid the flag from being cleared when the device is brought down, the clearing is skipped when IFF_UP is not set. Bringing the device down should indeed not modify its features. Unfortunately, this means that clearing is also skipped when XDP is disabled _while_ the device is down. And there's nothing on the open path to bring the device features back into sync. IOW if user enables XDP, disables it and then brings the device up we'll end up with a stray GRO flag set but no NAPI instances. We don't depend on the GRO flag on the datapath, so the datapath won't crash. We will crash (or hang), however, next time features are sync'ed (either by user via ethtool or peer changing its config). The GRO flag will go away, and veth will try to disable the NAPIs. But the open path never created them since XDP was off, the GRO flag was a stray. If NAPI was initialized before we'll hang in napi_disable(). If it never was we'll crash trying to stop uninitialized hrtimer. Move the GRO flag updates to the XDP enable / disable paths, instead of mixing them with the ndo_open / ndo_close paths. Fixes: d3256efd8e8b ("veth: allow enabling NAPI even without XDP") Reported-by: Thomas Gleixner <tglx@linutronix.de> Reported-by: syzbot+039399a9b96297ddedca@syzkaller.appspotmail.com Signed-off-by: Jakub Kicinski <kuba@kernel.org> Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2024-02-21 23:12:10 +00:00
if (!netif_running(dev) || !veth_gro_requested(dev))
veth_napi_del(dev);
veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, false);
}
static int veth_napi_enable_range(struct net_device *dev, int start, int end)
{
struct veth_priv *priv = netdev_priv(dev);
int err, i;
for (i = start; i < end; i++) {
struct veth_rq *rq = &priv->rq[i];
netif_napi_add(dev, &rq->xdp_napi, veth_poll);
}
err = __veth_napi_enable_range(dev, start, end);
if (err) {
for (i = start; i < end; i++) {
struct veth_rq *rq = &priv->rq[i];
netif_napi_del(&rq->xdp_napi);
}
return err;
}
return err;
}
static int veth_napi_enable(struct net_device *dev)
{
return veth_napi_enable_range(dev, 0, dev->real_num_rx_queues);
}
static void veth_disable_range_safe(struct net_device *dev, int start, int end)
{
struct veth_priv *priv = netdev_priv(dev);
if (start >= end)
return;
if (priv->_xdp_prog) {
veth_napi_del_range(dev, start, end);
veth_disable_xdp_range(dev, start, end, false);
} else if (veth_gro_requested(dev)) {
veth_napi_del_range(dev, start, end);
}
}
static int veth_enable_range_safe(struct net_device *dev, int start, int end)
{
struct veth_priv *priv = netdev_priv(dev);
int err;
if (start >= end)
return 0;
if (priv->_xdp_prog) {
/* these channels are freshly initialized, napi is not on there even
* when GRO is requeste
*/
err = veth_enable_xdp_range(dev, start, end, false);
if (err)
return err;
err = __veth_napi_enable_range(dev, start, end);
if (err) {
/* on error always delete the newly added napis */
veth_disable_xdp_range(dev, start, end, true);
return err;
}
} else if (veth_gro_requested(dev)) {
return veth_napi_enable_range(dev, start, end);
}
return 0;
}
static void veth_set_xdp_features(struct net_device *dev)
{
struct veth_priv *priv = netdev_priv(dev);
struct net_device *peer;
veth: rely on rtnl_dereference() instead of on rcu_dereference() in veth_set_xdp_features() Fix the following kernel warning in veth_set_xdp_features routine relying on rtnl_dereference() instead of on rcu_dereference(): ============================= WARNING: suspicious RCU usage 6.3.0-rc1-00144-g064d70527aaa #149 Not tainted ----------------------------- drivers/net/veth.c:1265 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 1 lock held by ip/135: (net/core/rtnetlink.c:6172) stack backtrace: CPU: 1 PID: 135 Comm: ip Not tainted 6.3.0-rc1-00144-g064d70527aaa #149 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 Call Trace: <TASK> dump_stack_lvl (lib/dump_stack.c:107) lockdep_rcu_suspicious (include/linux/context_tracking.h:152) veth_set_xdp_features (drivers/net/veth.c:1265 (discriminator 9)) veth_newlink (drivers/net/veth.c:1892) ? veth_set_features (drivers/net/veth.c:1774) ? kasan_save_stack (mm/kasan/common.c:47) ? kasan_save_stack (mm/kasan/common.c:46) ? kasan_set_track (mm/kasan/common.c:52) ? alloc_netdev_mqs (include/linux/slab.h:737) ? rcu_read_lock_sched_held (kernel/rcu/update.c:125) ? trace_kmalloc (include/trace/events/kmem.h:54) ? __xdp_rxq_info_reg (net/core/xdp.c:188) ? alloc_netdev_mqs (net/core/dev.c:10657) ? rtnl_create_link (net/core/rtnetlink.c:3312) rtnl_newlink_create (net/core/rtnetlink.c:3440) ? rtnl_link_get_net_capable.constprop.0 (net/core/rtnetlink.c:3391) __rtnl_newlink (net/core/rtnetlink.c:3657) ? lock_downgrade (kernel/locking/lockdep.c:5321) ? rtnl_link_unregister (net/core/rtnetlink.c:3487) rtnl_newlink (net/core/rtnetlink.c:3671) rtnetlink_rcv_msg (net/core/rtnetlink.c:6174) ? rtnl_link_fill (net/core/rtnetlink.c:6070) ? mark_usage (kernel/locking/lockdep.c:4914) ? mark_usage (kernel/locking/lockdep.c:4914) netlink_rcv_skb (net/netlink/af_netlink.c:2574) ? rtnl_link_fill (net/core/rtnetlink.c:6070) ? netlink_ack (net/netlink/af_netlink.c:2551) ? lock_acquire (kernel/locking/lockdep.c:467) ? net_generic (include/linux/rcupdate.h:805) ? netlink_deliver_tap (include/linux/rcupdate.h:805) netlink_unicast (net/netlink/af_netlink.c:1340) ? netlink_attachskb (net/netlink/af_netlink.c:1350) netlink_sendmsg (net/netlink/af_netlink.c:1942) ? netlink_unicast (net/netlink/af_netlink.c:1861) ? netlink_unicast (net/netlink/af_netlink.c:1861) sock_sendmsg (net/socket.c:727) ____sys_sendmsg (net/socket.c:2501) ? kernel_sendmsg (net/socket.c:2448) ? __copy_msghdr (net/socket.c:2428) ___sys_sendmsg (net/socket.c:2557) ? mark_usage (kernel/locking/lockdep.c:4914) ? do_recvmmsg (net/socket.c:2544) ? lock_acquire (kernel/locking/lockdep.c:467) ? find_held_lock (kernel/locking/lockdep.c:5159) ? __lock_release (kernel/locking/lockdep.c:5345) ? __might_fault (mm/memory.c:5625) ? lock_downgrade (kernel/locking/lockdep.c:5321) ? __fget_light (include/linux/atomic/atomic-arch-fallback.h:227) __sys_sendmsg (include/linux/file.h:31) ? __sys_sendmsg_sock (net/socket.c:2572) ? rseq_get_rseq_cs (kernel/rseq.c:275) ? lockdep_hardirqs_on_prepare.part.0 (kernel/locking/lockdep.c:4263) do_syscall_64 (arch/x86/entry/common.c:50) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120) RIP: 0033:0x7f0d1aadeb17 Code: 0f 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b9 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 89 54 24 1c 48 89 74 24 10 Fixes: fccca038f300 ("veth: take into account device reconfiguration for xdp_features flag") Suggested-by: Eric Dumazet <edumazet@google.com> Reported-by: Matthieu Baerts <matthieu.baerts@tessares.net> Link: https://lore.kernel.org/netdev/cover.1678364612.git.lorenzo@kernel.org/T/#me4c9d8e985ec7ebee981cfdb5bc5ec651ef4035d Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org> Reported-by: syzbot+c3d0d9c42d59ff644ea6@syzkaller.appspotmail.com Reviewed-by: Eric Dumazet <edumazet@google.com> Tested-by: Matthieu Baerts <matthieu.baerts@tessares.net> Link: https://lore.kernel.org/r/dfd6a9a7d85e9113063165e1f47b466b90ad7b8a.1678748579.git.lorenzo@kernel.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2023-03-13 23:08:40 +00:00
peer = rtnl_dereference(priv->peer);
if (peer && peer->real_num_tx_queues <= dev->real_num_rx_queues) {
struct veth_priv *priv_peer = netdev_priv(peer);
xdp_features_t val = NETDEV_XDP_ACT_BASIC |
NETDEV_XDP_ACT_REDIRECT |
NETDEV_XDP_ACT_RX_SG;
if (priv_peer->_xdp_prog || veth_gro_requested(peer))
val |= NETDEV_XDP_ACT_NDO_XMIT |
NETDEV_XDP_ACT_NDO_XMIT_SG;
xdp_set_features_flag(dev, val);
} else {
xdp_clear_features_flag(dev);
}
}
static int veth_set_channels(struct net_device *dev,
struct ethtool_channels *ch)
{
struct veth_priv *priv = netdev_priv(dev);
unsigned int old_rx_count, new_rx_count;
struct veth_priv *peer_priv;
struct net_device *peer;
int err;
/* sanity check. Upper bounds are already enforced by the caller */
if (!ch->rx_count || !ch->tx_count)
return -EINVAL;
/* avoid braking XDP, if that is enabled */
peer = rtnl_dereference(priv->peer);
peer_priv = peer ? netdev_priv(peer) : NULL;
if (priv->_xdp_prog && peer && ch->rx_count < peer->real_num_tx_queues)
return -EINVAL;
if (peer && peer_priv && peer_priv->_xdp_prog && ch->tx_count > peer->real_num_rx_queues)
return -EINVAL;
old_rx_count = dev->real_num_rx_queues;
new_rx_count = ch->rx_count;
if (netif_running(dev)) {
/* turn device off */
netif_carrier_off(dev);
if (peer)
netif_carrier_off(peer);
/* try to allocate new resurces, as needed*/
err = veth_enable_range_safe(dev, old_rx_count, new_rx_count);
if (err)
goto out;
}
err = netif_set_real_num_rx_queues(dev, ch->rx_count);
if (err)
goto revert;
err = netif_set_real_num_tx_queues(dev, ch->tx_count);
if (err) {
int err2 = netif_set_real_num_rx_queues(dev, old_rx_count);
/* this error condition could happen only if rx and tx change
* in opposite directions (e.g. tx nr raises, rx nr decreases)
* and we can't do anything to fully restore the original
* status
*/
if (err2)
pr_warn("Can't restore rx queues config %d -> %d %d",
new_rx_count, old_rx_count, err2);
else
goto revert;
}
out:
if (netif_running(dev)) {
/* note that we need to swap the arguments WRT the enable part
* to identify the range we have to disable
*/
veth_disable_range_safe(dev, new_rx_count, old_rx_count);
netif_carrier_on(dev);
if (peer)
netif_carrier_on(peer);
}
/* update XDP supported features */
veth_set_xdp_features(dev);
if (peer)
veth_set_xdp_features(peer);
return err;
revert:
new_rx_count = old_rx_count;
old_rx_count = ch->rx_count;
goto out;
}
static int veth_open(struct net_device *dev)
{
struct veth_priv *priv = netdev_priv(dev);
veth: avoid a NULL deref in veth_stats_one commit 2681128f0ced8a (veth: extend device features) added a NULL deref in veth_stats_one(), as veth_get_stats64() was not testing if the peer device was setup or not. At init time, we call dev_get_stats() before veth pair is fully setup. [ 178.854758] [<ffffffffa00f5677>] veth_get_stats64+0x47/0x70 [veth] [ 178.861013] [<ffffffff814f0a2d>] dev_get_stats+0x6d/0x130 [ 178.866486] [<ffffffff81504efc>] rtnl_fill_ifinfo+0x47c/0x930 [ 178.872299] [<ffffffff81505b93>] rtmsg_ifinfo+0x83/0x100 [ 178.877678] [<ffffffff81505cc6>] rtnl_configure_link+0x76/0xa0 [ 178.883580] [<ffffffffa00f52fa>] veth_newlink+0x16a/0x350 [veth] [ 178.889654] [<ffffffff815061cc>] rtnl_newlink+0x4dc/0x5e0 [ 178.895128] [<ffffffff81505e1e>] ? rtnl_newlink+0x12e/0x5e0 [ 178.900769] [<ffffffff8150587d>] rtnetlink_rcv_msg+0x11d/0x310 [ 178.906669] [<ffffffff81505760>] ? __rtnl_unlock+0x20/0x20 [ 178.912225] [<ffffffff81521f89>] netlink_rcv_skb+0xa9/0xd0 [ 178.917779] [<ffffffff81502d55>] rtnetlink_rcv+0x25/0x40 [ 178.923159] [<ffffffff815218d1>] netlink_unicast+0x1b1/0x230 [ 178.928887] [<ffffffff81521c4e>] netlink_sendmsg+0x2fe/0x3b0 [ 178.934615] [<ffffffff814dbe22>] sock_sendmsg+0xd2/0xf0 So we must check if peer was setup in veth_get_stats64() As pointed out by Ben Hutchings, priv->peer is missing proper synchronization. Adding RCU protection is a safe and well documented way to make sure we don't access about to be freed or already freed data. Reported-by: Tom Parkin <tparkin@katalix.com> Signed-off-by: Eric Dumazet <edumazet@google.com> CC: Ben Hutchings <bhutchings@solarflare.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-01-04 15:42:40 +00:00
struct net_device *peer = rtnl_dereference(priv->peer);
int err;
veth: avoid a NULL deref in veth_stats_one commit 2681128f0ced8a (veth: extend device features) added a NULL deref in veth_stats_one(), as veth_get_stats64() was not testing if the peer device was setup or not. At init time, we call dev_get_stats() before veth pair is fully setup. [ 178.854758] [<ffffffffa00f5677>] veth_get_stats64+0x47/0x70 [veth] [ 178.861013] [<ffffffff814f0a2d>] dev_get_stats+0x6d/0x130 [ 178.866486] [<ffffffff81504efc>] rtnl_fill_ifinfo+0x47c/0x930 [ 178.872299] [<ffffffff81505b93>] rtmsg_ifinfo+0x83/0x100 [ 178.877678] [<ffffffff81505cc6>] rtnl_configure_link+0x76/0xa0 [ 178.883580] [<ffffffffa00f52fa>] veth_newlink+0x16a/0x350 [veth] [ 178.889654] [<ffffffff815061cc>] rtnl_newlink+0x4dc/0x5e0 [ 178.895128] [<ffffffff81505e1e>] ? rtnl_newlink+0x12e/0x5e0 [ 178.900769] [<ffffffff8150587d>] rtnetlink_rcv_msg+0x11d/0x310 [ 178.906669] [<ffffffff81505760>] ? __rtnl_unlock+0x20/0x20 [ 178.912225] [<ffffffff81521f89>] netlink_rcv_skb+0xa9/0xd0 [ 178.917779] [<ffffffff81502d55>] rtnetlink_rcv+0x25/0x40 [ 178.923159] [<ffffffff815218d1>] netlink_unicast+0x1b1/0x230 [ 178.928887] [<ffffffff81521c4e>] netlink_sendmsg+0x2fe/0x3b0 [ 178.934615] [<ffffffff814dbe22>] sock_sendmsg+0xd2/0xf0 So we must check if peer was setup in veth_get_stats64() As pointed out by Ben Hutchings, priv->peer is missing proper synchronization. Adding RCU protection is a safe and well documented way to make sure we don't access about to be freed or already freed data. Reported-by: Tom Parkin <tparkin@katalix.com> Signed-off-by: Eric Dumazet <edumazet@google.com> CC: Ben Hutchings <bhutchings@solarflare.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-01-04 15:42:40 +00:00
if (!peer)
return -ENOTCONN;
if (priv->_xdp_prog) {
err = veth_enable_xdp(dev);
if (err)
return err;
} else if (veth_gro_requested(dev)) {
err = veth_napi_enable(dev);
if (err)
return err;
}
veth: avoid a NULL deref in veth_stats_one commit 2681128f0ced8a (veth: extend device features) added a NULL deref in veth_stats_one(), as veth_get_stats64() was not testing if the peer device was setup or not. At init time, we call dev_get_stats() before veth pair is fully setup. [ 178.854758] [<ffffffffa00f5677>] veth_get_stats64+0x47/0x70 [veth] [ 178.861013] [<ffffffff814f0a2d>] dev_get_stats+0x6d/0x130 [ 178.866486] [<ffffffff81504efc>] rtnl_fill_ifinfo+0x47c/0x930 [ 178.872299] [<ffffffff81505b93>] rtmsg_ifinfo+0x83/0x100 [ 178.877678] [<ffffffff81505cc6>] rtnl_configure_link+0x76/0xa0 [ 178.883580] [<ffffffffa00f52fa>] veth_newlink+0x16a/0x350 [veth] [ 178.889654] [<ffffffff815061cc>] rtnl_newlink+0x4dc/0x5e0 [ 178.895128] [<ffffffff81505e1e>] ? rtnl_newlink+0x12e/0x5e0 [ 178.900769] [<ffffffff8150587d>] rtnetlink_rcv_msg+0x11d/0x310 [ 178.906669] [<ffffffff81505760>] ? __rtnl_unlock+0x20/0x20 [ 178.912225] [<ffffffff81521f89>] netlink_rcv_skb+0xa9/0xd0 [ 178.917779] [<ffffffff81502d55>] rtnetlink_rcv+0x25/0x40 [ 178.923159] [<ffffffff815218d1>] netlink_unicast+0x1b1/0x230 [ 178.928887] [<ffffffff81521c4e>] netlink_sendmsg+0x2fe/0x3b0 [ 178.934615] [<ffffffff814dbe22>] sock_sendmsg+0xd2/0xf0 So we must check if peer was setup in veth_get_stats64() As pointed out by Ben Hutchings, priv->peer is missing proper synchronization. Adding RCU protection is a safe and well documented way to make sure we don't access about to be freed or already freed data. Reported-by: Tom Parkin <tparkin@katalix.com> Signed-off-by: Eric Dumazet <edumazet@google.com> CC: Ben Hutchings <bhutchings@solarflare.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-01-04 15:42:40 +00:00
if (peer->flags & IFF_UP) {
netif_carrier_on(dev);
veth: avoid a NULL deref in veth_stats_one commit 2681128f0ced8a (veth: extend device features) added a NULL deref in veth_stats_one(), as veth_get_stats64() was not testing if the peer device was setup or not. At init time, we call dev_get_stats() before veth pair is fully setup. [ 178.854758] [<ffffffffa00f5677>] veth_get_stats64+0x47/0x70 [veth] [ 178.861013] [<ffffffff814f0a2d>] dev_get_stats+0x6d/0x130 [ 178.866486] [<ffffffff81504efc>] rtnl_fill_ifinfo+0x47c/0x930 [ 178.872299] [<ffffffff81505b93>] rtmsg_ifinfo+0x83/0x100 [ 178.877678] [<ffffffff81505cc6>] rtnl_configure_link+0x76/0xa0 [ 178.883580] [<ffffffffa00f52fa>] veth_newlink+0x16a/0x350 [veth] [ 178.889654] [<ffffffff815061cc>] rtnl_newlink+0x4dc/0x5e0 [ 178.895128] [<ffffffff81505e1e>] ? rtnl_newlink+0x12e/0x5e0 [ 178.900769] [<ffffffff8150587d>] rtnetlink_rcv_msg+0x11d/0x310 [ 178.906669] [<ffffffff81505760>] ? __rtnl_unlock+0x20/0x20 [ 178.912225] [<ffffffff81521f89>] netlink_rcv_skb+0xa9/0xd0 [ 178.917779] [<ffffffff81502d55>] rtnetlink_rcv+0x25/0x40 [ 178.923159] [<ffffffff815218d1>] netlink_unicast+0x1b1/0x230 [ 178.928887] [<ffffffff81521c4e>] netlink_sendmsg+0x2fe/0x3b0 [ 178.934615] [<ffffffff814dbe22>] sock_sendmsg+0xd2/0xf0 So we must check if peer was setup in veth_get_stats64() As pointed out by Ben Hutchings, priv->peer is missing proper synchronization. Adding RCU protection is a safe and well documented way to make sure we don't access about to be freed or already freed data. Reported-by: Tom Parkin <tparkin@katalix.com> Signed-off-by: Eric Dumazet <edumazet@google.com> CC: Ben Hutchings <bhutchings@solarflare.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-01-04 15:42:40 +00:00
netif_carrier_on(peer);
}
veth_set_xdp_features(dev);
return 0;
}
static int veth_close(struct net_device *dev)
{
struct veth_priv *priv = netdev_priv(dev);
struct net_device *peer = rtnl_dereference(priv->peer);
netif_carrier_off(dev);
if (peer)
netif_carrier_off(peer);
if (priv->_xdp_prog)
veth_disable_xdp(dev);
else if (veth_gro_requested(dev))
veth_napi_del(dev);
return 0;
}
net: use core MTU range checking in core net infra geneve: - Merge __geneve_change_mtu back into geneve_change_mtu, set max_mtu - This one isn't quite as straight-forward as others, could use some closer inspection and testing macvlan: - set min/max_mtu tun: - set min/max_mtu, remove tun_net_change_mtu vxlan: - Merge __vxlan_change_mtu back into vxlan_change_mtu - Set max_mtu to IP_MAX_MTU and retain dynamic MTU range checks in change_mtu function - This one is also not as straight-forward and could use closer inspection and testing from vxlan folks bridge: - set max_mtu of IP_MAX_MTU and retain dynamic MTU range checks in change_mtu function openvswitch: - set min/max_mtu, remove internal_dev_change_mtu - note: max_mtu wasn't checked previously, it's been set to 65535, which is the largest possible size supported sch_teql: - set min/max_mtu (note: max_mtu previously unchecked, used max of 65535) macsec: - min_mtu = 0, max_mtu = 65535 macvlan: - min_mtu = 0, max_mtu = 65535 ntb_netdev: - min_mtu = 0, max_mtu = 65535 veth: - min_mtu = 68, max_mtu = 65535 8021q: - min_mtu = 0, max_mtu = 65535 CC: netdev@vger.kernel.org CC: Nicolas Dichtel <nicolas.dichtel@6wind.com> CC: Hannes Frederic Sowa <hannes@stressinduktion.org> CC: Tom Herbert <tom@herbertland.com> CC: Daniel Borkmann <daniel@iogearbox.net> CC: Alexander Duyck <alexander.h.duyck@intel.com> CC: Paolo Abeni <pabeni@redhat.com> CC: Jiri Benc <jbenc@redhat.com> CC: WANG Cong <xiyou.wangcong@gmail.com> CC: Roopa Prabhu <roopa@cumulusnetworks.com> CC: Pravin B Shelar <pshelar@ovn.org> CC: Sabrina Dubroca <sd@queasysnail.net> CC: Patrick McHardy <kaber@trash.net> CC: Stephen Hemminger <stephen@networkplumber.org> CC: Pravin Shelar <pshelar@nicira.com> CC: Maxim Krasnyansky <maxk@qti.qualcomm.com> Signed-off-by: Jarod Wilson <jarod@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-10-20 17:55:20 +00:00
static int is_valid_veth_mtu(int mtu)
{
net: use core MTU range checking in core net infra geneve: - Merge __geneve_change_mtu back into geneve_change_mtu, set max_mtu - This one isn't quite as straight-forward as others, could use some closer inspection and testing macvlan: - set min/max_mtu tun: - set min/max_mtu, remove tun_net_change_mtu vxlan: - Merge __vxlan_change_mtu back into vxlan_change_mtu - Set max_mtu to IP_MAX_MTU and retain dynamic MTU range checks in change_mtu function - This one is also not as straight-forward and could use closer inspection and testing from vxlan folks bridge: - set max_mtu of IP_MAX_MTU and retain dynamic MTU range checks in change_mtu function openvswitch: - set min/max_mtu, remove internal_dev_change_mtu - note: max_mtu wasn't checked previously, it's been set to 65535, which is the largest possible size supported sch_teql: - set min/max_mtu (note: max_mtu previously unchecked, used max of 65535) macsec: - min_mtu = 0, max_mtu = 65535 macvlan: - min_mtu = 0, max_mtu = 65535 ntb_netdev: - min_mtu = 0, max_mtu = 65535 veth: - min_mtu = 68, max_mtu = 65535 8021q: - min_mtu = 0, max_mtu = 65535 CC: netdev@vger.kernel.org CC: Nicolas Dichtel <nicolas.dichtel@6wind.com> CC: Hannes Frederic Sowa <hannes@stressinduktion.org> CC: Tom Herbert <tom@herbertland.com> CC: Daniel Borkmann <daniel@iogearbox.net> CC: Alexander Duyck <alexander.h.duyck@intel.com> CC: Paolo Abeni <pabeni@redhat.com> CC: Jiri Benc <jbenc@redhat.com> CC: WANG Cong <xiyou.wangcong@gmail.com> CC: Roopa Prabhu <roopa@cumulusnetworks.com> CC: Pravin B Shelar <pshelar@ovn.org> CC: Sabrina Dubroca <sd@queasysnail.net> CC: Patrick McHardy <kaber@trash.net> CC: Stephen Hemminger <stephen@networkplumber.org> CC: Pravin Shelar <pshelar@nicira.com> CC: Maxim Krasnyansky <maxk@qti.qualcomm.com> Signed-off-by: Jarod Wilson <jarod@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-10-20 17:55:20 +00:00
return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU;
}
veth: Free queues on link delete David Ahern reported memory leak in veth. ======================================================================= $ cat /sys/kernel/debug/kmemleak unreferenced object 0xffff8800354d5c00 (size 1024): comm "ip", pid 836, jiffies 4294722952 (age 25.904s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<(____ptrval____)>] kmemleak_alloc+0x70/0x94 [<(____ptrval____)>] slab_post_alloc_hook+0x42/0x52 [<(____ptrval____)>] __kmalloc+0x101/0x142 [<(____ptrval____)>] kmalloc_array.constprop.20+0x1e/0x26 [veth] [<(____ptrval____)>] veth_newlink+0x147/0x3ac [veth] ... unreferenced object 0xffff88002e009c00 (size 1024): comm "ip", pid 836, jiffies 4294722958 (age 25.898s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<(____ptrval____)>] kmemleak_alloc+0x70/0x94 [<(____ptrval____)>] slab_post_alloc_hook+0x42/0x52 [<(____ptrval____)>] __kmalloc+0x101/0x142 [<(____ptrval____)>] kmalloc_array.constprop.20+0x1e/0x26 [veth] [<(____ptrval____)>] veth_newlink+0x219/0x3ac [veth] ======================================================================= veth_rq allocated in veth_newlink() was not freed on dellink. We need to free up them after veth_close() so that any packets will not reference the queues afterwards. Thus free them in veth_dev_free() in the same way as freeing stats structure (vstats). Also move queues allocation to veth_dev_init() to be in line with stats allocation. Fixes: 638264dc90227 ("veth: Support per queue XDP ring") Reported-by: David Ahern <dsahern@gmail.com> Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> Reviewed-by: David Ahern <dsahern@gmail.com> Tested-by: David Ahern <dsahern@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-08-15 08:07:29 +00:00
static int veth_alloc_queues(struct net_device *dev)
{
struct veth_priv *priv = netdev_priv(dev);
int i;
priv->rq = kvcalloc(dev->num_rx_queues, sizeof(*priv->rq),
GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
veth: Free queues on link delete David Ahern reported memory leak in veth. ======================================================================= $ cat /sys/kernel/debug/kmemleak unreferenced object 0xffff8800354d5c00 (size 1024): comm "ip", pid 836, jiffies 4294722952 (age 25.904s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<(____ptrval____)>] kmemleak_alloc+0x70/0x94 [<(____ptrval____)>] slab_post_alloc_hook+0x42/0x52 [<(____ptrval____)>] __kmalloc+0x101/0x142 [<(____ptrval____)>] kmalloc_array.constprop.20+0x1e/0x26 [veth] [<(____ptrval____)>] veth_newlink+0x147/0x3ac [veth] ... unreferenced object 0xffff88002e009c00 (size 1024): comm "ip", pid 836, jiffies 4294722958 (age 25.898s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<(____ptrval____)>] kmemleak_alloc+0x70/0x94 [<(____ptrval____)>] slab_post_alloc_hook+0x42/0x52 [<(____ptrval____)>] __kmalloc+0x101/0x142 [<(____ptrval____)>] kmalloc_array.constprop.20+0x1e/0x26 [veth] [<(____ptrval____)>] veth_newlink+0x219/0x3ac [veth] ======================================================================= veth_rq allocated in veth_newlink() was not freed on dellink. We need to free up them after veth_close() so that any packets will not reference the queues afterwards. Thus free them in veth_dev_free() in the same way as freeing stats structure (vstats). Also move queues allocation to veth_dev_init() to be in line with stats allocation. Fixes: 638264dc90227 ("veth: Support per queue XDP ring") Reported-by: David Ahern <dsahern@gmail.com> Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> Reviewed-by: David Ahern <dsahern@gmail.com> Tested-by: David Ahern <dsahern@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-08-15 08:07:29 +00:00
if (!priv->rq)
return -ENOMEM;
for (i = 0; i < dev->num_rx_queues; i++) {
veth: Free queues on link delete David Ahern reported memory leak in veth. ======================================================================= $ cat /sys/kernel/debug/kmemleak unreferenced object 0xffff8800354d5c00 (size 1024): comm "ip", pid 836, jiffies 4294722952 (age 25.904s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<(____ptrval____)>] kmemleak_alloc+0x70/0x94 [<(____ptrval____)>] slab_post_alloc_hook+0x42/0x52 [<(____ptrval____)>] __kmalloc+0x101/0x142 [<(____ptrval____)>] kmalloc_array.constprop.20+0x1e/0x26 [veth] [<(____ptrval____)>] veth_newlink+0x147/0x3ac [veth] ... unreferenced object 0xffff88002e009c00 (size 1024): comm "ip", pid 836, jiffies 4294722958 (age 25.898s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<(____ptrval____)>] kmemleak_alloc+0x70/0x94 [<(____ptrval____)>] slab_post_alloc_hook+0x42/0x52 [<(____ptrval____)>] __kmalloc+0x101/0x142 [<(____ptrval____)>] kmalloc_array.constprop.20+0x1e/0x26 [veth] [<(____ptrval____)>] veth_newlink+0x219/0x3ac [veth] ======================================================================= veth_rq allocated in veth_newlink() was not freed on dellink. We need to free up them after veth_close() so that any packets will not reference the queues afterwards. Thus free them in veth_dev_free() in the same way as freeing stats structure (vstats). Also move queues allocation to veth_dev_init() to be in line with stats allocation. Fixes: 638264dc90227 ("veth: Support per queue XDP ring") Reported-by: David Ahern <dsahern@gmail.com> Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> Reviewed-by: David Ahern <dsahern@gmail.com> Tested-by: David Ahern <dsahern@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-08-15 08:07:29 +00:00
priv->rq[i].dev = dev;
u64_stats_init(&priv->rq[i].stats.syncp);
}
veth: Free queues on link delete David Ahern reported memory leak in veth. ======================================================================= $ cat /sys/kernel/debug/kmemleak unreferenced object 0xffff8800354d5c00 (size 1024): comm "ip", pid 836, jiffies 4294722952 (age 25.904s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<(____ptrval____)>] kmemleak_alloc+0x70/0x94 [<(____ptrval____)>] slab_post_alloc_hook+0x42/0x52 [<(____ptrval____)>] __kmalloc+0x101/0x142 [<(____ptrval____)>] kmalloc_array.constprop.20+0x1e/0x26 [veth] [<(____ptrval____)>] veth_newlink+0x147/0x3ac [veth] ... unreferenced object 0xffff88002e009c00 (size 1024): comm "ip", pid 836, jiffies 4294722958 (age 25.898s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<(____ptrval____)>] kmemleak_alloc+0x70/0x94 [<(____ptrval____)>] slab_post_alloc_hook+0x42/0x52 [<(____ptrval____)>] __kmalloc+0x101/0x142 [<(____ptrval____)>] kmalloc_array.constprop.20+0x1e/0x26 [veth] [<(____ptrval____)>] veth_newlink+0x219/0x3ac [veth] ======================================================================= veth_rq allocated in veth_newlink() was not freed on dellink. We need to free up them after veth_close() so that any packets will not reference the queues afterwards. Thus free them in veth_dev_free() in the same way as freeing stats structure (vstats). Also move queues allocation to veth_dev_init() to be in line with stats allocation. Fixes: 638264dc90227 ("veth: Support per queue XDP ring") Reported-by: David Ahern <dsahern@gmail.com> Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> Reviewed-by: David Ahern <dsahern@gmail.com> Tested-by: David Ahern <dsahern@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-08-15 08:07:29 +00:00
return 0;
}
static void veth_free_queues(struct net_device *dev)
{
struct veth_priv *priv = netdev_priv(dev);
kvfree(priv->rq);
veth: Free queues on link delete David Ahern reported memory leak in veth. ======================================================================= $ cat /sys/kernel/debug/kmemleak unreferenced object 0xffff8800354d5c00 (size 1024): comm "ip", pid 836, jiffies 4294722952 (age 25.904s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<(____ptrval____)>] kmemleak_alloc+0x70/0x94 [<(____ptrval____)>] slab_post_alloc_hook+0x42/0x52 [<(____ptrval____)>] __kmalloc+0x101/0x142 [<(____ptrval____)>] kmalloc_array.constprop.20+0x1e/0x26 [veth] [<(____ptrval____)>] veth_newlink+0x147/0x3ac [veth] ... unreferenced object 0xffff88002e009c00 (size 1024): comm "ip", pid 836, jiffies 4294722958 (age 25.898s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<(____ptrval____)>] kmemleak_alloc+0x70/0x94 [<(____ptrval____)>] slab_post_alloc_hook+0x42/0x52 [<(____ptrval____)>] __kmalloc+0x101/0x142 [<(____ptrval____)>] kmalloc_array.constprop.20+0x1e/0x26 [veth] [<(____ptrval____)>] veth_newlink+0x219/0x3ac [veth] ======================================================================= veth_rq allocated in veth_newlink() was not freed on dellink. We need to free up them after veth_close() so that any packets will not reference the queues afterwards. Thus free them in veth_dev_free() in the same way as freeing stats structure (vstats). Also move queues allocation to veth_dev_init() to be in line with stats allocation. Fixes: 638264dc90227 ("veth: Support per queue XDP ring") Reported-by: David Ahern <dsahern@gmail.com> Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> Reviewed-by: David Ahern <dsahern@gmail.com> Tested-by: David Ahern <dsahern@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-08-15 08:07:29 +00:00
}
static int veth_dev_init(struct net_device *dev)
{
net: add netdev_lockdep_set_classes() to virtual drivers Based on a syzbot report, it appears many virtual drivers do not yet use netdev_lockdep_set_classes(), triggerring lockdep false positives. WARNING: possible recursive locking detected 6.8.0-rc4-next-20240212-syzkaller #0 Not tainted syz-executor.0/19016 is trying to acquire lock: ffff8880162cb298 (_xmit_ETHER#2){+.-.}-{2:2}, at: spin_lock include/linux/spinlock.h:351 [inline] ffff8880162cb298 (_xmit_ETHER#2){+.-.}-{2:2}, at: __netif_tx_lock include/linux/netdevice.h:4452 [inline] ffff8880162cb298 (_xmit_ETHER#2){+.-.}-{2:2}, at: sch_direct_xmit+0x1c4/0x5f0 net/sched/sch_generic.c:340 but task is already holding lock: ffff8880223db4d8 (_xmit_ETHER#2){+.-.}-{2:2}, at: spin_lock include/linux/spinlock.h:351 [inline] ffff8880223db4d8 (_xmit_ETHER#2){+.-.}-{2:2}, at: __netif_tx_lock include/linux/netdevice.h:4452 [inline] ffff8880223db4d8 (_xmit_ETHER#2){+.-.}-{2:2}, at: sch_direct_xmit+0x1c4/0x5f0 net/sched/sch_generic.c:340 other info that might help us debug this: Possible unsafe locking scenario: CPU0 lock(_xmit_ETHER#2); lock(_xmit_ETHER#2); *** DEADLOCK *** May be due to missing lock nesting notation 9 locks held by syz-executor.0/19016: #0: ffffffff8f385208 (rtnl_mutex){+.+.}-{3:3}, at: rtnl_lock net/core/rtnetlink.c:79 [inline] #0: ffffffff8f385208 (rtnl_mutex){+.+.}-{3:3}, at: rtnetlink_rcv_msg+0x82c/0x1040 net/core/rtnetlink.c:6603 #1: ffffc90000a08c00 ((&in_dev->mr_ifc_timer)){+.-.}-{0:0}, at: call_timer_fn+0xc0/0x600 kernel/time/timer.c:1697 #2: ffffffff8e131520 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:298 [inline] #2: ffffffff8e131520 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:750 [inline] #2: ffffffff8e131520 (rcu_read_lock){....}-{1:2}, at: ip_finish_output2+0x45f/0x1360 net/ipv4/ip_output.c:228 #3: ffffffff8e131580 (rcu_read_lock_bh){....}-{1:2}, at: local_bh_disable include/linux/bottom_half.h:20 [inline] #3: ffffffff8e131580 (rcu_read_lock_bh){....}-{1:2}, at: rcu_read_lock_bh include/linux/rcupdate.h:802 [inline] #3: ffffffff8e131580 (rcu_read_lock_bh){....}-{1:2}, at: __dev_queue_xmit+0x2c4/0x3b10 net/core/dev.c:4284 #4: ffff8880416e3258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: spin_trylock include/linux/spinlock.h:361 [inline] #4: ffff8880416e3258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: qdisc_run_begin include/net/sch_generic.h:195 [inline] #4: ffff8880416e3258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: __dev_xmit_skb net/core/dev.c:3771 [inline] #4: ffff8880416e3258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: __dev_queue_xmit+0x1262/0x3b10 net/core/dev.c:4325 #5: ffff8880223db4d8 (_xmit_ETHER#2){+.-.}-{2:2}, at: spin_lock include/linux/spinlock.h:351 [inline] #5: ffff8880223db4d8 (_xmit_ETHER#2){+.-.}-{2:2}, at: __netif_tx_lock include/linux/netdevice.h:4452 [inline] #5: ffff8880223db4d8 (_xmit_ETHER#2){+.-.}-{2:2}, at: sch_direct_xmit+0x1c4/0x5f0 net/sched/sch_generic.c:340 #6: ffffffff8e131520 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:298 [inline] #6: ffffffff8e131520 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:750 [inline] #6: ffffffff8e131520 (rcu_read_lock){....}-{1:2}, at: ip_finish_output2+0x45f/0x1360 net/ipv4/ip_output.c:228 #7: ffffffff8e131580 (rcu_read_lock_bh){....}-{1:2}, at: local_bh_disable include/linux/bottom_half.h:20 [inline] #7: ffffffff8e131580 (rcu_read_lock_bh){....}-{1:2}, at: rcu_read_lock_bh include/linux/rcupdate.h:802 [inline] #7: ffffffff8e131580 (rcu_read_lock_bh){....}-{1:2}, at: __dev_queue_xmit+0x2c4/0x3b10 net/core/dev.c:4284 #8: ffff888014d9d258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: spin_trylock include/linux/spinlock.h:361 [inline] #8: ffff888014d9d258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: qdisc_run_begin include/net/sch_generic.h:195 [inline] #8: ffff888014d9d258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: __dev_xmit_skb net/core/dev.c:3771 [inline] #8: ffff888014d9d258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: __dev_queue_xmit+0x1262/0x3b10 net/core/dev.c:4325 stack backtrace: CPU: 1 PID: 19016 Comm: syz-executor.0 Not tainted 6.8.0-rc4-next-20240212-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/25/2024 Call Trace: <IRQ> __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:114 check_deadlock kernel/locking/lockdep.c:3062 [inline] validate_chain+0x15c1/0x58e0 kernel/locking/lockdep.c:3856 __lock_acquire+0x1346/0x1fd0 kernel/locking/lockdep.c:5137 lock_acquire+0x1e4/0x530 kernel/locking/lockdep.c:5754 __raw_spin_lock include/linux/spinlock_api_smp.h:133 [inline] _raw_spin_lock+0x2e/0x40 kernel/locking/spinlock.c:154 spin_lock include/linux/spinlock.h:351 [inline] __netif_tx_lock include/linux/netdevice.h:4452 [inline] sch_direct_xmit+0x1c4/0x5f0 net/sched/sch_generic.c:340 __dev_xmit_skb net/core/dev.c:3784 [inline] __dev_queue_xmit+0x1912/0x3b10 net/core/dev.c:4325 neigh_output include/net/neighbour.h:542 [inline] ip_finish_output2+0xe66/0x1360 net/ipv4/ip_output.c:235 iptunnel_xmit+0x540/0x9b0 net/ipv4/ip_tunnel_core.c:82 ip_tunnel_xmit+0x20ee/0x2960 net/ipv4/ip_tunnel.c:831 erspan_xmit+0x9de/0x1460 net/ipv4/ip_gre.c:720 __netdev_start_xmit include/linux/netdevice.h:4989 [inline] netdev_start_xmit include/linux/netdevice.h:5003 [inline] xmit_one net/core/dev.c:3555 [inline] dev_hard_start_xmit+0x242/0x770 net/core/dev.c:3571 sch_direct_xmit+0x2b6/0x5f0 net/sched/sch_generic.c:342 __dev_xmit_skb net/core/dev.c:3784 [inline] __dev_queue_xmit+0x1912/0x3b10 net/core/dev.c:4325 neigh_output include/net/neighbour.h:542 [inline] ip_finish_output2+0xe66/0x1360 net/ipv4/ip_output.c:235 igmpv3_send_cr net/ipv4/igmp.c:723 [inline] igmp_ifc_timer_expire+0xb71/0xd90 net/ipv4/igmp.c:813 call_timer_fn+0x17e/0x600 kernel/time/timer.c:1700 expire_timers kernel/time/timer.c:1751 [inline] __run_timers+0x621/0x830 kernel/time/timer.c:2038 run_timer_softirq+0x67/0xf0 kernel/time/timer.c:2051 __do_softirq+0x2bc/0x943 kernel/softirq.c:554 invoke_softirq kernel/softirq.c:428 [inline] __irq_exit_rcu+0xf2/0x1c0 kernel/softirq.c:633 irq_exit_rcu+0x9/0x30 kernel/softirq.c:645 instr_sysvec_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1076 [inline] sysvec_apic_timer_interrupt+0xa6/0xc0 arch/x86/kernel/apic/apic.c:1076 </IRQ> <TASK> asm_sysvec_apic_timer_interrupt+0x1a/0x20 arch/x86/include/asm/idtentry.h:702 RIP: 0010:resched_offsets_ok kernel/sched/core.c:10127 [inline] RIP: 0010:__might_resched+0x16f/0x780 kernel/sched/core.c:10142 Code: 00 4c 89 e8 48 c1 e8 03 48 ba 00 00 00 00 00 fc ff df 48 89 44 24 38 0f b6 04 10 84 c0 0f 85 87 04 00 00 41 8b 45 00 c1 e0 08 <01> d8 44 39 e0 0f 85 d6 00 00 00 44 89 64 24 1c 48 8d bc 24 a0 00 RSP: 0018:ffffc9000ee069e0 EFLAGS: 00000246 RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffff8880296a9e00 RDX: dffffc0000000000 RSI: ffff8880296a9e00 RDI: ffffffff8bfe8fa0 RBP: ffffc9000ee06b00 R08: ffffffff82326877 R09: 1ffff11002b5ad1b R10: dffffc0000000000 R11: ffffed1002b5ad1c R12: 0000000000000000 R13: ffff8880296aa23c R14: 000000000000062a R15: 1ffff92001dc0d44 down_write+0x19/0x50 kernel/locking/rwsem.c:1578 kernfs_activate fs/kernfs/dir.c:1403 [inline] kernfs_add_one+0x4af/0x8b0 fs/kernfs/dir.c:819 __kernfs_create_file+0x22e/0x2e0 fs/kernfs/file.c:1056 sysfs_add_file_mode_ns+0x24a/0x310 fs/sysfs/file.c:307 create_files fs/sysfs/group.c:64 [inline] internal_create_group+0x4f4/0xf20 fs/sysfs/group.c:152 internal_create_groups fs/sysfs/group.c:192 [inline] sysfs_create_groups+0x56/0x120 fs/sysfs/group.c:218 create_dir lib/kobject.c:78 [inline] kobject_add_internal+0x472/0x8d0 lib/kobject.c:240 kobject_add_varg lib/kobject.c:374 [inline] kobject_init_and_add+0x124/0x190 lib/kobject.c:457 netdev_queue_add_kobject net/core/net-sysfs.c:1706 [inline] netdev_queue_update_kobjects+0x1f3/0x480 net/core/net-sysfs.c:1758 register_queue_kobjects net/core/net-sysfs.c:1819 [inline] netdev_register_kobject+0x265/0x310 net/core/net-sysfs.c:2059 register_netdevice+0x1191/0x19c0 net/core/dev.c:10298 bond_newlink+0x3b/0x90 drivers/net/bonding/bond_netlink.c:576 rtnl_newlink_create net/core/rtnetlink.c:3506 [inline] __rtnl_newlink net/core/rtnetlink.c:3726 [inline] rtnl_newlink+0x158f/0x20a0 net/core/rtnetlink.c:3739 rtnetlink_rcv_msg+0x885/0x1040 net/core/rtnetlink.c:6606 netlink_rcv_skb+0x1e3/0x430 net/netlink/af_netlink.c:2543 netlink_unicast_kernel net/netlink/af_netlink.c:1341 [inline] netlink_unicast+0x7ea/0x980 net/netlink/af_netlink.c:1367 netlink_sendmsg+0xa3c/0xd70 net/netlink/af_netlink.c:1908 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x221/0x270 net/socket.c:745 __sys_sendto+0x3a4/0x4f0 net/socket.c:2191 __do_sys_sendto net/socket.c:2203 [inline] __se_sys_sendto net/socket.c:2199 [inline] __x64_sys_sendto+0xde/0x100 net/socket.c:2199 do_syscall_64+0xfb/0x240 entry_SYSCALL_64_after_hwframe+0x6d/0x75 RIP: 0033:0x7fc3fa87fa9c Reported-by: syzbot <syzkaller@googlegroups.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Link: https://lore.kernel.org/r/20240212140700.2795436-4-edumazet@google.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-02-12 14:07:00 +00:00
netdev_lockdep_set_classes(dev);
return veth_alloc_queues(dev);
}
static void veth_dev_free(struct net_device *dev)
{
veth: Free queues on link delete David Ahern reported memory leak in veth. ======================================================================= $ cat /sys/kernel/debug/kmemleak unreferenced object 0xffff8800354d5c00 (size 1024): comm "ip", pid 836, jiffies 4294722952 (age 25.904s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<(____ptrval____)>] kmemleak_alloc+0x70/0x94 [<(____ptrval____)>] slab_post_alloc_hook+0x42/0x52 [<(____ptrval____)>] __kmalloc+0x101/0x142 [<(____ptrval____)>] kmalloc_array.constprop.20+0x1e/0x26 [veth] [<(____ptrval____)>] veth_newlink+0x147/0x3ac [veth] ... unreferenced object 0xffff88002e009c00 (size 1024): comm "ip", pid 836, jiffies 4294722958 (age 25.898s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<(____ptrval____)>] kmemleak_alloc+0x70/0x94 [<(____ptrval____)>] slab_post_alloc_hook+0x42/0x52 [<(____ptrval____)>] __kmalloc+0x101/0x142 [<(____ptrval____)>] kmalloc_array.constprop.20+0x1e/0x26 [veth] [<(____ptrval____)>] veth_newlink+0x219/0x3ac [veth] ======================================================================= veth_rq allocated in veth_newlink() was not freed on dellink. We need to free up them after veth_close() so that any packets will not reference the queues afterwards. Thus free them in veth_dev_free() in the same way as freeing stats structure (vstats). Also move queues allocation to veth_dev_init() to be in line with stats allocation. Fixes: 638264dc90227 ("veth: Support per queue XDP ring") Reported-by: David Ahern <dsahern@gmail.com> Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> Reviewed-by: David Ahern <dsahern@gmail.com> Tested-by: David Ahern <dsahern@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-08-15 08:07:29 +00:00
veth_free_queues(dev);
}
#ifdef CONFIG_NET_POLL_CONTROLLER
static void veth_poll_controller(struct net_device *dev)
{
/* veth only receives frames when its peer sends one
* Since it has nothing to do with disabling irqs, we are guaranteed
* never to have pending data when we poll for it so
* there is nothing to do here.
*
* We need this though so netpoll recognizes us as an interface that
* supports polling, which enables bridge devices in virt setups to
* still use netconsole
*/
}
#endif /* CONFIG_NET_POLL_CONTROLLER */
static int veth_get_iflink(const struct net_device *dev)
{
struct veth_priv *priv = netdev_priv(dev);
struct net_device *peer;
int iflink;
rcu_read_lock();
peer = rcu_dereference(priv->peer);
iflink = peer ? READ_ONCE(peer->ifindex) : 0;
rcu_read_unlock();
return iflink;
}
static netdev_features_t veth_fix_features(struct net_device *dev,
netdev_features_t features)
{
struct veth_priv *priv = netdev_priv(dev);
struct net_device *peer;
peer = rtnl_dereference(priv->peer);
if (peer) {
struct veth_priv *peer_priv = netdev_priv(peer);
if (peer_priv->_xdp_prog)
features &= ~NETIF_F_GSO_SOFTWARE;
}
return features;
}
static int veth_set_features(struct net_device *dev,
netdev_features_t features)
{
netdev_features_t changed = features ^ dev->features;
struct veth_priv *priv = netdev_priv(dev);
struct net_device *peer;
int err;
if (!(changed & NETIF_F_GRO) || !(dev->flags & IFF_UP) || priv->_xdp_prog)
return 0;
peer = rtnl_dereference(priv->peer);
if (features & NETIF_F_GRO) {
err = veth_napi_enable(dev);
if (err)
return err;
if (peer)
xdp_features_set_redirect_target(peer, true);
} else {
if (peer)
xdp_features_clear_redirect_target(peer);
veth_napi_del(dev);
}
return 0;
}
static void veth_set_rx_headroom(struct net_device *dev, int new_hr)
{
struct veth_priv *peer_priv, *priv = netdev_priv(dev);
struct net_device *peer;
if (new_hr < 0)
new_hr = 0;
rcu_read_lock();
peer = rcu_dereference(priv->peer);
if (unlikely(!peer))
goto out;
peer_priv = netdev_priv(peer);
priv->requested_headroom = new_hr;
new_hr = max(priv->requested_headroom, peer_priv->requested_headroom);
dev->needed_headroom = new_hr;
peer->needed_headroom = new_hr;
out:
rcu_read_unlock();
}
static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
struct netlink_ext_ack *extack)
{
struct veth_priv *priv = netdev_priv(dev);
struct bpf_prog *old_prog;
struct net_device *peer;
unsigned int max_mtu;
int err;
old_prog = priv->_xdp_prog;
priv->_xdp_prog = prog;
peer = rtnl_dereference(priv->peer);
if (prog) {
if (!peer) {
NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached");
err = -ENOTCONN;
goto err;
}
max_mtu = SKB_WITH_OVERHEAD(PAGE_SIZE - VETH_XDP_HEADROOM) -
peer->hard_header_len;
/* Allow increasing the max_mtu if the program supports
* XDP fragments.
*/
if (prog->aux->xdp_has_frags)
max_mtu += PAGE_SIZE * MAX_SKB_FRAGS;
if (peer->mtu > max_mtu) {
NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP");
err = -ERANGE;
goto err;
}
if (dev->real_num_rx_queues < peer->real_num_tx_queues) {
NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues");
err = -ENOSPC;
goto err;
}
if (dev->flags & IFF_UP) {
err = veth_enable_xdp(dev);
if (err) {
NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed");
goto err;
}
}
if (!old_prog) {
peer->hw_features &= ~NETIF_F_GSO_SOFTWARE;
peer->max_mtu = max_mtu;
}
xdp_features_set_redirect_target(peer, true);
}
if (old_prog) {
if (!prog) {
if (peer && !veth_gro_requested(dev))
xdp_features_clear_redirect_target(peer);
if (dev->flags & IFF_UP)
veth_disable_xdp(dev);
if (peer) {
peer->hw_features |= NETIF_F_GSO_SOFTWARE;
peer->max_mtu = ETH_MAX_MTU;
}
}
bpf_prog_put(old_prog);
}
if ((!!old_prog ^ !!prog) && peer)
netdev_update_features(peer);
return 0;
err:
priv->_xdp_prog = old_prog;
return err;
}
static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
{
switch (xdp->command) {
case XDP_SETUP_PROG:
return veth_xdp_set(dev, xdp->prog, xdp->extack);
default:
return -EINVAL;
}
}
static int veth_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp)
{
struct veth_xdp_buff *_ctx = (void *)ctx;
if (!_ctx->skb)
return -ENODATA;
*timestamp = skb_hwtstamps(_ctx->skb)->hwtstamp;
return 0;
}
xdp: rss hash types representation The RSS hash type specifies what portion of packet data NIC hardware used when calculating RSS hash value. The RSS types are focused on Internet traffic protocols at OSI layers L3 and L4. L2 (e.g. ARP) often get hash value zero and no RSS type. For L3 focused on IPv4 vs. IPv6, and L4 primarily TCP vs UDP, but some hardware supports SCTP. Hardware RSS types are differently encoded for each hardware NIC. Most hardware represent RSS hash type as a number. Determining L3 vs L4 often requires a mapping table as there often isn't a pattern or sorting according to ISO layer. The patch introduce a XDP RSS hash type (enum xdp_rss_hash_type) that contains both BITs for the L3/L4 types, and combinations to be used by drivers for their mapping tables. The enum xdp_rss_type_bits get exposed to BPF via BTF, and it is up to the BPF-programmer to match using these defines. This proposal change the kfunc API bpf_xdp_metadata_rx_hash() adding a pointer value argument for provide the RSS hash type. Change signature for all xmo_rx_hash calls in drivers to make it compile. The RSS type implementations for each driver comes as separate patches. Fixes: 3d76a4d3d4e5 ("bpf: XDP metadata RX kfuncs") Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com> Acked-by: Toke Høiland-Jørgensen <toke@redhat.com> Acked-by: Stanislav Fomichev <sdf@google.com> Link: https://lore.kernel.org/r/168132892042.340624.582563003880565460.stgit@firesoul Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2023-04-12 19:48:40 +00:00
static int veth_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash,
enum xdp_rss_hash_type *rss_type)
{
struct veth_xdp_buff *_ctx = (void *)ctx;
struct sk_buff *skb = _ctx->skb;
if (!skb)
return -ENODATA;
*hash = skb_get_hash(skb);
*rss_type = skb->l4_hash ? XDP_RSS_TYPE_L4_ANY : XDP_RSS_TYPE_NONE;
return 0;
}
static int veth_xdp_rx_vlan_tag(const struct xdp_md *ctx, __be16 *vlan_proto,
u16 *vlan_tci)
{
const struct veth_xdp_buff *_ctx = (void *)ctx;
const struct sk_buff *skb = _ctx->skb;
int err;
if (!skb)
return -ENODATA;
err = __vlan_hwaccel_get_tag(skb, vlan_tci);
if (err)
return err;
*vlan_proto = skb->vlan_proto;
return err;
}
static const struct net_device_ops veth_netdev_ops = {
.ndo_init = veth_dev_init,
.ndo_open = veth_open,
.ndo_stop = veth_close,
.ndo_start_xmit = veth_xmit,
.ndo_get_stats64 = veth_get_stats64,
.ndo_set_rx_mode = veth_set_multicast_list,
.ndo_set_mac_address = eth_mac_addr,
#ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_poll_controller = veth_poll_controller,
#endif
.ndo_get_iflink = veth_get_iflink,
.ndo_fix_features = veth_fix_features,
.ndo_set_features = veth_set_features,
.ndo_features_check = passthru_features_check,
.ndo_set_rx_headroom = veth_set_rx_headroom,
.ndo_bpf = veth_xdp,
.ndo_xdp_xmit = veth_ndo_xdp_xmit,
bpf: Add redirect_peer helper Add an efficient ingress to ingress netns switch that can be used out of tc BPF programs in order to redirect traffic from host ns ingress into a container veth device ingress without having to go via CPU backlog queue [0]. For local containers this can also be utilized and path via CPU backlog queue only needs to be taken once, not twice. On a high level this borrows from ipvlan which does similar switch in __netif_receive_skb_core() and then iterates via another_round. This helps to reduce latency for mentioned use cases. Pod to remote pod with redirect(), TCP_RR [1]: # percpu_netperf 10.217.1.33 RT_LATENCY: 122.450 (per CPU: 122.666 122.401 122.333 122.401 ) MEAN_LATENCY: 121.210 (per CPU: 121.100 121.260 121.320 121.160 ) STDDEV_LATENCY: 120.040 (per CPU: 119.420 119.910 125.460 115.370 ) MIN_LATENCY: 46.500 (per CPU: 47.000 47.000 47.000 45.000 ) P50_LATENCY: 118.500 (per CPU: 118.000 119.000 118.000 119.000 ) P90_LATENCY: 127.500 (per CPU: 127.000 128.000 127.000 128.000 ) P99_LATENCY: 130.750 (per CPU: 131.000 131.000 129.000 132.000 ) TRANSACTION_RATE: 32666.400 (per CPU: 8152.200 8169.842 8174.439 8169.897 ) Pod to remote pod with redirect_peer(), TCP_RR: # percpu_netperf 10.217.1.33 RT_LATENCY: 44.449 (per CPU: 43.767 43.127 45.279 45.622 ) MEAN_LATENCY: 45.065 (per CPU: 44.030 45.530 45.190 45.510 ) STDDEV_LATENCY: 84.823 (per CPU: 66.770 97.290 84.380 90.850 ) MIN_LATENCY: 33.500 (per CPU: 33.000 33.000 34.000 34.000 ) P50_LATENCY: 43.250 (per CPU: 43.000 43.000 43.000 44.000 ) P90_LATENCY: 46.750 (per CPU: 46.000 47.000 47.000 47.000 ) P99_LATENCY: 52.750 (per CPU: 51.000 54.000 53.000 53.000 ) TRANSACTION_RATE: 90039.500 (per CPU: 22848.186 23187.089 22085.077 21919.130 ) [0] https://linuxplumbersconf.org/event/7/contributions/674/attachments/568/1002/plumbers_2020_cilium_load_balancer.pdf [1] https://github.com/borkmann/netperf_scripts/blob/master/percpu_netperf Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Signed-off-by: Alexei Starovoitov <ast@kernel.org> Link: https://lore.kernel.org/bpf/20201010234006.7075-3-daniel@iogearbox.net
2020-10-10 23:40:02 +00:00
.ndo_get_peer_dev = veth_peer_dev,
};
static const struct xdp_metadata_ops veth_xdp_metadata_ops = {
.xmo_rx_timestamp = veth_xdp_rx_timestamp,
.xmo_rx_hash = veth_xdp_rx_hash,
.xmo_rx_vlan_tag = veth_xdp_rx_vlan_tag,
};
#define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \
NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \
NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \
NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX )
static void veth_setup(struct net_device *dev)
{
ether_setup(dev);
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
dev->priv_flags |= IFF_NO_QUEUE;
dev->priv_flags |= IFF_PHONY_HEADROOM;
dev->netdev_ops = &veth_netdev_ops;
dev->xdp_metadata_ops = &veth_xdp_metadata_ops;
dev->ethtool_ops = &veth_ethtool_ops;
dev->features |= NETIF_F_LLTX;
dev->features |= VETH_FEATURES;
dev->vlan_features = dev->features &
~(NETIF_F_HW_VLAN_CTAG_TX |
NETIF_F_HW_VLAN_STAG_TX |
NETIF_F_HW_VLAN_CTAG_RX |
NETIF_F_HW_VLAN_STAG_RX);
net: Fix inconsistent teardown and release of private netdev state. Network devices can allocate reasources and private memory using netdev_ops->ndo_init(). However, the release of these resources can occur in one of two different places. Either netdev_ops->ndo_uninit() or netdev->destructor(). The decision of which operation frees the resources depends upon whether it is necessary for all netdev refs to be released before it is safe to perform the freeing. netdev_ops->ndo_uninit() presumably can occur right after the NETDEV_UNREGISTER notifier completes and the unicast and multicast address lists are flushed. netdev->destructor(), on the other hand, does not run until the netdev references all go away. Further complicating the situation is that netdev->destructor() almost universally does also a free_netdev(). This creates a problem for the logic in register_netdevice(). Because all callers of register_netdevice() manage the freeing of the netdev, and invoke free_netdev(dev) if register_netdevice() fails. If netdev_ops->ndo_init() succeeds, but something else fails inside of register_netdevice(), it does call ndo_ops->ndo_uninit(). But it is not able to invoke netdev->destructor(). This is because netdev->destructor() will do a free_netdev() and then the caller of register_netdevice() will do the same. However, this means that the resources that would normally be released by netdev->destructor() will not be. Over the years drivers have added local hacks to deal with this, by invoking their destructor parts by hand when register_netdevice() fails. Many drivers do not try to deal with this, and instead we have leaks. Let's close this hole by formalizing the distinction between what private things need to be freed up by netdev->destructor() and whether the driver needs unregister_netdevice() to perform the free_netdev(). netdev->priv_destructor() performs all actions to free up the private resources that used to be freed by netdev->destructor(), except for free_netdev(). netdev->needs_free_netdev is a boolean that indicates whether free_netdev() should be done at the end of unregister_netdevice(). Now, register_netdevice() can sanely release all resources after ndo_ops->ndo_init() succeeds, by invoking both ndo_ops->ndo_uninit() and netdev->priv_destructor(). And at the end of unregister_netdevice(), we invoke netdev->priv_destructor() and optionally call free_netdev(). Signed-off-by: David S. Miller <davem@davemloft.net>
2017-05-08 16:52:56 +00:00
dev->needs_free_netdev = true;
dev->priv_destructor = veth_dev_free;
2023-11-14 00:42:16 +00:00
dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
net: use core MTU range checking in core net infra geneve: - Merge __geneve_change_mtu back into geneve_change_mtu, set max_mtu - This one isn't quite as straight-forward as others, could use some closer inspection and testing macvlan: - set min/max_mtu tun: - set min/max_mtu, remove tun_net_change_mtu vxlan: - Merge __vxlan_change_mtu back into vxlan_change_mtu - Set max_mtu to IP_MAX_MTU and retain dynamic MTU range checks in change_mtu function - This one is also not as straight-forward and could use closer inspection and testing from vxlan folks bridge: - set max_mtu of IP_MAX_MTU and retain dynamic MTU range checks in change_mtu function openvswitch: - set min/max_mtu, remove internal_dev_change_mtu - note: max_mtu wasn't checked previously, it's been set to 65535, which is the largest possible size supported sch_teql: - set min/max_mtu (note: max_mtu previously unchecked, used max of 65535) macsec: - min_mtu = 0, max_mtu = 65535 macvlan: - min_mtu = 0, max_mtu = 65535 ntb_netdev: - min_mtu = 0, max_mtu = 65535 veth: - min_mtu = 68, max_mtu = 65535 8021q: - min_mtu = 0, max_mtu = 65535 CC: netdev@vger.kernel.org CC: Nicolas Dichtel <nicolas.dichtel@6wind.com> CC: Hannes Frederic Sowa <hannes@stressinduktion.org> CC: Tom Herbert <tom@herbertland.com> CC: Daniel Borkmann <daniel@iogearbox.net> CC: Alexander Duyck <alexander.h.duyck@intel.com> CC: Paolo Abeni <pabeni@redhat.com> CC: Jiri Benc <jbenc@redhat.com> CC: WANG Cong <xiyou.wangcong@gmail.com> CC: Roopa Prabhu <roopa@cumulusnetworks.com> CC: Pravin B Shelar <pshelar@ovn.org> CC: Sabrina Dubroca <sd@queasysnail.net> CC: Patrick McHardy <kaber@trash.net> CC: Stephen Hemminger <stephen@networkplumber.org> CC: Pravin Shelar <pshelar@nicira.com> CC: Maxim Krasnyansky <maxk@qti.qualcomm.com> Signed-off-by: Jarod Wilson <jarod@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-10-20 17:55:20 +00:00
dev->max_mtu = ETH_MAX_MTU;
dev->hw_features = VETH_FEATURES;
dev->hw_enc_features = VETH_FEATURES;
dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;
netif_set_tso_max_size(dev, GSO_MAX_SIZE);
}
/*
* netlink interface
*/
static int veth_validate(struct nlattr *tb[], struct nlattr *data[],
struct netlink_ext_ack *extack)
{
if (tb[IFLA_ADDRESS]) {
if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
return -EINVAL;
if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
return -EADDRNOTAVAIL;
}
if (tb[IFLA_MTU]) {
if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU])))
return -EINVAL;
}
return 0;
}
static struct rtnl_link_ops veth_link_ops;
static void veth_disable_gro(struct net_device *dev)
{
dev->features &= ~NETIF_F_GRO;
dev->wanted_features &= ~NETIF_F_GRO;
netdev_update_features(dev);
}
static int veth_init_queues(struct net_device *dev, struct nlattr *tb[])
{
int err;
if (!tb[IFLA_NUM_TX_QUEUES] && dev->num_tx_queues > 1) {
err = netif_set_real_num_tx_queues(dev, 1);
if (err)
return err;
}
if (!tb[IFLA_NUM_RX_QUEUES] && dev->num_rx_queues > 1) {
err = netif_set_real_num_rx_queues(dev, 1);
if (err)
return err;
}
return 0;
}
static int veth_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[],
struct netlink_ext_ack *extack)
{
veth: Free queues on link delete David Ahern reported memory leak in veth. ======================================================================= $ cat /sys/kernel/debug/kmemleak unreferenced object 0xffff8800354d5c00 (size 1024): comm "ip", pid 836, jiffies 4294722952 (age 25.904s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<(____ptrval____)>] kmemleak_alloc+0x70/0x94 [<(____ptrval____)>] slab_post_alloc_hook+0x42/0x52 [<(____ptrval____)>] __kmalloc+0x101/0x142 [<(____ptrval____)>] kmalloc_array.constprop.20+0x1e/0x26 [veth] [<(____ptrval____)>] veth_newlink+0x147/0x3ac [veth] ... unreferenced object 0xffff88002e009c00 (size 1024): comm "ip", pid 836, jiffies 4294722958 (age 25.898s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<(____ptrval____)>] kmemleak_alloc+0x70/0x94 [<(____ptrval____)>] slab_post_alloc_hook+0x42/0x52 [<(____ptrval____)>] __kmalloc+0x101/0x142 [<(____ptrval____)>] kmalloc_array.constprop.20+0x1e/0x26 [veth] [<(____ptrval____)>] veth_newlink+0x219/0x3ac [veth] ======================================================================= veth_rq allocated in veth_newlink() was not freed on dellink. We need to free up them after veth_close() so that any packets will not reference the queues afterwards. Thus free them in veth_dev_free() in the same way as freeing stats structure (vstats). Also move queues allocation to veth_dev_init() to be in line with stats allocation. Fixes: 638264dc90227 ("veth: Support per queue XDP ring") Reported-by: David Ahern <dsahern@gmail.com> Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> Reviewed-by: David Ahern <dsahern@gmail.com> Tested-by: David Ahern <dsahern@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-08-15 08:07:29 +00:00
int err;
struct net_device *peer;
struct veth_priv *priv;
char ifname[IFNAMSIZ];
struct nlattr *peer_tb[IFLA_MAX + 1], **tbp;
unsigned char name_assign_type;
rtnetlink: support specifying device flags on device creation commit e8469ed959c373c2ff9e6f488aa5a14971aebe1f Author: Patrick McHardy <kaber@trash.net> Date: Tue Feb 23 20:41:30 2010 +0100 Support specifying the initial device flags when creating a device though rtnl_link. Devices allocated by rtnl_create_link() are marked as INITIALIZING in order to surpress netlink registration notifications. To complete setup, rtnl_configure_link() must be called, which performs the device flag changes and invokes the deferred notifiers if everything went well. Two examples: # add macvlan to eth0 # $ ip link add link eth0 up allmulticast on type macvlan [LINK]11: macvlan0@eth0: <BROADCAST,MULTICAST,ALLMULTI,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN link/ether 26:f8:84:02:f9:2a brd ff:ff:ff:ff:ff:ff [ROUTE]ff00::/8 dev macvlan0 table local metric 256 mtu 1500 advmss 1440 hoplimit 0 [ROUTE]fe80::/64 dev macvlan0 proto kernel metric 256 mtu 1500 advmss 1440 hoplimit 0 [LINK]11: macvlan0@eth0: <BROADCAST,MULTICAST,ALLMULTI,UP,LOWER_UP> mtu 1500 link/ether 26:f8:84:02:f9:2a [ADDR]11: macvlan0 inet6 fe80::24f8:84ff:fe02:f92a/64 scope link valid_lft forever preferred_lft forever [ROUTE]local fe80::24f8:84ff:fe02:f92a via :: dev lo table local proto none metric 0 mtu 16436 advmss 16376 hoplimit 0 [ROUTE]default via fe80::215:e9ff:fef0:10f8 dev macvlan0 proto kernel metric 1024 mtu 1500 advmss 1440 hoplimit 0 [NEIGH]fe80::215:e9ff:fef0:10f8 dev macvlan0 lladdr 00:15:e9:f0:10:f8 router STALE [ROUTE]2001:6f8:974::/64 dev macvlan0 proto kernel metric 256 expires 0sec mtu 1500 advmss 1440 hoplimit 0 [PREFIX]prefix 2001:6f8:974::/64 dev macvlan0 onlink autoconf valid 14400 preferred 131084 [ADDR]11: macvlan0 inet6 2001:6f8:974:0:24f8:84ff:fe02:f92a/64 scope global dynamic valid_lft 86399sec preferred_lft 14399sec # add VLAN to eth1, eth1 is down # $ ip link add link eth1 up type vlan id 1000 RTNETLINK answers: Network is down <no events> Signed-off-by: Patrick McHardy <kaber@trash.net> Signed-off-by: David S. Miller <davem@davemloft.net>
2010-02-26 06:34:54 +00:00
struct ifinfomsg *ifmp;
struct net *net;
/*
* create and register peer first
*/
if (data != NULL && data[VETH_INFO_PEER] != NULL) {
struct nlattr *nla_peer;
nla_peer = data[VETH_INFO_PEER];
rtnetlink: support specifying device flags on device creation commit e8469ed959c373c2ff9e6f488aa5a14971aebe1f Author: Patrick McHardy <kaber@trash.net> Date: Tue Feb 23 20:41:30 2010 +0100 Support specifying the initial device flags when creating a device though rtnl_link. Devices allocated by rtnl_create_link() are marked as INITIALIZING in order to surpress netlink registration notifications. To complete setup, rtnl_configure_link() must be called, which performs the device flag changes and invokes the deferred notifiers if everything went well. Two examples: # add macvlan to eth0 # $ ip link add link eth0 up allmulticast on type macvlan [LINK]11: macvlan0@eth0: <BROADCAST,MULTICAST,ALLMULTI,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN link/ether 26:f8:84:02:f9:2a brd ff:ff:ff:ff:ff:ff [ROUTE]ff00::/8 dev macvlan0 table local metric 256 mtu 1500 advmss 1440 hoplimit 0 [ROUTE]fe80::/64 dev macvlan0 proto kernel metric 256 mtu 1500 advmss 1440 hoplimit 0 [LINK]11: macvlan0@eth0: <BROADCAST,MULTICAST,ALLMULTI,UP,LOWER_UP> mtu 1500 link/ether 26:f8:84:02:f9:2a [ADDR]11: macvlan0 inet6 fe80::24f8:84ff:fe02:f92a/64 scope link valid_lft forever preferred_lft forever [ROUTE]local fe80::24f8:84ff:fe02:f92a via :: dev lo table local proto none metric 0 mtu 16436 advmss 16376 hoplimit 0 [ROUTE]default via fe80::215:e9ff:fef0:10f8 dev macvlan0 proto kernel metric 1024 mtu 1500 advmss 1440 hoplimit 0 [NEIGH]fe80::215:e9ff:fef0:10f8 dev macvlan0 lladdr 00:15:e9:f0:10:f8 router STALE [ROUTE]2001:6f8:974::/64 dev macvlan0 proto kernel metric 256 expires 0sec mtu 1500 advmss 1440 hoplimit 0 [PREFIX]prefix 2001:6f8:974::/64 dev macvlan0 onlink autoconf valid 14400 preferred 131084 [ADDR]11: macvlan0 inet6 2001:6f8:974:0:24f8:84ff:fe02:f92a/64 scope global dynamic valid_lft 86399sec preferred_lft 14399sec # add VLAN to eth1, eth1 is down # $ ip link add link eth1 up type vlan id 1000 RTNETLINK answers: Network is down <no events> Signed-off-by: Patrick McHardy <kaber@trash.net> Signed-off-by: David S. Miller <davem@davemloft.net>
2010-02-26 06:34:54 +00:00
ifmp = nla_data(nla_peer);
2023-08-19 01:26:02 +00:00
err = rtnl_nla_parse_ifinfomsg(peer_tb, nla_peer, extack);
if (err < 0)
return err;
err = veth_validate(peer_tb, NULL, extack);
if (err < 0)
return err;
tbp = peer_tb;
rtnetlink: support specifying device flags on device creation commit e8469ed959c373c2ff9e6f488aa5a14971aebe1f Author: Patrick McHardy <kaber@trash.net> Date: Tue Feb 23 20:41:30 2010 +0100 Support specifying the initial device flags when creating a device though rtnl_link. Devices allocated by rtnl_create_link() are marked as INITIALIZING in order to surpress netlink registration notifications. To complete setup, rtnl_configure_link() must be called, which performs the device flag changes and invokes the deferred notifiers if everything went well. Two examples: # add macvlan to eth0 # $ ip link add link eth0 up allmulticast on type macvlan [LINK]11: macvlan0@eth0: <BROADCAST,MULTICAST,ALLMULTI,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN link/ether 26:f8:84:02:f9:2a brd ff:ff:ff:ff:ff:ff [ROUTE]ff00::/8 dev macvlan0 table local metric 256 mtu 1500 advmss 1440 hoplimit 0 [ROUTE]fe80::/64 dev macvlan0 proto kernel metric 256 mtu 1500 advmss 1440 hoplimit 0 [LINK]11: macvlan0@eth0: <BROADCAST,MULTICAST,ALLMULTI,UP,LOWER_UP> mtu 1500 link/ether 26:f8:84:02:f9:2a [ADDR]11: macvlan0 inet6 fe80::24f8:84ff:fe02:f92a/64 scope link valid_lft forever preferred_lft forever [ROUTE]local fe80::24f8:84ff:fe02:f92a via :: dev lo table local proto none metric 0 mtu 16436 advmss 16376 hoplimit 0 [ROUTE]default via fe80::215:e9ff:fef0:10f8 dev macvlan0 proto kernel metric 1024 mtu 1500 advmss 1440 hoplimit 0 [NEIGH]fe80::215:e9ff:fef0:10f8 dev macvlan0 lladdr 00:15:e9:f0:10:f8 router STALE [ROUTE]2001:6f8:974::/64 dev macvlan0 proto kernel metric 256 expires 0sec mtu 1500 advmss 1440 hoplimit 0 [PREFIX]prefix 2001:6f8:974::/64 dev macvlan0 onlink autoconf valid 14400 preferred 131084 [ADDR]11: macvlan0 inet6 2001:6f8:974:0:24f8:84ff:fe02:f92a/64 scope global dynamic valid_lft 86399sec preferred_lft 14399sec # add VLAN to eth1, eth1 is down # $ ip link add link eth1 up type vlan id 1000 RTNETLINK answers: Network is down <no events> Signed-off-by: Patrick McHardy <kaber@trash.net> Signed-off-by: David S. Miller <davem@davemloft.net>
2010-02-26 06:34:54 +00:00
} else {
ifmp = NULL;
tbp = tb;
rtnetlink: support specifying device flags on device creation commit e8469ed959c373c2ff9e6f488aa5a14971aebe1f Author: Patrick McHardy <kaber@trash.net> Date: Tue Feb 23 20:41:30 2010 +0100 Support specifying the initial device flags when creating a device though rtnl_link. Devices allocated by rtnl_create_link() are marked as INITIALIZING in order to surpress netlink registration notifications. To complete setup, rtnl_configure_link() must be called, which performs the device flag changes and invokes the deferred notifiers if everything went well. Two examples: # add macvlan to eth0 # $ ip link add link eth0 up allmulticast on type macvlan [LINK]11: macvlan0@eth0: <BROADCAST,MULTICAST,ALLMULTI,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN link/ether 26:f8:84:02:f9:2a brd ff:ff:ff:ff:ff:ff [ROUTE]ff00::/8 dev macvlan0 table local metric 256 mtu 1500 advmss 1440 hoplimit 0 [ROUTE]fe80::/64 dev macvlan0 proto kernel metric 256 mtu 1500 advmss 1440 hoplimit 0 [LINK]11: macvlan0@eth0: <BROADCAST,MULTICAST,ALLMULTI,UP,LOWER_UP> mtu 1500 link/ether 26:f8:84:02:f9:2a [ADDR]11: macvlan0 inet6 fe80::24f8:84ff:fe02:f92a/64 scope link valid_lft forever preferred_lft forever [ROUTE]local fe80::24f8:84ff:fe02:f92a via :: dev lo table local proto none metric 0 mtu 16436 advmss 16376 hoplimit 0 [ROUTE]default via fe80::215:e9ff:fef0:10f8 dev macvlan0 proto kernel metric 1024 mtu 1500 advmss 1440 hoplimit 0 [NEIGH]fe80::215:e9ff:fef0:10f8 dev macvlan0 lladdr 00:15:e9:f0:10:f8 router STALE [ROUTE]2001:6f8:974::/64 dev macvlan0 proto kernel metric 256 expires 0sec mtu 1500 advmss 1440 hoplimit 0 [PREFIX]prefix 2001:6f8:974::/64 dev macvlan0 onlink autoconf valid 14400 preferred 131084 [ADDR]11: macvlan0 inet6 2001:6f8:974:0:24f8:84ff:fe02:f92a/64 scope global dynamic valid_lft 86399sec preferred_lft 14399sec # add VLAN to eth1, eth1 is down # $ ip link add link eth1 up type vlan id 1000 RTNETLINK answers: Network is down <no events> Signed-off-by: Patrick McHardy <kaber@trash.net> Signed-off-by: David S. Miller <davem@davemloft.net>
2010-02-26 06:34:54 +00:00
}
if (ifmp && tbp[IFLA_IFNAME]) {
nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ);
name_assign_type = NET_NAME_USER;
} else {
snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d");
name_assign_type = NET_NAME_ENUM;
}
net = rtnl_link_get_net(src_net, tbp);
if (IS_ERR(net))
return PTR_ERR(net);
peer = rtnl_create_link(net, ifname, name_assign_type,
&veth_link_ops, tbp, extack);
if (IS_ERR(peer)) {
put_net(net);
return PTR_ERR(peer);
}
if (!ifmp || !tbp[IFLA_ADDRESS])
eth_hw_addr_random(peer);
if (ifmp && (dev->ifindex != 0))
peer->ifindex = ifmp->ifi_index;
netif_inherit_tso_max(peer, dev);
err = register_netdevice(peer);
put_net(net);
net = NULL;
if (err < 0)
goto err_register_peer;
/* keep GRO disabled by default to be consistent with the established
* veth behavior
*/
veth_disable_gro(peer);
netif_carrier_off(peer);
err = rtnl_configure_link(peer, ifmp, 0, NULL);
rtnetlink: support specifying device flags on device creation commit e8469ed959c373c2ff9e6f488aa5a14971aebe1f Author: Patrick McHardy <kaber@trash.net> Date: Tue Feb 23 20:41:30 2010 +0100 Support specifying the initial device flags when creating a device though rtnl_link. Devices allocated by rtnl_create_link() are marked as INITIALIZING in order to surpress netlink registration notifications. To complete setup, rtnl_configure_link() must be called, which performs the device flag changes and invokes the deferred notifiers if everything went well. Two examples: # add macvlan to eth0 # $ ip link add link eth0 up allmulticast on type macvlan [LINK]11: macvlan0@eth0: <BROADCAST,MULTICAST,ALLMULTI,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN link/ether 26:f8:84:02:f9:2a brd ff:ff:ff:ff:ff:ff [ROUTE]ff00::/8 dev macvlan0 table local metric 256 mtu 1500 advmss 1440 hoplimit 0 [ROUTE]fe80::/64 dev macvlan0 proto kernel metric 256 mtu 1500 advmss 1440 hoplimit 0 [LINK]11: macvlan0@eth0: <BROADCAST,MULTICAST,ALLMULTI,UP,LOWER_UP> mtu 1500 link/ether 26:f8:84:02:f9:2a [ADDR]11: macvlan0 inet6 fe80::24f8:84ff:fe02:f92a/64 scope link valid_lft forever preferred_lft forever [ROUTE]local fe80::24f8:84ff:fe02:f92a via :: dev lo table local proto none metric 0 mtu 16436 advmss 16376 hoplimit 0 [ROUTE]default via fe80::215:e9ff:fef0:10f8 dev macvlan0 proto kernel metric 1024 mtu 1500 advmss 1440 hoplimit 0 [NEIGH]fe80::215:e9ff:fef0:10f8 dev macvlan0 lladdr 00:15:e9:f0:10:f8 router STALE [ROUTE]2001:6f8:974::/64 dev macvlan0 proto kernel metric 256 expires 0sec mtu 1500 advmss 1440 hoplimit 0 [PREFIX]prefix 2001:6f8:974::/64 dev macvlan0 onlink autoconf valid 14400 preferred 131084 [ADDR]11: macvlan0 inet6 2001:6f8:974:0:24f8:84ff:fe02:f92a/64 scope global dynamic valid_lft 86399sec preferred_lft 14399sec # add VLAN to eth1, eth1 is down # $ ip link add link eth1 up type vlan id 1000 RTNETLINK answers: Network is down <no events> Signed-off-by: Patrick McHardy <kaber@trash.net> Signed-off-by: David S. Miller <davem@davemloft.net>
2010-02-26 06:34:54 +00:00
if (err < 0)
goto err_configure_peer;
/*
* register dev last
*
* note, that since we've registered new device the dev's name
* should be re-allocated
*/
if (tb[IFLA_ADDRESS] == NULL)
eth_hw_addr_random(dev);
if (tb[IFLA_IFNAME])
nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ);
else
snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d");
err = register_netdevice(dev);
if (err < 0)
goto err_register_dev;
netif_carrier_off(dev);
/*
* tie the deviced together
*/
priv = netdev_priv(dev);
veth: avoid a NULL deref in veth_stats_one commit 2681128f0ced8a (veth: extend device features) added a NULL deref in veth_stats_one(), as veth_get_stats64() was not testing if the peer device was setup or not. At init time, we call dev_get_stats() before veth pair is fully setup. [ 178.854758] [<ffffffffa00f5677>] veth_get_stats64+0x47/0x70 [veth] [ 178.861013] [<ffffffff814f0a2d>] dev_get_stats+0x6d/0x130 [ 178.866486] [<ffffffff81504efc>] rtnl_fill_ifinfo+0x47c/0x930 [ 178.872299] [<ffffffff81505b93>] rtmsg_ifinfo+0x83/0x100 [ 178.877678] [<ffffffff81505cc6>] rtnl_configure_link+0x76/0xa0 [ 178.883580] [<ffffffffa00f52fa>] veth_newlink+0x16a/0x350 [veth] [ 178.889654] [<ffffffff815061cc>] rtnl_newlink+0x4dc/0x5e0 [ 178.895128] [<ffffffff81505e1e>] ? rtnl_newlink+0x12e/0x5e0 [ 178.900769] [<ffffffff8150587d>] rtnetlink_rcv_msg+0x11d/0x310 [ 178.906669] [<ffffffff81505760>] ? __rtnl_unlock+0x20/0x20 [ 178.912225] [<ffffffff81521f89>] netlink_rcv_skb+0xa9/0xd0 [ 178.917779] [<ffffffff81502d55>] rtnetlink_rcv+0x25/0x40 [ 178.923159] [<ffffffff815218d1>] netlink_unicast+0x1b1/0x230 [ 178.928887] [<ffffffff81521c4e>] netlink_sendmsg+0x2fe/0x3b0 [ 178.934615] [<ffffffff814dbe22>] sock_sendmsg+0xd2/0xf0 So we must check if peer was setup in veth_get_stats64() As pointed out by Ben Hutchings, priv->peer is missing proper synchronization. Adding RCU protection is a safe and well documented way to make sure we don't access about to be freed or already freed data. Reported-by: Tom Parkin <tparkin@katalix.com> Signed-off-by: Eric Dumazet <edumazet@google.com> CC: Ben Hutchings <bhutchings@solarflare.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-01-04 15:42:40 +00:00
rcu_assign_pointer(priv->peer, peer);
err = veth_init_queues(dev, tb);
if (err)
goto err_queues;
priv = netdev_priv(peer);
veth: avoid a NULL deref in veth_stats_one commit 2681128f0ced8a (veth: extend device features) added a NULL deref in veth_stats_one(), as veth_get_stats64() was not testing if the peer device was setup or not. At init time, we call dev_get_stats() before veth pair is fully setup. [ 178.854758] [<ffffffffa00f5677>] veth_get_stats64+0x47/0x70 [veth] [ 178.861013] [<ffffffff814f0a2d>] dev_get_stats+0x6d/0x130 [ 178.866486] [<ffffffff81504efc>] rtnl_fill_ifinfo+0x47c/0x930 [ 178.872299] [<ffffffff81505b93>] rtmsg_ifinfo+0x83/0x100 [ 178.877678] [<ffffffff81505cc6>] rtnl_configure_link+0x76/0xa0 [ 178.883580] [<ffffffffa00f52fa>] veth_newlink+0x16a/0x350 [veth] [ 178.889654] [<ffffffff815061cc>] rtnl_newlink+0x4dc/0x5e0 [ 178.895128] [<ffffffff81505e1e>] ? rtnl_newlink+0x12e/0x5e0 [ 178.900769] [<ffffffff8150587d>] rtnetlink_rcv_msg+0x11d/0x310 [ 178.906669] [<ffffffff81505760>] ? __rtnl_unlock+0x20/0x20 [ 178.912225] [<ffffffff81521f89>] netlink_rcv_skb+0xa9/0xd0 [ 178.917779] [<ffffffff81502d55>] rtnetlink_rcv+0x25/0x40 [ 178.923159] [<ffffffff815218d1>] netlink_unicast+0x1b1/0x230 [ 178.928887] [<ffffffff81521c4e>] netlink_sendmsg+0x2fe/0x3b0 [ 178.934615] [<ffffffff814dbe22>] sock_sendmsg+0xd2/0xf0 So we must check if peer was setup in veth_get_stats64() As pointed out by Ben Hutchings, priv->peer is missing proper synchronization. Adding RCU protection is a safe and well documented way to make sure we don't access about to be freed or already freed data. Reported-by: Tom Parkin <tparkin@katalix.com> Signed-off-by: Eric Dumazet <edumazet@google.com> CC: Ben Hutchings <bhutchings@solarflare.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-01-04 15:42:40 +00:00
rcu_assign_pointer(priv->peer, dev);
err = veth_init_queues(peer, tb);
if (err)
goto err_queues;
veth_disable_gro(dev);
/* update XDP supported features */
veth_set_xdp_features(dev);
veth_set_xdp_features(peer);
return 0;
err_queues:
unregister_netdevice(dev);
err_register_dev:
/* nothing to do */
rtnetlink: support specifying device flags on device creation commit e8469ed959c373c2ff9e6f488aa5a14971aebe1f Author: Patrick McHardy <kaber@trash.net> Date: Tue Feb 23 20:41:30 2010 +0100 Support specifying the initial device flags when creating a device though rtnl_link. Devices allocated by rtnl_create_link() are marked as INITIALIZING in order to surpress netlink registration notifications. To complete setup, rtnl_configure_link() must be called, which performs the device flag changes and invokes the deferred notifiers if everything went well. Two examples: # add macvlan to eth0 # $ ip link add link eth0 up allmulticast on type macvlan [LINK]11: macvlan0@eth0: <BROADCAST,MULTICAST,ALLMULTI,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN link/ether 26:f8:84:02:f9:2a brd ff:ff:ff:ff:ff:ff [ROUTE]ff00::/8 dev macvlan0 table local metric 256 mtu 1500 advmss 1440 hoplimit 0 [ROUTE]fe80::/64 dev macvlan0 proto kernel metric 256 mtu 1500 advmss 1440 hoplimit 0 [LINK]11: macvlan0@eth0: <BROADCAST,MULTICAST,ALLMULTI,UP,LOWER_UP> mtu 1500 link/ether 26:f8:84:02:f9:2a [ADDR]11: macvlan0 inet6 fe80::24f8:84ff:fe02:f92a/64 scope link valid_lft forever preferred_lft forever [ROUTE]local fe80::24f8:84ff:fe02:f92a via :: dev lo table local proto none metric 0 mtu 16436 advmss 16376 hoplimit 0 [ROUTE]default via fe80::215:e9ff:fef0:10f8 dev macvlan0 proto kernel metric 1024 mtu 1500 advmss 1440 hoplimit 0 [NEIGH]fe80::215:e9ff:fef0:10f8 dev macvlan0 lladdr 00:15:e9:f0:10:f8 router STALE [ROUTE]2001:6f8:974::/64 dev macvlan0 proto kernel metric 256 expires 0sec mtu 1500 advmss 1440 hoplimit 0 [PREFIX]prefix 2001:6f8:974::/64 dev macvlan0 onlink autoconf valid 14400 preferred 131084 [ADDR]11: macvlan0 inet6 2001:6f8:974:0:24f8:84ff:fe02:f92a/64 scope global dynamic valid_lft 86399sec preferred_lft 14399sec # add VLAN to eth1, eth1 is down # $ ip link add link eth1 up type vlan id 1000 RTNETLINK answers: Network is down <no events> Signed-off-by: Patrick McHardy <kaber@trash.net> Signed-off-by: David S. Miller <davem@davemloft.net>
2010-02-26 06:34:54 +00:00
err_configure_peer:
unregister_netdevice(peer);
return err;
err_register_peer:
free_netdev(peer);
return err;
}
static void veth_dellink(struct net_device *dev, struct list_head *head)
{
struct veth_priv *priv;
struct net_device *peer;
priv = netdev_priv(dev);
veth: avoid a NULL deref in veth_stats_one commit 2681128f0ced8a (veth: extend device features) added a NULL deref in veth_stats_one(), as veth_get_stats64() was not testing if the peer device was setup or not. At init time, we call dev_get_stats() before veth pair is fully setup. [ 178.854758] [<ffffffffa00f5677>] veth_get_stats64+0x47/0x70 [veth] [ 178.861013] [<ffffffff814f0a2d>] dev_get_stats+0x6d/0x130 [ 178.866486] [<ffffffff81504efc>] rtnl_fill_ifinfo+0x47c/0x930 [ 178.872299] [<ffffffff81505b93>] rtmsg_ifinfo+0x83/0x100 [ 178.877678] [<ffffffff81505cc6>] rtnl_configure_link+0x76/0xa0 [ 178.883580] [<ffffffffa00f52fa>] veth_newlink+0x16a/0x350 [veth] [ 178.889654] [<ffffffff815061cc>] rtnl_newlink+0x4dc/0x5e0 [ 178.895128] [<ffffffff81505e1e>] ? rtnl_newlink+0x12e/0x5e0 [ 178.900769] [<ffffffff8150587d>] rtnetlink_rcv_msg+0x11d/0x310 [ 178.906669] [<ffffffff81505760>] ? __rtnl_unlock+0x20/0x20 [ 178.912225] [<ffffffff81521f89>] netlink_rcv_skb+0xa9/0xd0 [ 178.917779] [<ffffffff81502d55>] rtnetlink_rcv+0x25/0x40 [ 178.923159] [<ffffffff815218d1>] netlink_unicast+0x1b1/0x230 [ 178.928887] [<ffffffff81521c4e>] netlink_sendmsg+0x2fe/0x3b0 [ 178.934615] [<ffffffff814dbe22>] sock_sendmsg+0xd2/0xf0 So we must check if peer was setup in veth_get_stats64() As pointed out by Ben Hutchings, priv->peer is missing proper synchronization. Adding RCU protection is a safe and well documented way to make sure we don't access about to be freed or already freed data. Reported-by: Tom Parkin <tparkin@katalix.com> Signed-off-by: Eric Dumazet <edumazet@google.com> CC: Ben Hutchings <bhutchings@solarflare.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-01-04 15:42:40 +00:00
peer = rtnl_dereference(priv->peer);
/* Note : dellink() is called from default_device_exit_batch(),
* before a rcu_synchronize() point. The devices are guaranteed
* not being freed before one RCU grace period.
*/
RCU_INIT_POINTER(priv->peer, NULL);
unregister_netdevice_queue(dev, head);
if (peer) {
priv = netdev_priv(peer);
RCU_INIT_POINTER(priv->peer, NULL);
unregister_netdevice_queue(peer, head);
}
}
static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = {
[VETH_INFO_PEER] = { .len = sizeof(struct ifinfomsg) },
};
static struct net *veth_get_link_net(const struct net_device *dev)
{
struct veth_priv *priv = netdev_priv(dev);
struct net_device *peer = rtnl_dereference(priv->peer);
return peer ? dev_net(peer) : dev_net(dev);
}
static unsigned int veth_get_num_queues(void)
{
/* enforce the same queue limit as rtnl_create_link */
int queues = num_possible_cpus();
if (queues > 4096)
queues = 4096;
return queues;
}
static struct rtnl_link_ops veth_link_ops = {
.kind = DRV_NAME,
.priv_size = sizeof(struct veth_priv),
.setup = veth_setup,
.validate = veth_validate,
.newlink = veth_newlink,
.dellink = veth_dellink,
.policy = veth_policy,
.maxtype = VETH_INFO_MAX,
.get_link_net = veth_get_link_net,
.get_num_tx_queues = veth_get_num_queues,
.get_num_rx_queues = veth_get_num_queues,
};
/*
* init/fini
*/
static __init int veth_init(void)
{
return rtnl_link_register(&veth_link_ops);
}
static __exit void veth_exit(void)
{
rtnl_link_unregister(&veth_link_ops);
}
module_init(veth_init);
module_exit(veth_exit);
MODULE_DESCRIPTION("Virtual Ethernet Tunnel");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_RTNL_LINK(DRV_NAME);