From 76fe372ccb81b0c89b6cd2fec26e2f38c958be85 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 22 Jul 2024 12:28:42 -0700 Subject: [PATCH 01/61] can: bcm: Remove proc entry when dev is unregistered. syzkaller reported a warning in bcm_connect() below. [0] The repro calls connect() to vxcan1, removes vxcan1, and calls connect() with ifindex == 0. Calling connect() for a BCM socket allocates a proc entry. Then, bcm_sk(sk)->bound is set to 1 to prevent further connect(). However, removing the bound device resets bcm_sk(sk)->bound to 0 in bcm_notify(). The 2nd connect() tries to allocate a proc entry with the same name and sets NULL to bcm_sk(sk)->bcm_proc_read, leaking the original proc entry. Since the proc entry is available only for connect()ed sockets, let's clean up the entry when the bound netdev is unregistered. [0]: proc_dir_entry 'can-bcm/2456' already registered WARNING: CPU: 1 PID: 394 at fs/proc/generic.c:376 proc_register+0x645/0x8f0 fs/proc/generic.c:375 Modules linked in: CPU: 1 PID: 394 Comm: syz-executor403 Not tainted 6.10.0-rc7-g852e42cc2dd4 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 RIP: 0010:proc_register+0x645/0x8f0 fs/proc/generic.c:375 Code: 00 00 00 00 00 48 85 ed 0f 85 97 02 00 00 4d 85 f6 0f 85 9f 02 00 00 48 c7 c7 9b cb cf 87 48 89 de 4c 89 fa e8 1c 6f eb fe 90 <0f> 0b 90 90 48 c7 c7 98 37 99 89 e8 cb 7e 22 05 bb 00 00 00 10 48 RSP: 0018:ffa0000000cd7c30 EFLAGS: 00010246 RAX: 9e129be1950f0200 RBX: ff1100011b51582c RCX: ff1100011857cd80 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000002 RBP: 0000000000000000 R08: ffd400000000000f R09: ff1100013e78cac0 R10: ffac800000cd7980 R11: ff1100013e12b1f0 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: ff1100011a99a2ec FS: 00007fbd7086f740(0000) GS:ff1100013fd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000200071c0 CR3: 0000000118556004 CR4: 0000000000771ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: proc_create_net_single+0x144/0x210 fs/proc/proc_net.c:220 bcm_connect+0x472/0x840 net/can/bcm.c:1673 __sys_connect_file net/socket.c:2049 [inline] __sys_connect+0x5d2/0x690 net/socket.c:2066 __do_sys_connect net/socket.c:2076 [inline] __se_sys_connect net/socket.c:2073 [inline] __x64_sys_connect+0x8f/0x100 net/socket.c:2073 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xd9/0x1c0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x4b/0x53 RIP: 0033:0x7fbd708b0e5d Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 73 9f 1b 00 f7 d8 64 89 01 48 RSP: 002b:00007fff8cd33f08 EFLAGS: 00000246 ORIG_RAX: 000000000000002a RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007fbd708b0e5d RDX: 0000000000000010 RSI: 0000000020000040 RDI: 0000000000000003 RBP: 0000000000000000 R08: 0000000000000040 R09: 0000000000000040 R10: 0000000000000040 R11: 0000000000000246 R12: 00007fff8cd34098 R13: 0000000000401280 R14: 0000000000406de8 R15: 00007fbd70ab9000 remove_proc_entry: removing non-empty directory 'net/can-bcm', leaking at least '2456' Fixes: ffd980f976e7 ("[CAN]: Add broadcast manager (bcm) protocol") Reported-by: syzkaller Signed-off-by: Kuniyuki Iwashima Reviewed-by: Simon Horman Link: https://lore.kernel.org/all/20240722192842.37421-1-kuniyu@amazon.com Signed-off-by: Marc Kleine-Budde --- net/can/bcm.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/can/bcm.c b/net/can/bcm.c index 27d5fcf0eac9..46d3ec3aa44b 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1470,6 +1470,10 @@ static void bcm_notify(struct bcm_sock *bo, unsigned long msg, /* remove device reference, if this is our bound device */ if (bo->bound && bo->ifindex == dev->ifindex) { +#if IS_ENABLED(CONFIG_PROC_FS) + if (sock_net(sk)->can.bcmproc_dir && bo->bcm_proc_read) + remove_proc_entry(bo->procname, sock_net(sk)->can.bcmproc_dir); +#endif bo->bound = 0; bo->ifindex = 0; notify_enodev = 1; From 06d4ef3056a7ac31be331281bb7a6302ef5a7f8a Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 5 Aug 2024 15:01:58 +0100 Subject: [PATCH 02/61] can: m_can: Release irq on error in m_can_open It appears that the irq requested in m_can_open() may be leaked if an error subsequently occurs: if m_can_start() fails. Address this by calling free_irq in the unwind path for such cases. Flagged by Smatch. Compile tested only. Fixes: eaacfeaca7ad ("can: m_can: Call the RAM init directly from m_can_chip_config") Acked-by: Marc Kleine-Budde Signed-off-by: Simon Horman Link: https://lore.kernel.org/all/20240805-mcan-irq-v2-1-7154c0484819@kernel.org Signed-off-by: Marc Kleine-Budde --- drivers/net/can/m_can/m_can.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c index 7f63f866083e..cd83c8b5d4b1 100644 --- a/drivers/net/can/m_can/m_can.c +++ b/drivers/net/can/m_can/m_can.c @@ -2052,7 +2052,7 @@ static int m_can_open(struct net_device *dev) /* start the m_can controller */ err = m_can_start(dev); if (err) - goto exit_irq_fail; + goto exit_start_fail; if (!cdev->is_peripheral) napi_enable(&cdev->napi); @@ -2061,6 +2061,9 @@ static int m_can_open(struct net_device *dev) return 0; +exit_start_fail: + if (cdev->is_peripheral || dev->irq) + free_irq(dev->irq, dev); exit_irq_fail: if (cdev->is_peripheral) destroy_workqueue(cdev->tx_wq); From a651261ac74298535f6d6316ebe27beceb6b17b1 Mon Sep 17 00:00:00 2001 From: Markus Schneider-Pargmann Date: Mon, 5 Aug 2024 20:30:41 +0200 Subject: [PATCH 03/61] can: m_can: Reset coalescing during suspend/resume MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During resume the interrupts are limited to IR_RF0N and the chip keeps running. In this case if coalescing is enabled and active we may miss waterlevel interrupts during suspend. It is safer to reset the coalescing by stopping the timer and adding IR_RF0N | IR_TEFN to the interrupts. This is a theoratical issue and probably extremely rare. Cc: Martin Hundebøll Fixes: 4a94d7e31cf5 ("can: m_can: allow keeping the transceiver running in suspend") Signed-off-by: Markus Schneider-Pargmann Link: https://lore.kernel.org/all/20240805183047.305630-2-msp@baylibre.com Signed-off-by: Marc Kleine-Budde --- drivers/net/can/m_can/m_can.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c index cd83c8b5d4b1..31991e2f343e 100644 --- a/drivers/net/can/m_can/m_can.c +++ b/drivers/net/can/m_can/m_can.c @@ -2430,12 +2430,15 @@ int m_can_class_suspend(struct device *dev) netif_device_detach(ndev); /* leave the chip running with rx interrupt enabled if it is - * used as a wake-up source. + * used as a wake-up source. Coalescing needs to be reset then, + * the timer is cancelled here, interrupts are done in resume. */ - if (cdev->pm_wake_source) + if (cdev->pm_wake_source) { + hrtimer_cancel(&cdev->hrtimer); m_can_write(cdev, M_CAN_IE, IR_RF0N); - else + } else { m_can_stop(ndev); + } m_can_clk_stop(cdev); } @@ -2465,6 +2468,13 @@ int m_can_class_resume(struct device *dev) return ret; if (cdev->pm_wake_source) { + /* Restore active interrupts but disable coalescing as + * we may have missed important waterlevel interrupts + * between suspend and resume. Timers are already + * stopped in suspend. Here we enable all interrupts + * again. + */ + cdev->active_interrupts |= IR_RF0N | IR_TEFN; m_can_write(cdev, M_CAN_IE, cdev->active_interrupts); } else { ret = m_can_start(ndev); From 6eff1cead75ff330bb33264424c1da6cc7179ab8 Mon Sep 17 00:00:00 2001 From: Markus Schneider-Pargmann Date: Mon, 5 Aug 2024 20:30:42 +0200 Subject: [PATCH 04/61] can: m_can: Remove coalesing disable in isr during suspend We don't need to disable coalescing when the interrupt handler executes while the chip is suspended. The coalescing is already reset during suspend. Fixes: 07f25091ca02 ("can: m_can: Implement receive coalescing") Signed-off-by: Markus Schneider-Pargmann Link: https://lore.kernel.org/all/20240805183047.305630-3-msp@baylibre.com Signed-off-by: Marc Kleine-Budde --- drivers/net/can/m_can/m_can.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c index 31991e2f343e..ba416c973e8d 100644 --- a/drivers/net/can/m_can/m_can.c +++ b/drivers/net/can/m_can/m_can.c @@ -1223,10 +1223,8 @@ static irqreturn_t m_can_isr(int irq, void *dev_id) struct m_can_classdev *cdev = netdev_priv(dev); u32 ir; - if (pm_runtime_suspended(cdev->dev)) { - m_can_coalescing_disable(cdev); + if (pm_runtime_suspended(cdev->dev)) return IRQ_NONE; - } ir = m_can_read(cdev, M_CAN_IR); m_can_coalescing_update(cdev, ir); From 40e4552eeef0e3090a5988de15889795936fd38f Mon Sep 17 00:00:00 2001 From: Markus Schneider-Pargmann Date: Mon, 5 Aug 2024 20:30:43 +0200 Subject: [PATCH 05/61] can: m_can: Remove m_can_rx_peripheral indirection m_can_rx_peripheral() is a wrapper around m_can_rx_handler() that calls m_can_disable_all_interrupts() on error. The same handling for the same error path is done in m_can_isr() as well. So remove m_can_rx_peripheral() and do the call from m_can_isr() directly. Signed-off-by: Markus Schneider-Pargmann Link: https://lore.kernel.org/all/20240805183047.305630-4-msp@baylibre.com Signed-off-by: Marc Kleine-Budde --- drivers/net/can/m_can/m_can.c | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c index ba416c973e8d..a37ed376de9b 100644 --- a/drivers/net/can/m_can/m_can.c +++ b/drivers/net/can/m_can/m_can.c @@ -1037,22 +1037,6 @@ end: return work_done; } -static int m_can_rx_peripheral(struct net_device *dev, u32 irqstatus) -{ - struct m_can_classdev *cdev = netdev_priv(dev); - int work_done; - - work_done = m_can_rx_handler(dev, NAPI_POLL_WEIGHT, irqstatus); - - /* Don't re-enable interrupts if the driver had a fatal error - * (e.g., FIFO read failure). - */ - if (work_done < 0) - m_can_disable_all_interrupts(cdev); - - return work_done; -} - static int m_can_poll(struct napi_struct *napi, int quota) { struct net_device *dev = napi->dev; @@ -1250,7 +1234,7 @@ static irqreturn_t m_can_isr(int irq, void *dev_id) } else { int pkts; - pkts = m_can_rx_peripheral(dev, ir); + pkts = m_can_rx_handler(dev, NAPI_POLL_WEIGHT, ir); if (pkts < 0) goto out_fail; } From 4d5159bfafa8d1a205d8213b7434e0402588b9ed Mon Sep 17 00:00:00 2001 From: Markus Schneider-Pargmann Date: Mon, 5 Aug 2024 20:30:44 +0200 Subject: [PATCH 06/61] can: m_can: Do not cancel timer from within timer On setups without interrupts, the interrupt handler is called from a timer callback. For non-peripheral receives napi is scheduled, interrupts are disabled and the timer is canceled with a blocking call. In case of an error this can happen as well. Check if napi is scheduled in the timer callback after the interrupt handler executed. If napi is scheduled, the timer is disabled. It will be reenabled by m_can_poll(). Return error values from the interrupt handler so that interrupt threads and timer callback can deal differently with it. In case of the timer we only disable the timer. The rest will be done when stopping the interface. Fixes: b382380c0d2d ("can: m_can: Add hrtimer to generate software interrupt") Fixes: a163c5761019 ("can: m_can: Start/Cancel polling timer together with interrupts") Signed-off-by: Markus Schneider-Pargmann Link: https://lore.kernel.org/all/20240805183047.305630-5-msp@baylibre.com Signed-off-by: Marc Kleine-Budde --- drivers/net/can/m_can/m_can.c | 57 ++++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 15 deletions(-) diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c index a37ed376de9b..5228304779f1 100644 --- a/drivers/net/can/m_can/m_can.c +++ b/drivers/net/can/m_can/m_can.c @@ -487,7 +487,7 @@ static inline void m_can_disable_all_interrupts(struct m_can_classdev *cdev) if (!cdev->net->irq) { dev_dbg(cdev->dev, "Stop hrtimer\n"); - hrtimer_cancel(&cdev->hrtimer); + hrtimer_try_to_cancel(&cdev->hrtimer); } } @@ -1201,11 +1201,15 @@ static void m_can_coalescing_update(struct m_can_classdev *cdev, u32 ir) HRTIMER_MODE_REL); } -static irqreturn_t m_can_isr(int irq, void *dev_id) +/* This interrupt handler is called either from the interrupt thread or a + * hrtimer. This has implications like cancelling a timer won't be possible + * blocking. + */ +static int m_can_interrupt_handler(struct m_can_classdev *cdev) { - struct net_device *dev = (struct net_device *)dev_id; - struct m_can_classdev *cdev = netdev_priv(dev); + struct net_device *dev = cdev->net; u32 ir; + int ret; if (pm_runtime_suspended(cdev->dev)) return IRQ_NONE; @@ -1232,11 +1236,9 @@ static irqreturn_t m_can_isr(int irq, void *dev_id) m_can_disable_all_interrupts(cdev); napi_schedule(&cdev->napi); } else { - int pkts; - - pkts = m_can_rx_handler(dev, NAPI_POLL_WEIGHT, ir); - if (pkts < 0) - goto out_fail; + ret = m_can_rx_handler(dev, NAPI_POLL_WEIGHT, ir); + if (ret < 0) + return ret; } } @@ -1254,8 +1256,9 @@ static irqreturn_t m_can_isr(int irq, void *dev_id) } else { if (ir & (IR_TEFN | IR_TEFW)) { /* New TX FIFO Element arrived */ - if (m_can_echo_tx_event(dev) != 0) - goto out_fail; + ret = m_can_echo_tx_event(dev); + if (ret != 0) + return ret; } } @@ -1263,16 +1266,31 @@ static irqreturn_t m_can_isr(int irq, void *dev_id) can_rx_offload_threaded_irq_finish(&cdev->offload); return IRQ_HANDLED; +} -out_fail: - m_can_disable_all_interrupts(cdev); - return IRQ_HANDLED; +static irqreturn_t m_can_isr(int irq, void *dev_id) +{ + struct net_device *dev = (struct net_device *)dev_id; + struct m_can_classdev *cdev = netdev_priv(dev); + int ret; + + ret = m_can_interrupt_handler(cdev); + if (ret < 0) { + m_can_disable_all_interrupts(cdev); + return IRQ_HANDLED; + } + + return ret; } static enum hrtimer_restart m_can_coalescing_timer(struct hrtimer *timer) { struct m_can_classdev *cdev = container_of(timer, struct m_can_classdev, hrtimer); + if (cdev->can.state == CAN_STATE_BUS_OFF || + cdev->can.state == CAN_STATE_STOPPED) + return HRTIMER_NORESTART; + irq_wake_thread(cdev->net->irq, cdev->net); return HRTIMER_NORESTART; @@ -1973,8 +1991,17 @@ static enum hrtimer_restart hrtimer_callback(struct hrtimer *timer) { struct m_can_classdev *cdev = container_of(timer, struct m_can_classdev, hrtimer); + int ret; - m_can_isr(0, cdev->net); + if (cdev->can.state == CAN_STATE_BUS_OFF || + cdev->can.state == CAN_STATE_STOPPED) + return HRTIMER_NORESTART; + + ret = m_can_interrupt_handler(cdev); + + /* On error or if napi is scheduled to read, stop the timer */ + if (ret < 0 || napi_is_scheduled(&cdev->napi)) + return HRTIMER_NORESTART; hrtimer_forward_now(timer, ms_to_ktime(HRTIMER_POLL_INTERVAL_MS)); From a572fea86c9b06cd3e6e89d79d565b52cb7e7cff Mon Sep 17 00:00:00 2001 From: Markus Schneider-Pargmann Date: Mon, 5 Aug 2024 20:30:45 +0200 Subject: [PATCH 07/61] can: m_can: disable_all_interrupts, not clear active_interrupts active_interrupts is a cache for the enabled interrupts and not the global masking of interrupts. Do not clear this variable otherwise we may loose the state of the interrupts. Fixes: 07f25091ca02 ("can: m_can: Implement receive coalescing") Signed-off-by: Markus Schneider-Pargmann Link: https://lore.kernel.org/all/20240805183047.305630-6-msp@baylibre.com Signed-off-by: Marc Kleine-Budde --- drivers/net/can/m_can/m_can.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c index 5228304779f1..68bd4a00ecca 100644 --- a/drivers/net/can/m_can/m_can.c +++ b/drivers/net/can/m_can/m_can.c @@ -483,7 +483,6 @@ static inline void m_can_disable_all_interrupts(struct m_can_classdev *cdev) { m_can_coalescing_disable(cdev); m_can_write(cdev, M_CAN_ILE, 0x0); - cdev->active_interrupts = 0x0; if (!cdev->net->irq) { dev_dbg(cdev->dev, "Stop hrtimer\n"); From 733dbf556cd5b71d5e6f6aa7a93f117b438ab785 Mon Sep 17 00:00:00 2001 From: Markus Schneider-Pargmann Date: Mon, 5 Aug 2024 20:30:46 +0200 Subject: [PATCH 08/61] can: m_can: Reset cached active_interrupts on start To force writing the enabled interrupts, reset the active_interrupts cache. Fixes: 07f25091ca02 ("can: m_can: Implement receive coalescing") Signed-off-by: Markus Schneider-Pargmann Link: https://lore.kernel.org/all/20240805183047.305630-7-msp@baylibre.com Signed-off-by: Marc Kleine-Budde --- drivers/net/can/m_can/m_can.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c index 68bd4a00ecca..67c4c740c416 100644 --- a/drivers/net/can/m_can/m_can.c +++ b/drivers/net/can/m_can/m_can.c @@ -1541,6 +1541,7 @@ static int m_can_chip_config(struct net_device *dev) else interrupts &= ~(IR_ERR_LEC_31X); } + cdev->active_interrupts = 0; m_can_interrupt_enable(cdev, interrupts); /* route all interrupts to INT0 */ From e443d15b949952ee039b731d5c35bcbafa300024 Mon Sep 17 00:00:00 2001 From: Markus Schneider-Pargmann Date: Mon, 5 Aug 2024 20:30:47 +0200 Subject: [PATCH 09/61] can: m_can: Limit coalescing to peripheral instances The use of coalescing for non-peripheral chips in the current implementation is limited to non-existing. Disable the possibility to set coalescing through ethtool. Signed-off-by: Markus Schneider-Pargmann Link: https://lore.kernel.org/all/20240805183047.305630-8-msp@baylibre.com Signed-off-by: Marc Kleine-Budde --- drivers/net/can/m_can/m_can.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c index 67c4c740c416..012c3d22b01d 100644 --- a/drivers/net/can/m_can/m_can.c +++ b/drivers/net/can/m_can/m_can.c @@ -2184,7 +2184,7 @@ static int m_can_set_coalesce(struct net_device *dev, return 0; } -static const struct ethtool_ops m_can_ethtool_ops = { +static const struct ethtool_ops m_can_ethtool_ops_coalescing = { .supported_coalesce_params = ETHTOOL_COALESCE_RX_USECS_IRQ | ETHTOOL_COALESCE_RX_MAX_FRAMES_IRQ | ETHTOOL_COALESCE_TX_USECS_IRQ | @@ -2195,18 +2195,20 @@ static const struct ethtool_ops m_can_ethtool_ops = { .set_coalesce = m_can_set_coalesce, }; -static const struct ethtool_ops m_can_ethtool_ops_polling = { +static const struct ethtool_ops m_can_ethtool_ops = { .get_ts_info = ethtool_op_get_ts_info, }; -static int register_m_can_dev(struct net_device *dev) +static int register_m_can_dev(struct m_can_classdev *cdev) { + struct net_device *dev = cdev->net; + dev->flags |= IFF_ECHO; /* we support local echo */ dev->netdev_ops = &m_can_netdev_ops; - if (dev->irq) - dev->ethtool_ops = &m_can_ethtool_ops; + if (dev->irq && cdev->is_peripheral) + dev->ethtool_ops = &m_can_ethtool_ops_coalescing; else - dev->ethtool_ops = &m_can_ethtool_ops_polling; + dev->ethtool_ops = &m_can_ethtool_ops; return register_candev(dev); } @@ -2392,7 +2394,7 @@ int m_can_class_register(struct m_can_classdev *cdev) if (ret) goto rx_offload_del; - ret = register_m_can_dev(cdev->net); + ret = register_m_can_dev(cdev); if (ret) { dev_err(cdev->dev, "registering %s failed (err=%d)\n", cdev->net->name, ret); From 50ea5449c56310d2d31c28ba91a59232116d3c1e Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Fri, 5 Jul 2024 17:28:27 +0200 Subject: [PATCH 10/61] can: mcp251xfd: fix ring configuration when switching from CAN-CC to CAN-FD mode If the ring (rx, tx) and/or coalescing parameters (rx-frames-irq, tx-frames-irq) have been configured while the interface was in CAN-CC mode, but the interface is brought up in CAN-FD mode, the ring parameters might be too big. Use the default CAN-FD values in this case. Fixes: 9263c2e92be9 ("can: mcp251xfd: ring: add support for runtime configurable RX/TX ring parameters") Link: https://lore.kernel.org/all/20240805-mcp251xfd-fix-ringconfig-v1-1-72086f0ca5ee@pengutronix.de Signed-off-by: Marc Kleine-Budde --- drivers/net/can/spi/mcp251xfd/mcp251xfd-ram.c | 11 +++++++++- .../net/can/spi/mcp251xfd/mcp251xfd-ring.c | 20 ++++++++++++++++--- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-ram.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-ram.c index 9e8e82cdba46..61b0d6fa52dd 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-ram.c +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-ram.c @@ -97,7 +97,16 @@ void can_ram_get_layout(struct can_ram_layout *layout, if (ring) { u8 num_rx_coalesce = 0, num_tx_coalesce = 0; - num_rx = can_ram_rounddown_pow_of_two(config, &config->rx, 0, ring->rx_pending); + /* If the ring parameters have been configured in + * CAN-CC mode, but and we are in CAN-FD mode now, + * they might be to big. Use the default CAN-FD values + * in this case. + */ + num_rx = ring->rx_pending; + if (num_rx > layout->max_rx) + num_rx = layout->default_rx; + + num_rx = can_ram_rounddown_pow_of_two(config, &config->rx, 0, num_rx); /* The ethtool doc says: * To disable coalescing, set usecs = 0 and max_frames = 1. diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-ring.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-ring.c index 7bd2bcb5cf87..f72582d4d3e8 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-ring.c +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-ring.c @@ -469,11 +469,25 @@ int mcp251xfd_ring_alloc(struct mcp251xfd_priv *priv) /* switching from CAN-2.0 to CAN-FD mode or vice versa */ if (fd_mode != test_bit(MCP251XFD_FLAGS_FD_MODE, priv->flags)) { + const struct ethtool_ringparam ring = { + .rx_pending = priv->rx_obj_num, + .tx_pending = priv->tx->obj_num, + }; + const struct ethtool_coalesce ec = { + .rx_coalesce_usecs_irq = priv->rx_coalesce_usecs_irq, + .rx_max_coalesced_frames_irq = priv->rx_obj_num_coalesce_irq, + .tx_coalesce_usecs_irq = priv->tx_coalesce_usecs_irq, + .tx_max_coalesced_frames_irq = priv->tx_obj_num_coalesce_irq, + }; struct can_ram_layout layout; - can_ram_get_layout(&layout, &mcp251xfd_ram_config, NULL, NULL, fd_mode); - priv->rx_obj_num = layout.default_rx; - tx_ring->obj_num = layout.default_tx; + can_ram_get_layout(&layout, &mcp251xfd_ram_config, &ring, &ec, fd_mode); + + priv->rx_obj_num = layout.cur_rx; + priv->rx_obj_num_coalesce_irq = layout.rx_coalesce; + + tx_ring->obj_num = layout.cur_tx; + priv->tx_obj_num_coalesce_irq = layout.tx_coalesce; } if (fd_mode) { From ac2b81eb8b2d104033560daea886ee84531e3d0a Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Fri, 5 Jul 2024 17:24:42 +0200 Subject: [PATCH 11/61] can: mcp251xfd: mcp251xfd_ring_init(): check TX-coalescing configuration When changing the interface from CAN-CC to CAN-FD mode the old coalescing parameters are re-used. This might cause problem, as the configured parameters are too big for CAN-FD mode. During testing an invalid TX coalescing configuration has been seen. The problem should be been fixed in the previous patch, but add a safeguard here to ensure that the number of TEF coalescing buffers (if configured) is exactly the half of all TEF buffers. Link: https://lore.kernel.org/all/20240805-mcp251xfd-fix-ringconfig-v1-2-72086f0ca5ee@pengutronix.de Signed-off-by: Marc Kleine-Budde --- drivers/net/can/spi/mcp251xfd/mcp251xfd-ring.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-ring.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-ring.c index f72582d4d3e8..83c18035b2a2 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-ring.c +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-ring.c @@ -290,7 +290,7 @@ int mcp251xfd_ring_init(struct mcp251xfd_priv *priv) const struct mcp251xfd_rx_ring *rx_ring; u16 base = 0, ram_used; u8 fifo_nr = 1; - int i; + int err = 0, i; netdev_reset_queue(priv->ndev); @@ -386,10 +386,18 @@ int mcp251xfd_ring_init(struct mcp251xfd_priv *priv) netdev_err(priv->ndev, "Error during ring configuration, using more RAM (%u bytes) than available (%u bytes).\n", ram_used, MCP251XFD_RAM_SIZE); - return -ENOMEM; + err = -ENOMEM; } - return 0; + if (priv->tx_obj_num_coalesce_irq && + priv->tx_obj_num_coalesce_irq * 2 != priv->tx->obj_num) { + netdev_err(priv->ndev, + "Error during ring configuration, number of TEF coalescing buffers (%u) must be half of TEF buffers (%u).\n", + priv->tx_obj_num_coalesce_irq, priv->tx->obj_num); + err = -EINVAL; + } + + return err; } void mcp251xfd_ring_free(struct mcp251xfd_priv *priv) From 7dd9c26bd6cf679bcfdef01a8659791aa6487a29 Mon Sep 17 00:00:00 2001 From: Simon Arlott Date: Thu, 22 Aug 2024 08:25:07 +0100 Subject: [PATCH 12/61] can: mcp251x: fix deadlock if an interrupt occurs during mcp251x_open The mcp251x_hw_wake() function is called with the mpc_lock mutex held and disables the interrupt handler so that no interrupts can be processed while waking the device. If an interrupt has already occurred then waiting for the interrupt handler to complete will deadlock because it will be trying to acquire the same mutex. CPU0 CPU1 ---- ---- mcp251x_open() mutex_lock(&priv->mcp_lock) request_threaded_irq() mcp251x_can_ist() mutex_lock(&priv->mcp_lock) mcp251x_hw_wake() disable_irq() <-- deadlock Use disable_irq_nosync() instead because the interrupt handler does everything while holding the mutex so it doesn't matter if it's still running. Fixes: 8ce8c0abcba3 ("can: mcp251x: only reset hardware as required") Signed-off-by: Simon Arlott Reviewed-by: Przemek Kitszel Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/4fc08687-1d80-43fe-9f0d-8ef8475e75f6@0882a8b5-c6c3-11e9-b005-00805fc181fe.uuid.home.arpa Signed-off-by: Marc Kleine-Budde --- drivers/net/can/spi/mcp251x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/can/spi/mcp251x.c b/drivers/net/can/spi/mcp251x.c index 3b8736ff0345..ec5c64006a16 100644 --- a/drivers/net/can/spi/mcp251x.c +++ b/drivers/net/can/spi/mcp251x.c @@ -752,7 +752,7 @@ static int mcp251x_hw_wake(struct spi_device *spi) int ret; /* Force wakeup interrupt to wake device, but don't execute IST */ - disable_irq(spi->irq); + disable_irq_nosync(spi->irq); mcp251x_write_2regs(spi, CANINTE, CANINTE_WAKIE, CANINTF_WAKIF); /* Wait for oscillator startup timer after wake up */ From 9abf199943a6469a71f6ce5c2266e9364d310f8b Mon Sep 17 00:00:00 2001 From: Baochen Qiang Date: Tue, 13 Aug 2024 16:38:08 +0800 Subject: [PATCH 13/61] wifi: ath11k: fix NULL pointer dereference in ath11k_mac_get_eirp_power() Commit 39dc8b8ea387 ("wifi: mac80211: pass parsed TPE data to drivers") breaks ath11k, leading to kernel crash: BUG: kernel NULL pointer dereference, address: 0000000000000018 RIP: 0010:ath11k_mac_get_eirp_power.isra.0+0x5b/0x80 [ath11k] Call Trace: ath11k_mac_fill_reg_tpc_info+0x3d6/0x800 [ath11k] ath11k_mac_vdev_start_restart+0x412/0x4d0 [ath11k] ath11k_mac_op_sta_state+0x7bc/0xbb0 [ath11k] drv_sta_state+0xf1/0x5f0 [mac80211] sta_info_insert_rcu+0x28d/0x530 [mac80211] sta_info_insert+0xf/0x20 [mac80211] ieee80211_prep_connection+0x3b4/0x4c0 [mac80211] ieee80211_mgd_auth+0x363/0x600 [mac80211] The issue scenario is, AP advertises power spectral density (PSD) values in its transmit power envelope (TPE) IE and supports 160 MHz bandwidth in 6 GHz. When connecting to this AP, in ath11k_mac_parse_tx_pwr_env(), the local variable psd is true and then reg_tpc_info.num_pwr_levels is set to 8 due to 160 MHz bandwidth. Note here ath11k fails to set reg_tpc_info.is_psd_power as TRUE due to above commit. Then in ath11k_mac_fill_reg_tpc_info(), for each of the 8 power levels, for a PSD channel, ath11k_mac_get_psd_channel() is expected to be called to get required information. However due to invalid reg_tpc_info.is_psd_power, it is ath11k_mac_get_eirp_power() that gets called and passed with pwr_lvl_idx as one of the arguments. Note this function implicitly requires pwr_lvl_idx to be no more than 3. So when pwr_lvl_idx is larger than that ath11k_mac_get_seg_freq() returns invalid center frequency, with which as the input ieee80211_get_channel() returns NULL, then kernel crashes due to NULL pointer dereference. Fix it by setting reg_tpc_info.is_psd_power properly. Tested-on: WCN6855 hw2.0 PCI WLAN.HSP.1.1-03125-QCAHSPSWPL_V1_V2_SILICONZ_LITE-3.6510.30 Fixes: 39dc8b8ea387 ("wifi: mac80211: pass parsed TPE data to drivers") Reported-by: Mikko Tiihonen Tested-by: Mikko Tiihonen Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219131 Signed-off-by: Baochen Qiang Acked-by: Jeff Johnson Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20240813083808.9224-1-quic_bqiang@quicinc.com --- drivers/net/wireless/ath/ath11k/mac.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c index ba910ae2c676..7c0ef6916dd2 100644 --- a/drivers/net/wireless/ath/ath11k/mac.c +++ b/drivers/net/wireless/ath/ath11k/mac.c @@ -7900,6 +7900,7 @@ static void ath11k_mac_parse_tx_pwr_env(struct ath11k *ar, } if (psd) { + arvif->reg_tpc_info.is_psd_power = true; arvif->reg_tpc_info.num_pwr_levels = psd->count; for (i = 0; i < arvif->reg_tpc_info.num_pwr_levels; i++) { From ba8cf80724dbc09825b52498e4efacb563935408 Mon Sep 17 00:00:00 2001 From: Daiwei Li Date: Tue, 13 Aug 2024 21:55:53 -0700 Subject: [PATCH 14/61] igb: Fix not clearing TimeSync interrupts for 82580 82580 NICs have a hardware bug that makes it necessary to write into the TSICR (TimeSync Interrupt Cause) register to clear it: https://lore.kernel.org/all/CDCB8BE0.1EC2C%25matthew.vick@intel.com/ Add a conditional so only for 82580 we write into the TSICR register, so we don't risk losing events for other models. Without this change, when running ptp4l with an Intel 82580 card, I get the following output: > timed out while polling for tx timestamp increasing tx_timestamp_timeout or > increasing kworker priority may correct this issue, but a driver bug likely > causes it This goes away with this change. This (partially) reverts commit ee14cc9ea19b ("igb: Fix missing time sync events"). Fixes: ee14cc9ea19b ("igb: Fix missing time sync events") Closes: https://lore.kernel.org/intel-wired-lan/CAN0jFd1kO0MMtOh8N2Ztxn6f7vvDKp2h507sMryobkBKe=xk=w@mail.gmail.com/ Tested-by: Daiwei Li Suggested-by: Vinicius Costa Gomes Signed-off-by: Daiwei Li Acked-by: Vinicius Costa Gomes Reviewed-by: Kurt Kanzenbach Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igb/igb_main.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 33a42b4c21e0..9dc7c60838ed 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -6960,10 +6960,20 @@ static void igb_extts(struct igb_adapter *adapter, int tsintr_tt) static void igb_tsync_interrupt(struct igb_adapter *adapter) { + const u32 mask = (TSINTR_SYS_WRAP | E1000_TSICR_TXTS | + TSINTR_TT0 | TSINTR_TT1 | + TSINTR_AUTT0 | TSINTR_AUTT1); struct e1000_hw *hw = &adapter->hw; u32 tsicr = rd32(E1000_TSICR); struct ptp_clock_event event; + if (hw->mac.type == e1000_82580) { + /* 82580 has a hardware bug that requires an explicit + * write to clear the TimeSync interrupt cause. + */ + wr32(E1000_TSICR, tsicr & mask); + } + if (tsicr & TSINTR_SYS_WRAP) { event.type = PTP_CLOCK_PPS; if (adapter->ptp_caps.pps) From d11a67634227f9f9da51938af085fb41a733848f Mon Sep 17 00:00:00 2001 From: Dawid Osuchowski Date: Wed, 21 Aug 2024 18:06:40 +0200 Subject: [PATCH 15/61] ice: Add netif_device_attach/detach into PF reset flow Ethtool callbacks can be executed while reset is in progress and try to access deleted resources, e.g. getting coalesce settings can result in a NULL pointer dereference seen below. Reproduction steps: Once the driver is fully initialized, trigger reset: # echo 1 > /sys/class/net//device/reset when reset is in progress try to get coalesce settings using ethtool: # ethtool -c BUG: kernel NULL pointer dereference, address: 0000000000000020 PGD 0 P4D 0 Oops: Oops: 0000 [#1] PREEMPT SMP PTI CPU: 11 PID: 19713 Comm: ethtool Tainted: G S 6.10.0-rc7+ #7 RIP: 0010:ice_get_q_coalesce+0x2e/0xa0 [ice] RSP: 0018:ffffbab1e9bcf6a8 EFLAGS: 00010206 RAX: 000000000000000c RBX: ffff94512305b028 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffff9451c3f2e588 RDI: ffff9451c3f2e588 RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 R10: ffff9451c3f2e580 R11: 000000000000001f R12: ffff945121fa9000 R13: ffffbab1e9bcf760 R14: 0000000000000013 R15: ffffffff9e65dd40 FS: 00007faee5fbe740(0000) GS:ffff94546fd80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000020 CR3: 0000000106c2e005 CR4: 00000000001706f0 Call Trace: ice_get_coalesce+0x17/0x30 [ice] coalesce_prepare_data+0x61/0x80 ethnl_default_doit+0xde/0x340 genl_family_rcv_msg_doit+0xf2/0x150 genl_rcv_msg+0x1b3/0x2c0 netlink_rcv_skb+0x5b/0x110 genl_rcv+0x28/0x40 netlink_unicast+0x19c/0x290 netlink_sendmsg+0x222/0x490 __sys_sendto+0x1df/0x1f0 __x64_sys_sendto+0x24/0x30 do_syscall_64+0x82/0x160 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7faee60d8e27 Calling netif_device_detach() before reset makes the net core not call the driver when ethtool command is issued, the attempt to execute an ethtool command during reset will result in the following message: netlink error: No such device instead of NULL pointer dereference. Once reset is done and ice_rebuild() is executing, the netif_device_attach() is called to allow for ethtool operations to occur again in a safe manner. Fixes: fcea6f3da546 ("ice: Add stats and ethtool support") Suggested-by: Jakub Kicinski Reviewed-by: Igor Bagnucki Signed-off-by: Dawid Osuchowski Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Reviewed-by: Michal Schmidt Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_main.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 6f97ed471fe9..46d3c5a34d6a 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -608,6 +608,9 @@ ice_prepare_for_reset(struct ice_pf *pf, enum ice_reset_req reset_type) memset(&vsi->mqprio_qopt, 0, sizeof(vsi->mqprio_qopt)); } } + + if (vsi->netdev) + netif_device_detach(vsi->netdev); skip: /* clear SW filtering DB */ @@ -7589,6 +7592,7 @@ static void ice_update_pf_netdev_link(struct ice_pf *pf) */ static void ice_rebuild(struct ice_pf *pf, enum ice_reset_req reset_type) { + struct ice_vsi *vsi = ice_get_main_vsi(pf); struct device *dev = ice_pf_to_dev(pf); struct ice_hw *hw = &pf->hw; bool dvm; @@ -7731,6 +7735,9 @@ static void ice_rebuild(struct ice_pf *pf, enum ice_reset_req reset_type) ice_rebuild_arfs(pf); } + if (vsi && vsi->netdev) + netif_device_attach(vsi->netdev); + ice_update_pf_netdev_link(pf); /* tell the firmware we are up */ From b57d643a673ce54bc1437d1cca25e1909f553a7e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 28 Aug 2024 10:58:21 -0700 Subject: [PATCH 16/61] MAINTAINERS: exclude bluetooth and wireless DT bindings from netdev ML We exclude wireless drivers from the netdev@ traffic, to delegate it to linux-wireless@, and avoid overwhelming netdev@. Bluetooth drivers are implicitly excluded because they live under drivers/bluetooth, not drivers/net. In both cases DT bindings sit under Documentation/devicetree/bindings/net/ and aren't excluded. So if a patch series touches DT bindings netdev@ ends up getting CCed, and these are usually fairly boring series. Reviewed-by: Simon Horman Link: https://patch.msgid.link/20240828175821.2960423-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index a70b7c9c3533..5e5e85841a2f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15892,6 +15892,8 @@ F: include/uapi/linux/ethtool_netlink.h F: include/uapi/linux/if_* F: include/uapi/linux/netdev* F: tools/testing/selftests/drivers/net/ +X: Documentation/devicetree/bindings/net/bluetooth/ +X: Documentation/devicetree/bindings/net/wireless/ X: drivers/net/wireless/ NETWORKING DRIVERS (WIRELESS) From 98d4435efcbf37801a3246fb53856c4b934a2613 Mon Sep 17 00:00:00 2001 From: Jeongjun Park Date: Thu, 29 Aug 2024 12:56:48 +0900 Subject: [PATCH 17/61] net/smc: prevent NULL pointer dereference in txopt_get Since smc_inet6_prot does not initialize ipv6_pinfo_offset, inet6_create() copies an incorrect address value, sk + 0 (offset), to inet_sk(sk)->pinet6. In addition, since inet_sk(sk)->pinet6 and smc_sk(sk)->clcsock practically point to the same address, when smc_create_clcsk() stores the newly created clcsock in smc_sk(sk)->clcsock, inet_sk(sk)->pinet6 is corrupted into clcsock. This causes NULL pointer dereference and various other memory corruptions. To solve this problem, you need to initialize ipv6_pinfo_offset, add a smc6_sock structure, and then add ipv6_pinfo as the second member of the smc_sock structure. Reported-by: syzkaller Fixes: d25a92ccae6b ("net/smc: Introduce IPPROTO_SMC") Signed-off-by: Jeongjun Park Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/smc/smc.h | 3 +++ net/smc/smc_inet.c | 8 +++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/net/smc/smc.h b/net/smc/smc.h index 34b781e463c4..ad77d6b6b8d3 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -284,6 +284,9 @@ struct smc_connection { struct smc_sock { /* smc sock container */ struct sock sk; +#if IS_ENABLED(CONFIG_IPV6) + struct ipv6_pinfo *pinet6; +#endif struct socket *clcsock; /* internal tcp socket */ void (*clcsk_state_change)(struct sock *sk); /* original stat_change fct. */ diff --git a/net/smc/smc_inet.c b/net/smc/smc_inet.c index bece346dd8e9..a5b2041600f9 100644 --- a/net/smc/smc_inet.c +++ b/net/smc/smc_inet.c @@ -60,6 +60,11 @@ static struct inet_protosw smc_inet_protosw = { }; #if IS_ENABLED(CONFIG_IPV6) +struct smc6_sock { + struct smc_sock smc; + struct ipv6_pinfo inet6; +}; + static struct proto smc_inet6_prot = { .name = "INET6_SMC", .owner = THIS_MODULE, @@ -67,9 +72,10 @@ static struct proto smc_inet6_prot = { .hash = smc_hash_sk, .unhash = smc_unhash_sk, .release_cb = smc_release_cb, - .obj_size = sizeof(struct smc_sock), + .obj_size = sizeof(struct smc6_sock), .h.smc_hash = &smc_v6_hashinfo, .slab_flags = SLAB_TYPESAFE_BY_RCU, + .ipv6_pinfo_offset = offsetof(struct smc6_sock, inet6), }; static const struct proto_ops smc_inet6_stream_ops = { From fe1910f9337bd46a9343967b547ccab26b4b2c6e Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 20 Aug 2024 20:07:44 -0700 Subject: [PATCH 18/61] tcp_bpf: fix return value of tcp_bpf_sendmsg() When we cork messages in psock->cork, the last message triggers the flushing will result in sending a sk_msg larger than the current message size. In this case, in tcp_bpf_send_verdict(), 'copied' becomes negative at least in the following case: 468 case __SK_DROP: 469 default: 470 sk_msg_free_partial(sk, msg, tosend); 471 sk_msg_apply_bytes(psock, tosend); 472 *copied -= (tosend + delta); // <==== HERE 473 return -EACCES; Therefore, it could lead to the following BUG with a proper value of 'copied' (thanks to syzbot). We should not use negative 'copied' as a return value here. ------------[ cut here ]------------ kernel BUG at net/socket.c:733! Internal error: Oops - BUG: 00000000f2000800 [#1] PREEMPT SMP Modules linked in: CPU: 0 UID: 0 PID: 3265 Comm: syz-executor510 Not tainted 6.11.0-rc3-syzkaller-00060-gd07b43284ab3 #0 Hardware name: linux,dummy-virt (DT) pstate: 61400009 (nZCv daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--) pc : sock_sendmsg_nosec net/socket.c:733 [inline] pc : sock_sendmsg_nosec net/socket.c:728 [inline] pc : __sock_sendmsg+0x5c/0x60 net/socket.c:745 lr : sock_sendmsg_nosec net/socket.c:730 [inline] lr : __sock_sendmsg+0x54/0x60 net/socket.c:745 sp : ffff800088ea3b30 x29: ffff800088ea3b30 x28: fbf00000062bc900 x27: 0000000000000000 x26: ffff800088ea3bc0 x25: ffff800088ea3bc0 x24: 0000000000000000 x23: f9f00000048dc000 x22: 0000000000000000 x21: ffff800088ea3d90 x20: f9f00000048dc000 x19: ffff800088ea3d90 x18: 0000000000000001 x17: 0000000000000000 x16: 0000000000000000 x15: 000000002002ffaf x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000 x11: 0000000000000000 x10: ffff8000815849c0 x9 : ffff8000815b49c0 x8 : 0000000000000000 x7 : 000000000000003f x6 : 0000000000000000 x5 : 00000000000007e0 x4 : fff07ffffd239000 x3 : fbf00000062bc900 x2 : 0000000000000000 x1 : 0000000000000000 x0 : 00000000fffffdef Call trace: sock_sendmsg_nosec net/socket.c:733 [inline] __sock_sendmsg+0x5c/0x60 net/socket.c:745 ____sys_sendmsg+0x274/0x2ac net/socket.c:2597 ___sys_sendmsg+0xac/0x100 net/socket.c:2651 __sys_sendmsg+0x84/0xe0 net/socket.c:2680 __do_sys_sendmsg net/socket.c:2689 [inline] __se_sys_sendmsg net/socket.c:2687 [inline] __arm64_sys_sendmsg+0x24/0x30 net/socket.c:2687 __invoke_syscall arch/arm64/kernel/syscall.c:35 [inline] invoke_syscall+0x48/0x110 arch/arm64/kernel/syscall.c:49 el0_svc_common.constprop.0+0x40/0xe0 arch/arm64/kernel/syscall.c:132 do_el0_svc+0x1c/0x28 arch/arm64/kernel/syscall.c:151 el0_svc+0x34/0xec arch/arm64/kernel/entry-common.c:712 el0t_64_sync_handler+0x100/0x12c arch/arm64/kernel/entry-common.c:730 el0t_64_sync+0x19c/0x1a0 arch/arm64/kernel/entry.S:598 Code: f9404463 d63f0060 3108441f 54fffe81 (d4210000) ---[ end trace 0000000000000000 ]--- Fixes: 4f738adba30a ("bpf: create tcp_bpf_ulp allowing BPF to monitor socket TX/RX data") Reported-by: syzbot+58c03971700330ce14d8@syzkaller.appspotmail.com Cc: Jakub Sitnicki Signed-off-by: Cong Wang Reviewed-by: John Fastabend Acked-by: Martin KaFai Lau Link: https://patch.msgid.link/20240821030744.320934-1-xiyou.wangcong@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_bpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 53b0d62fd2c2..fe6178715ba0 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -577,7 +577,7 @@ out_err: err = sk_stream_error(sk, msg->msg_flags, err); release_sock(sk); sk_psock_put(sk, psock); - return copied ? copied : err; + return copied > 0 ? copied : err; } enum { From dd885d90c047dbdd2773c1d33954cbd8747d81e2 Mon Sep 17 00:00:00 2001 From: Martin Jocic Date: Fri, 30 Aug 2024 17:31:13 +0200 Subject: [PATCH 19/61] can: kvaser_pciefd: Use a single write when releasing RX buffers Kvaser's PCIe cards uses the KCAN FPGA IP block which has dual 4K buffers for incoming messages shared by all (currently up to eight) channels. While the driver processes messages in one buffer, new incoming messages are stored in the other and so on. The design of KCAN is such that a buffer must be fully read and then released. Releasing a buffer will make the FPGA switch buffers. If the other buffer contains at least one incoming message the FPGA will also instantly issue a new interrupt, if not the interrupt will be issued after receiving the first new message. With IRQx interrupts, it takes a little time for the interrupt to happen, enough for any previous ISR call to do it's business and return, but MSI interrupts are way faster so this time is reduced to almost nothing. So with MSI, releasing the buffer HAS to be the very last action of the ISR before returning, otherwise the new interrupt might be "masked" by the kernel because the previous ISR call hasn't returned. And the interrupts are edge-triggered so we cannot loose one, or the ping-pong reading process will stop. This is why this patch modifies the driver to use a single write to the SRB_CMD register before returning. Signed-off-by: Martin Jocic Reviewed-by: Vincent Mailhol Link: https://patch.msgid.link/20240830153113.2081440-1-martin.jocic@kvaser.com Fixes: 26ad340e582d ("can: kvaser_pciefd: Add driver for Kvaser PCIEcan devices") Signed-off-by: Marc Kleine-Budde --- drivers/net/can/kvaser_pciefd.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/drivers/net/can/kvaser_pciefd.c b/drivers/net/can/kvaser_pciefd.c index a60d9efd5f8d..9ffc3ffb4e8f 100644 --- a/drivers/net/can/kvaser_pciefd.c +++ b/drivers/net/can/kvaser_pciefd.c @@ -1686,6 +1686,7 @@ static irqreturn_t kvaser_pciefd_irq_handler(int irq, void *dev) const struct kvaser_pciefd_irq_mask *irq_mask = pcie->driver_data->irq_mask; u32 pci_irq = ioread32(KVASER_PCIEFD_PCI_IRQ_ADDR(pcie)); u32 srb_irq = 0; + u32 srb_release = 0; int i; if (!(pci_irq & irq_mask->all)) @@ -1699,17 +1700,14 @@ static irqreturn_t kvaser_pciefd_irq_handler(int irq, void *dev) kvaser_pciefd_transmit_irq(pcie->can[i]); } - if (srb_irq & KVASER_PCIEFD_SRB_IRQ_DPD0) { - /* Reset DMA buffer 0, may trigger new interrupt */ - iowrite32(KVASER_PCIEFD_SRB_CMD_RDB0, - KVASER_PCIEFD_SRB_ADDR(pcie) + KVASER_PCIEFD_SRB_CMD_REG); - } + if (srb_irq & KVASER_PCIEFD_SRB_IRQ_DPD0) + srb_release |= KVASER_PCIEFD_SRB_CMD_RDB0; - if (srb_irq & KVASER_PCIEFD_SRB_IRQ_DPD1) { - /* Reset DMA buffer 1, may trigger new interrupt */ - iowrite32(KVASER_PCIEFD_SRB_CMD_RDB1, - KVASER_PCIEFD_SRB_ADDR(pcie) + KVASER_PCIEFD_SRB_CMD_REG); - } + if (srb_irq & KVASER_PCIEFD_SRB_IRQ_DPD1) + srb_release |= KVASER_PCIEFD_SRB_CMD_RDB1; + + if (srb_release) + iowrite32(srb_release, KVASER_PCIEFD_SRB_ADDR(pcie) + KVASER_PCIEFD_SRB_CMD_REG); return IRQ_HANDLED; } From 8ae22de9d2eae3c432de64bf2b3a5a69cf1d1124 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 21 Aug 2024 15:43:40 -0700 Subject: [PATCH 20/61] Bluetooth: qca: If memdump doesn't work, re-enable IBS On systems in the field, we are seeing this sometimes in the kernel logs: Bluetooth: qca_controller_memdump() hci0: hci_devcd_init Return:-95 This means that _something_ decided that it wanted to get a memdump but then hci_devcd_init() returned -EOPNOTSUPP (AKA -95). The cleanup code in qca_controller_memdump() when we get back an error from hci_devcd_init() undoes most things but forgets to clear QCA_IBS_DISABLED. One side effect of this is that, during the next suspend, qca_suspend() will always get a timeout. Let's fix it so that we clear the bit. Fixes: 06d3fdfcdf5c ("Bluetooth: hci_qca: Add qcom devcoredump support") Reviewed-by: Guenter Roeck Reviewed-by: Stephen Boyd Signed-off-by: Douglas Anderson Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/hci_qca.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c index 4b1ad7ea5b95..678f150229e7 100644 --- a/drivers/bluetooth/hci_qca.c +++ b/drivers/bluetooth/hci_qca.c @@ -1091,6 +1091,7 @@ static void qca_controller_memdump(struct work_struct *work) qca->memdump_state = QCA_MEMDUMP_COLLECTED; cancel_delayed_work(&qca->ctrl_memdump_timeout); clear_bit(QCA_MEMDUMP_COLLECTION, &qca->flags); + clear_bit(QCA_IBS_DISABLED, &qca->flags); mutex_unlock(&qca->hci_memdump_lock); return; } From c898f6d7b093bd71e66569cd6797c87d4056f44b Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 26 Aug 2024 15:47:30 -0400 Subject: [PATCH 21/61] Bluetooth: hci_sync: Introduce hci_cmd_sync_run/hci_cmd_sync_run_once This introduces hci_cmd_sync_run/hci_cmd_sync_run_once which acts like hci_cmd_sync_queue/hci_cmd_sync_queue_once but runs immediately when already on hdev->cmd_sync_work context. Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_sync.h | 4 +++ net/bluetooth/hci_sync.c | 42 ++++++++++++++++++++++++++++++-- 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h index 75e052909b5f..f3052cb252ef 100644 --- a/include/net/bluetooth/hci_sync.h +++ b/include/net/bluetooth/hci_sync.h @@ -73,6 +73,10 @@ int hci_cmd_sync_queue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy); int hci_cmd_sync_queue_once(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy); +int hci_cmd_sync_run(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, + void *data, hci_cmd_sync_work_destroy_t destroy); +int hci_cmd_sync_run_once(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, + void *data, hci_cmd_sync_work_destroy_t destroy); struct hci_cmd_sync_work_entry * hci_cmd_sync_lookup_entry(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy); diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index e79cd40bd079..5533e6f561b3 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -112,7 +112,7 @@ static void hci_cmd_sync_add(struct hci_request *req, u16 opcode, u32 plen, skb_queue_tail(&req->cmd_q, skb); } -static int hci_cmd_sync_run(struct hci_request *req) +static int hci_req_sync_run(struct hci_request *req) { struct hci_dev *hdev = req->hdev; struct sk_buff *skb; @@ -169,7 +169,7 @@ struct sk_buff *__hci_cmd_sync_sk(struct hci_dev *hdev, u16 opcode, u32 plen, hdev->req_status = HCI_REQ_PEND; - err = hci_cmd_sync_run(&req); + err = hci_req_sync_run(&req); if (err < 0) return ERR_PTR(err); @@ -782,6 +782,44 @@ int hci_cmd_sync_queue_once(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, } EXPORT_SYMBOL(hci_cmd_sync_queue_once); +/* Run HCI command: + * + * - hdev must be running + * - if on cmd_sync_work then run immediately otherwise queue + */ +int hci_cmd_sync_run(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, + void *data, hci_cmd_sync_work_destroy_t destroy) +{ + /* Only queue command if hdev is running which means it had been opened + * and is either on init phase or is already up. + */ + if (!test_bit(HCI_RUNNING, &hdev->flags)) + return -ENETDOWN; + + /* If on cmd_sync_work then run immediately otherwise queue */ + if (current_work() == &hdev->cmd_sync_work) + return func(hdev, data); + + return hci_cmd_sync_submit(hdev, func, data, destroy); +} +EXPORT_SYMBOL(hci_cmd_sync_run); + +/* Run HCI command entry once: + * + * - Lookup if an entry already exist and only if it doesn't creates a new entry + * and run it. + * - if on cmd_sync_work then run immediately otherwise queue + */ +int hci_cmd_sync_run_once(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, + void *data, hci_cmd_sync_work_destroy_t destroy) +{ + if (hci_cmd_sync_lookup_entry(hdev, func, data, destroy)) + return 0; + + return hci_cmd_sync_run(hdev, func, data, destroy); +} +EXPORT_SYMBOL(hci_cmd_sync_run_once); + /* Lookup HCI command entry: * * - Return first entry that matches by function callback or data or From 227a0cdf4a028a73dc256d0f5144b4808d718893 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 26 Aug 2024 16:14:04 -0400 Subject: [PATCH 22/61] Bluetooth: MGMT: Fix not generating command complete for MGMT_OP_DISCONNECT MGMT_OP_DISCONNECT can be called while mgmt_device_connected has not been called yet, which will cause the connection procedure to be aborted, so mgmt_device_disconnected shall still respond with command complete to MGMT_OP_DISCONNECT and just not emit MGMT_EV_DEVICE_DISCONNECTED since MGMT_EV_DEVICE_CONNECTED was never sent. To fix this MGMT_OP_DISCONNECT is changed to work similarly to other command which do use hci_cmd_sync_queue and then use hci_conn_abort to disconnect and returns the result, in order for hci_conn_abort to be used from hci_cmd_sync context it now uses hci_cmd_sync_run_once. Link: https://github.com/bluez/bluez/issues/932 Fixes: 12d4a3b2ccb3 ("Bluetooth: Move check for MGMT_CONNECTED flag into mgmt.c") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 6 ++- net/bluetooth/mgmt.c | 84 ++++++++++++++++++++-------------------- 2 files changed, 47 insertions(+), 43 deletions(-) diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 8e48ccd2af30..c82502e213a8 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -2952,5 +2952,9 @@ int hci_abort_conn(struct hci_conn *conn, u8 reason) return 0; } - return hci_cmd_sync_queue_once(hdev, abort_conn_sync, conn, NULL); + /* Run immediately if on cmd_sync_work since this may be called + * as a result to MGMT_OP_DISCONNECT/MGMT_OP_UNPAIR which does + * already queue its callback on cmd_sync_work. + */ + return hci_cmd_sync_run_once(hdev, abort_conn_sync, conn, NULL); } diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 25979f4283a6..4c20dbf92c71 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -2921,7 +2921,12 @@ static int unpair_device_sync(struct hci_dev *hdev, void *data) if (!conn) return 0; - return hci_abort_conn_sync(hdev, conn, HCI_ERROR_REMOTE_USER_TERM); + /* Disregard any possible error since the likes of hci_abort_conn_sync + * will clean up the connection no matter the error. + */ + hci_abort_conn(conn, HCI_ERROR_REMOTE_USER_TERM); + + return 0; } static int unpair_device(struct sock *sk, struct hci_dev *hdev, void *data, @@ -3053,13 +3058,44 @@ unlock: return err; } +static void disconnect_complete(struct hci_dev *hdev, void *data, int err) +{ + struct mgmt_pending_cmd *cmd = data; + + cmd->cmd_complete(cmd, mgmt_status(err)); + mgmt_pending_free(cmd); +} + +static int disconnect_sync(struct hci_dev *hdev, void *data) +{ + struct mgmt_pending_cmd *cmd = data; + struct mgmt_cp_disconnect *cp = cmd->param; + struct hci_conn *conn; + + if (cp->addr.type == BDADDR_BREDR) + conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, + &cp->addr.bdaddr); + else + conn = hci_conn_hash_lookup_le(hdev, &cp->addr.bdaddr, + le_addr_type(cp->addr.type)); + + if (!conn) + return -ENOTCONN; + + /* Disregard any possible error since the likes of hci_abort_conn_sync + * will clean up the connection no matter the error. + */ + hci_abort_conn(conn, HCI_ERROR_REMOTE_USER_TERM); + + return 0; +} + static int disconnect(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_cp_disconnect *cp = data; struct mgmt_rp_disconnect rp; struct mgmt_pending_cmd *cmd; - struct hci_conn *conn; int err; bt_dev_dbg(hdev, "sock %p", sk); @@ -3082,27 +3118,7 @@ static int disconnect(struct sock *sk, struct hci_dev *hdev, void *data, goto failed; } - if (pending_find(MGMT_OP_DISCONNECT, hdev)) { - err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_DISCONNECT, - MGMT_STATUS_BUSY, &rp, sizeof(rp)); - goto failed; - } - - if (cp->addr.type == BDADDR_BREDR) - conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, - &cp->addr.bdaddr); - else - conn = hci_conn_hash_lookup_le(hdev, &cp->addr.bdaddr, - le_addr_type(cp->addr.type)); - - if (!conn || conn->state == BT_OPEN || conn->state == BT_CLOSED) { - err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_DISCONNECT, - MGMT_STATUS_NOT_CONNECTED, &rp, - sizeof(rp)); - goto failed; - } - - cmd = mgmt_pending_add(sk, MGMT_OP_DISCONNECT, hdev, data, len); + cmd = mgmt_pending_new(sk, MGMT_OP_DISCONNECT, hdev, data, len); if (!cmd) { err = -ENOMEM; goto failed; @@ -3110,9 +3126,10 @@ static int disconnect(struct sock *sk, struct hci_dev *hdev, void *data, cmd->cmd_complete = generic_cmd_complete; - err = hci_disconnect(conn, HCI_ERROR_REMOTE_USER_TERM); + err = hci_cmd_sync_queue(hdev, disconnect_sync, cmd, + disconnect_complete); if (err < 0) - mgmt_pending_remove(cmd); + mgmt_pending_free(cmd); failed: hci_dev_unlock(hdev); @@ -9689,18 +9706,6 @@ void mgmt_device_connected(struct hci_dev *hdev, struct hci_conn *conn, mgmt_event_skb(skb, NULL); } -static void disconnect_rsp(struct mgmt_pending_cmd *cmd, void *data) -{ - struct sock **sk = data; - - cmd->cmd_complete(cmd, 0); - - *sk = cmd->sk; - sock_hold(*sk); - - mgmt_pending_remove(cmd); -} - static void unpair_device_rsp(struct mgmt_pending_cmd *cmd, void *data) { struct hci_dev *hdev = data; @@ -9744,8 +9749,6 @@ void mgmt_device_disconnected(struct hci_dev *hdev, bdaddr_t *bdaddr, if (link_type != ACL_LINK && link_type != LE_LINK) return; - mgmt_pending_foreach(MGMT_OP_DISCONNECT, hdev, disconnect_rsp, &sk); - bacpy(&ev.addr.bdaddr, bdaddr); ev.addr.type = link_to_bdaddr(link_type, addr_type); ev.reason = reason; @@ -9758,9 +9761,6 @@ void mgmt_device_disconnected(struct hci_dev *hdev, bdaddr_t *bdaddr, if (sk) sock_put(sk); - - mgmt_pending_foreach(MGMT_OP_UNPAIR_DEVICE, hdev, unpair_device_rsp, - hdev); } void mgmt_disconnect_failed(struct hci_dev *hdev, bdaddr_t *bdaddr, From 532f8bcd1c2c4e8112f62e1922fd1703bc0ffce0 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 27 Aug 2024 14:37:22 -0400 Subject: [PATCH 23/61] Revert "Bluetooth: MGMT/SMP: Fix address type when using SMP over BREDR/LE" This reverts commit 59b047bc98084f8af2c41483e4d68a5adf2fa7f7 which breaks compatibility with commands like: bluetoothd[46328]: @ MGMT Command: Load.. (0x0013) plen 74 {0x0001} [hci0] Keys: 2 BR/EDR Address: C0:DC:DA:A5:E5:47 (Samsung Electronics Co.,Ltd) Key type: Authenticated key from P-256 (0x03) Central: 0x00 Encryption size: 16 Diversifier[2]: 0000 Randomizer[8]: 0000000000000000 Key[16]: 6ed96089bd9765be2f2c971b0b95f624 LE Address: D7:2A:DE:1E:73:A2 (Static) Key type: Unauthenticated key from P-256 (0x02) Central: 0x00 Encryption size: 16 Diversifier[2]: 0000 Randomizer[8]: 0000000000000000 Key[16]: 87dd2546ededda380ffcdc0a8faa4597 @ MGMT Event: Command Status (0x0002) plen 3 {0x0001} [hci0] Load Long Term Keys (0x0013) Status: Invalid Parameters (0x0d) Cc: stable@vger.kernel.org Link: https://github.com/bluez/bluez/issues/875 Fixes: 59b047bc9808 ("Bluetooth: MGMT/SMP: Fix address type when using SMP over BREDR/LE") Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 5 ----- net/bluetooth/mgmt.c | 25 +++++++------------------ net/bluetooth/smp.c | 7 ------- 3 files changed, 7 insertions(+), 30 deletions(-) diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index e449dba698f3..1a32e602630e 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -186,7 +186,6 @@ struct blocked_key { struct smp_csrk { bdaddr_t bdaddr; u8 bdaddr_type; - u8 link_type; u8 type; u8 val[16]; }; @@ -196,7 +195,6 @@ struct smp_ltk { struct rcu_head rcu; bdaddr_t bdaddr; u8 bdaddr_type; - u8 link_type; u8 authenticated; u8 type; u8 enc_size; @@ -211,7 +209,6 @@ struct smp_irk { bdaddr_t rpa; bdaddr_t bdaddr; u8 addr_type; - u8 link_type; u8 val[16]; }; @@ -219,8 +216,6 @@ struct link_key { struct list_head list; struct rcu_head rcu; bdaddr_t bdaddr; - u8 bdaddr_type; - u8 link_type; u8 type; u8 val[HCI_LINK_KEY_SIZE]; u8 pin_len; diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 4c20dbf92c71..240dd8cf7c7d 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -2833,8 +2833,7 @@ static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data, for (i = 0; i < key_count; i++) { struct mgmt_link_key_info *key = &cp->keys[i]; - /* Considering SMP over BREDR/LE, there is no need to check addr_type */ - if (key->type > 0x08) + if (key->addr.type != BDADDR_BREDR || key->type > 0x08) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS, MGMT_STATUS_INVALID_PARAMS); @@ -7089,7 +7088,6 @@ static int load_irks(struct sock *sk, struct hci_dev *hdev, void *cp_data, for (i = 0; i < irk_count; i++) { struct mgmt_irk_info *irk = &cp->irks[i]; - u8 addr_type = le_addr_type(irk->addr.type); if (hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_IRK, @@ -7099,12 +7097,8 @@ static int load_irks(struct sock *sk, struct hci_dev *hdev, void *cp_data, continue; } - /* When using SMP over BR/EDR, the addr type should be set to BREDR */ - if (irk->addr.type == BDADDR_BREDR) - addr_type = BDADDR_BREDR; - hci_add_irk(hdev, &irk->addr.bdaddr, - addr_type, irk->val, + le_addr_type(irk->addr.type), irk->val, BDADDR_ANY); } @@ -7185,7 +7179,6 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, for (i = 0; i < key_count; i++) { struct mgmt_ltk_info *key = &cp->keys[i]; u8 type, authenticated; - u8 addr_type = le_addr_type(key->addr.type); if (hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_LTK, @@ -7220,12 +7213,8 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, continue; } - /* When using SMP over BR/EDR, the addr type should be set to BREDR */ - if (key->addr.type == BDADDR_BREDR) - addr_type = BDADDR_BREDR; - hci_add_ltk(hdev, &key->addr.bdaddr, - addr_type, type, authenticated, + le_addr_type(key->addr.type), type, authenticated, key->val, key->enc_size, key->ediv, key->rand); } @@ -9519,7 +9508,7 @@ void mgmt_new_link_key(struct hci_dev *hdev, struct link_key *key, ev.store_hint = persistent; bacpy(&ev.key.addr.bdaddr, &key->bdaddr); - ev.key.addr.type = link_to_bdaddr(key->link_type, key->bdaddr_type); + ev.key.addr.type = BDADDR_BREDR; ev.key.type = key->type; memcpy(ev.key.val, key->val, HCI_LINK_KEY_SIZE); ev.key.pin_len = key->pin_len; @@ -9570,7 +9559,7 @@ void mgmt_new_ltk(struct hci_dev *hdev, struct smp_ltk *key, bool persistent) ev.store_hint = persistent; bacpy(&ev.key.addr.bdaddr, &key->bdaddr); - ev.key.addr.type = link_to_bdaddr(key->link_type, key->bdaddr_type); + ev.key.addr.type = link_to_bdaddr(LE_LINK, key->bdaddr_type); ev.key.type = mgmt_ltk_type(key); ev.key.enc_size = key->enc_size; ev.key.ediv = key->ediv; @@ -9599,7 +9588,7 @@ void mgmt_new_irk(struct hci_dev *hdev, struct smp_irk *irk, bool persistent) bacpy(&ev.rpa, &irk->rpa); bacpy(&ev.irk.addr.bdaddr, &irk->bdaddr); - ev.irk.addr.type = link_to_bdaddr(irk->link_type, irk->addr_type); + ev.irk.addr.type = link_to_bdaddr(LE_LINK, irk->addr_type); memcpy(ev.irk.val, irk->val, sizeof(irk->val)); mgmt_event(MGMT_EV_NEW_IRK, hdev, &ev, sizeof(ev), NULL); @@ -9628,7 +9617,7 @@ void mgmt_new_csrk(struct hci_dev *hdev, struct smp_csrk *csrk, ev.store_hint = persistent; bacpy(&ev.key.addr.bdaddr, &csrk->bdaddr); - ev.key.addr.type = link_to_bdaddr(csrk->link_type, csrk->bdaddr_type); + ev.key.addr.type = link_to_bdaddr(LE_LINK, csrk->bdaddr_type); ev.key.type = csrk->type; memcpy(ev.key.val, csrk->val, sizeof(csrk->val)); diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 4f9fdf400584..8b9724fd752a 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -1060,7 +1060,6 @@ static void smp_notify_keys(struct l2cap_conn *conn) } if (smp->remote_irk) { - smp->remote_irk->link_type = hcon->type; mgmt_new_irk(hdev, smp->remote_irk, persistent); /* Now that user space can be considered to know the @@ -1080,28 +1079,24 @@ static void smp_notify_keys(struct l2cap_conn *conn) } if (smp->csrk) { - smp->csrk->link_type = hcon->type; smp->csrk->bdaddr_type = hcon->dst_type; bacpy(&smp->csrk->bdaddr, &hcon->dst); mgmt_new_csrk(hdev, smp->csrk, persistent); } if (smp->responder_csrk) { - smp->responder_csrk->link_type = hcon->type; smp->responder_csrk->bdaddr_type = hcon->dst_type; bacpy(&smp->responder_csrk->bdaddr, &hcon->dst); mgmt_new_csrk(hdev, smp->responder_csrk, persistent); } if (smp->ltk) { - smp->ltk->link_type = hcon->type; smp->ltk->bdaddr_type = hcon->dst_type; bacpy(&smp->ltk->bdaddr, &hcon->dst); mgmt_new_ltk(hdev, smp->ltk, persistent); } if (smp->responder_ltk) { - smp->responder_ltk->link_type = hcon->type; smp->responder_ltk->bdaddr_type = hcon->dst_type; bacpy(&smp->responder_ltk->bdaddr, &hcon->dst); mgmt_new_ltk(hdev, smp->responder_ltk, persistent); @@ -1121,8 +1116,6 @@ static void smp_notify_keys(struct l2cap_conn *conn) key = hci_add_link_key(hdev, smp->conn->hcon, &hcon->dst, smp->link_key, type, 0, &persistent); if (key) { - key->link_type = hcon->type; - key->bdaddr_type = hcon->dst_type; mgmt_new_link_key(hdev, key, persistent); /* Don't keep debug keys around if the relevant From 1e9683c9b6ca88cc9340cdca85edd6134c8cffe3 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 27 Aug 2024 15:01:34 -0400 Subject: [PATCH 24/61] Bluetooth: MGMT: Ignore keys being loaded with invalid type Due to 59b047bc98084f8af2c41483e4d68a5adf2fa7f7 there could be keys stored with the wrong address type so this attempt to detect it and ignore them instead of just failing to load all keys. Cc: stable@vger.kernel.org Link: https://github.com/bluez/bluez/issues/875 Fixes: 59b047bc9808 ("Bluetooth: MGMT/SMP: Fix address type when using SMP over BREDR/LE") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/mgmt.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 240dd8cf7c7d..279902e8bd8a 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -2830,15 +2830,6 @@ static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "debug_keys %u key_count %u", cp->debug_keys, key_count); - for (i = 0; i < key_count; i++) { - struct mgmt_link_key_info *key = &cp->keys[i]; - - if (key->addr.type != BDADDR_BREDR || key->type > 0x08) - return mgmt_cmd_status(sk, hdev->id, - MGMT_OP_LOAD_LINK_KEYS, - MGMT_STATUS_INVALID_PARAMS); - } - hci_dev_lock(hdev); hci_link_keys_clear(hdev); @@ -2863,6 +2854,19 @@ static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data, continue; } + if (key->addr.type != BDADDR_BREDR) { + bt_dev_warn(hdev, + "Invalid link address type %u for %pMR", + key->addr.type, &key->addr.bdaddr); + continue; + } + + if (key->type > 0x08) { + bt_dev_warn(hdev, "Invalid link key type %u for %pMR", + key->type, &key->addr.bdaddr); + continue; + } + /* Always ignore debug keys and require a new pairing if * the user wants to use them. */ @@ -7163,15 +7167,6 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, bt_dev_dbg(hdev, "key_count %u", key_count); - for (i = 0; i < key_count; i++) { - struct mgmt_ltk_info *key = &cp->keys[i]; - - if (!ltk_is_valid(key)) - return mgmt_cmd_status(sk, hdev->id, - MGMT_OP_LOAD_LONG_TERM_KEYS, - MGMT_STATUS_INVALID_PARAMS); - } - hci_dev_lock(hdev); hci_smp_ltks_clear(hdev); @@ -7188,6 +7183,12 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, continue; } + if (!ltk_is_valid(key)) { + bt_dev_warn(hdev, "Invalid LTK for %pMR", + &key->addr.bdaddr); + continue; + } + switch (key->type) { case MGMT_LTK_UNAUTHENTICATED: authenticated = 0x00; From 4fa9c5181cfe083d0beefb5157a643560e7bd152 Mon Sep 17 00:00:00 2001 From: Matt Johnston Date: Thu, 29 Aug 2024 15:43:45 +0800 Subject: [PATCH 25/61] net: mctp-serial: Add kunit test for next_chunk_len() Test various edge cases of inputs that contain characters that need escaping. This adds a new kunit suite for mctp-serial. Signed-off-by: Matt Johnston Signed-off-by: David S. Miller --- drivers/net/mctp/Kconfig | 5 ++ drivers/net/mctp/mctp-serial.c | 109 +++++++++++++++++++++++++++++++++ 2 files changed, 114 insertions(+) diff --git a/drivers/net/mctp/Kconfig b/drivers/net/mctp/Kconfig index ce9d2d2ccf3b..15860d6ac39f 100644 --- a/drivers/net/mctp/Kconfig +++ b/drivers/net/mctp/Kconfig @@ -21,6 +21,11 @@ config MCTP_SERIAL Say y here if you need to connect to MCTP endpoints over serial. To compile as a module, use m; the module will be called mctp-serial. +config MCTP_SERIAL_TEST + bool "MCTP serial tests" if !KUNIT_ALL_TESTS + depends on MCTP_SERIAL=y && KUNIT=y + default KUNIT_ALL_TESTS + config MCTP_TRANSPORT_I2C tristate "MCTP SMBus/I2C transport" # i2c-mux is optional, but we must build as a module if i2c-mux is a module diff --git a/drivers/net/mctp/mctp-serial.c b/drivers/net/mctp/mctp-serial.c index 5bf6fdff701c..7a40d07ff77b 100644 --- a/drivers/net/mctp/mctp-serial.c +++ b/drivers/net/mctp/mctp-serial.c @@ -521,3 +521,112 @@ module_exit(mctp_serial_exit); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Jeremy Kerr "); MODULE_DESCRIPTION("MCTP Serial transport"); + +#if IS_ENABLED(CONFIG_MCTP_SERIAL_TEST) +#include + +#define MAX_CHUNKS 6 +struct test_chunk_tx { + u8 input_len; + u8 input[MCTP_SERIAL_MTU]; + u8 chunks[MAX_CHUNKS]; +}; + +static void test_next_chunk_len(struct kunit *test) +{ + struct mctp_serial devx; + struct mctp_serial *dev = &devx; + int next; + + const struct test_chunk_tx *params = test->param_value; + + memset(dev, 0x0, sizeof(*dev)); + memcpy(dev->txbuf, params->input, params->input_len); + dev->txlen = params->input_len; + + for (size_t i = 0; i < MAX_CHUNKS; i++) { + next = next_chunk_len(dev); + dev->txpos += next; + KUNIT_EXPECT_EQ(test, next, params->chunks[i]); + + if (next == 0) { + KUNIT_EXPECT_EQ(test, dev->txpos, dev->txlen); + return; + } + } + + KUNIT_FAIL_AND_ABORT(test, "Ran out of chunks"); +} + +static struct test_chunk_tx chunk_tx_tests[] = { + { + .input_len = 5, + .input = { 0x00, 0x11, 0x22, 0x7e, 0x80 }, + .chunks = { 3, 1, 1, 0}, + }, + { + .input_len = 5, + .input = { 0x00, 0x11, 0x22, 0x7e, 0x7d }, + .chunks = { 3, 1, 1, 0}, + }, + { + .input_len = 3, + .input = { 0x7e, 0x11, 0x22, }, + .chunks = { 1, 2, 0}, + }, + { + .input_len = 3, + .input = { 0x7e, 0x7e, 0x7d, }, + .chunks = { 1, 1, 1, 0}, + }, + { + .input_len = 4, + .input = { 0x7e, 0x7e, 0x00, 0x7d, }, + .chunks = { 1, 1, 1, 1, 0}, + }, + { + .input_len = 6, + .input = { 0x7e, 0x7e, 0x00, 0x7d, 0x10, 0x10}, + .chunks = { 1, 1, 1, 1, 2, 0}, + }, + { + .input_len = 1, + .input = { 0x7e }, + .chunks = { 1, 0 }, + }, + { + .input_len = 1, + .input = { 0x80 }, + .chunks = { 1, 0 }, + }, + { + .input_len = 3, + .input = { 0x80, 0x80, 0x00 }, + .chunks = { 3, 0 }, + }, + { + .input_len = 7, + .input = { 0x01, 0x00, 0x08, 0xc8, 0x00, 0x80, 0x02 }, + .chunks = { 7, 0 }, + }, + { + .input_len = 7, + .input = { 0x01, 0x00, 0x08, 0xc8, 0x7e, 0x80, 0x02 }, + .chunks = { 4, 1, 2, 0 }, + }, +}; + +KUNIT_ARRAY_PARAM(chunk_tx, chunk_tx_tests, NULL); + +static struct kunit_case mctp_serial_test_cases[] = { + KUNIT_CASE_PARAM(test_next_chunk_len, chunk_tx_gen_params), +}; + +static struct kunit_suite mctp_serial_test_suite = { + .name = "mctp_serial", + .test_cases = mctp_serial_test_cases, +}; + +kunit_test_suite(mctp_serial_test_suite); + +#endif /* CONFIG_MCTP_SERIAL_TEST */ From f962e8361adfa84e8252d3fc3e5e6bb879f029b1 Mon Sep 17 00:00:00 2001 From: Matt Johnston Date: Thu, 29 Aug 2024 15:43:46 +0800 Subject: [PATCH 26/61] net: mctp-serial: Fix missing escapes on transmit 0x7d and 0x7e bytes are meant to be escaped in the data portion of frames, but this didn't occur since next_chunk_len() had an off-by-one error. That also resulted in the final byte of a payload being written as a separate tty write op. The chunk prior to an escaped byte would be one byte short, and the next call would never test the txpos+1 case, which is where the escaped byte was located. That meant it never hit the escaping case in mctp_serial_tx_work(). Example Input: 01 00 08 c8 7e 80 02 Previous incorrect chunks from next_chunk_len(): 01 00 08 c8 7e 80 02 With this fix: 01 00 08 c8 7e 80 02 Cc: stable@vger.kernel.org Fixes: a0c2ccd9b5ad ("mctp: Add MCTP-over-serial transport binding") Signed-off-by: Matt Johnston Signed-off-by: David S. Miller --- drivers/net/mctp/mctp-serial.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/mctp/mctp-serial.c b/drivers/net/mctp/mctp-serial.c index 7a40d07ff77b..f39bbe255497 100644 --- a/drivers/net/mctp/mctp-serial.c +++ b/drivers/net/mctp/mctp-serial.c @@ -91,8 +91,8 @@ static int next_chunk_len(struct mctp_serial *dev) * will be those non-escaped bytes, and does not include the escaped * byte. */ - for (i = 1; i + dev->txpos + 1 < dev->txlen; i++) { - if (needs_escape(dev->txbuf[dev->txpos + i + 1])) + for (i = 1; i + dev->txpos < dev->txlen; i++) { + if (needs_escape(dev->txbuf[dev->txpos + i])) break; } From a3c1e45156ad39f225cd7ddae0f81230a3b1e657 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jens=20Emil=20Schulz=20=C3=98stergaard?= Date: Thu, 29 Aug 2024 11:52:54 +0200 Subject: [PATCH 27/61] net: microchip: vcap: Fix use-after-free error in kunit test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a clear use-after-free error. We remove it, and rely on checking the return code of vcap_del_rule. Reported-by: Dan Carpenter Closes: https://lore.kernel.org/kernel-janitors/7bffefc6-219a-4f71-baa0-ad4526e5c198@kili.mountain/ Fixes: c956b9b318d9 ("net: microchip: sparx5: Adding KUNIT tests of key/action values in VCAP API") Signed-off-by: Jens Emil Schulz Østergaard Signed-off-by: David S. Miller --- .../net/ethernet/microchip/vcap/vcap_api_kunit.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/microchip/vcap/vcap_api_kunit.c b/drivers/net/ethernet/microchip/vcap/vcap_api_kunit.c index 51d9423b08a6..f2a5a36fdacd 100644 --- a/drivers/net/ethernet/microchip/vcap/vcap_api_kunit.c +++ b/drivers/net/ethernet/microchip/vcap/vcap_api_kunit.c @@ -1442,18 +1442,8 @@ static void vcap_api_encode_rule_test(struct kunit *test) vcap_enable_lookups(&test_vctrl, &test_netdev, 0, 0, rule->cookie, false); - vcap_free_rule(rule); - - /* Check that the rule has been freed: tricky to access since this - * memory should not be accessible anymore - */ - KUNIT_EXPECT_PTR_NE(test, NULL, rule); - ret = list_empty(&rule->keyfields); - KUNIT_EXPECT_EQ(test, true, ret); - ret = list_empty(&rule->actionfields); - KUNIT_EXPECT_EQ(test, true, ret); - - vcap_del_rule(&test_vctrl, &test_netdev, id); + ret = vcap_del_rule(&test_vctrl, &test_netdev, id); + KUNIT_EXPECT_EQ(test, 0, ret); } static void vcap_api_set_rule_counter_test(struct kunit *test) From ef4a99a0164e3972abb421cbb1b09ea6c61414df Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 29 Aug 2024 22:22:45 +0300 Subject: [PATCH 28/61] igc: Unlock on error in igc_io_resume() Call rtnl_unlock() on this error path, before returning. Fixes: bc23aa949aeb ("igc: Add pcie error handler support") Signed-off-by: Dan Carpenter Reviewed-by: Gerhard Engleder Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/igc/igc_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index dfd6c00b4205..0a095cdea4fb 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -7413,6 +7413,7 @@ static void igc_io_resume(struct pci_dev *pdev) rtnl_lock(); if (netif_running(netdev)) { if (igc_open(netdev)) { + rtnl_unlock(); netdev_err(netdev, "igc_open failed after reset\n"); return; } From d3e154d7776ba57ab679fb816fb87b627fba21c9 Mon Sep 17 00:00:00 2001 From: Baochen Qiang Date: Fri, 30 Aug 2024 15:34:19 +0800 Subject: [PATCH 29/61] Revert "wifi: ath11k: restore country code during resume" This reverts commit 7f0343b7b8710436c1e6355c71782d32ada47e0c. We are going to revert commit 166a490f59ac ("wifi: ath11k: support hibernation"), on which this commit depends. With that commit reverted, this one is not needed any more, so revert this commit first. Signed-off-by: Baochen Qiang Acked-by: Jeff Johnson Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20240830073420.5790-2-quic_bqiang@quicinc.com --- drivers/net/wireless/ath/ath11k/core.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/drivers/net/wireless/ath/ath11k/core.c b/drivers/net/wireless/ath/ath11k/core.c index 03187df26000..325b930aaf06 100644 --- a/drivers/net/wireless/ath/ath11k/core.c +++ b/drivers/net/wireless/ath/ath11k/core.c @@ -1009,16 +1009,6 @@ int ath11k_core_resume(struct ath11k_base *ab) return -ETIMEDOUT; } - if (ab->hw_params.current_cc_support && - ar->alpha2[0] != 0 && ar->alpha2[1] != 0) { - ret = ath11k_reg_set_cc(ar); - if (ret) { - ath11k_warn(ab, "failed to set country code during resume: %d\n", - ret); - return ret; - } - } - ret = ath11k_dp_rx_pktlog_start(ab); if (ret) ath11k_warn(ab, "failed to start rx pktlog during resume: %d\n", From 2f833e8948d6c88a3a257d4e426c9897b4907d5a Mon Sep 17 00:00:00 2001 From: Baochen Qiang Date: Fri, 30 Aug 2024 15:34:20 +0800 Subject: [PATCH 30/61] Revert "wifi: ath11k: support hibernation" This reverts commit 166a490f59ac10340ee5330e51c15188ce2a7f8f. There are several reports that this commit breaks system suspend on some specific Lenovo platforms. Since there is no fix available, for now revert this commit to make suspend work again on those platforms. Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219196 Closes: https://bugzilla.redhat.com/show_bug.cgi?id=2301921 Cc: # 6.10.x: d3e154d7776b: Revert "wifi: ath11k: restore country code during resume" Cc: # 6.10.x Signed-off-by: Baochen Qiang Acked-by: Jeff Johnson Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20240830073420.5790-3-quic_bqiang@quicinc.com --- drivers/net/wireless/ath/ath11k/ahb.c | 4 +- drivers/net/wireless/ath/ath11k/core.c | 111 ++++++++----------------- drivers/net/wireless/ath/ath11k/core.h | 4 - drivers/net/wireless/ath/ath11k/hif.h | 12 +-- drivers/net/wireless/ath/ath11k/mhi.c | 12 +-- drivers/net/wireless/ath/ath11k/mhi.h | 3 +- drivers/net/wireless/ath/ath11k/pci.c | 44 ++-------- drivers/net/wireless/ath/ath11k/qmi.c | 2 +- 8 files changed, 51 insertions(+), 141 deletions(-) diff --git a/drivers/net/wireless/ath/ath11k/ahb.c b/drivers/net/wireless/ath/ath11k/ahb.c index 634d385fd9ad..97b12f51ef28 100644 --- a/drivers/net/wireless/ath/ath11k/ahb.c +++ b/drivers/net/wireless/ath/ath11k/ahb.c @@ -413,7 +413,7 @@ static int ath11k_ahb_power_up(struct ath11k_base *ab) return ret; } -static void ath11k_ahb_power_down(struct ath11k_base *ab, bool is_suspend) +static void ath11k_ahb_power_down(struct ath11k_base *ab) { struct ath11k_ahb *ab_ahb = ath11k_ahb_priv(ab); @@ -1280,7 +1280,7 @@ static void ath11k_ahb_remove(struct platform_device *pdev) struct ath11k_base *ab = platform_get_drvdata(pdev); if (test_bit(ATH11K_FLAG_QMI_FAIL, &ab->dev_flags)) { - ath11k_ahb_power_down(ab, false); + ath11k_ahb_power_down(ab); ath11k_debugfs_soc_destroy(ab); ath11k_qmi_deinit_service(ab); goto qmi_fail; diff --git a/drivers/net/wireless/ath/ath11k/core.c b/drivers/net/wireless/ath/ath11k/core.c index 325b930aaf06..ccf4ad35fdc3 100644 --- a/drivers/net/wireless/ath/ath11k/core.c +++ b/drivers/net/wireless/ath/ath11k/core.c @@ -906,6 +906,12 @@ int ath11k_core_suspend(struct ath11k_base *ab) return ret; } + ret = ath11k_wow_enable(ab); + if (ret) { + ath11k_warn(ab, "failed to enable wow during suspend: %d\n", ret); + return ret; + } + ret = ath11k_dp_rx_pktlog_stop(ab, false); if (ret) { ath11k_warn(ab, "failed to stop dp rx pktlog during suspend: %d\n", @@ -916,85 +922,29 @@ int ath11k_core_suspend(struct ath11k_base *ab) ath11k_ce_stop_shadow_timers(ab); ath11k_dp_stop_shadow_timers(ab); - /* PM framework skips suspend_late/resume_early callbacks - * if other devices report errors in their suspend callbacks. - * However ath11k_core_resume() would still be called because - * here we return success thus kernel put us on dpm_suspended_list. - * Since we won't go through a power down/up cycle, there is - * no chance to call complete(&ab->restart_completed) in - * ath11k_core_restart(), making ath11k_core_resume() timeout. - * So call it here to avoid this issue. This also works in case - * no error happens thus suspend_late/resume_early get called, - * because it will be reinitialized in ath11k_core_resume_early(). - */ - complete(&ab->restart_completed); + ath11k_hif_irq_disable(ab); + ath11k_hif_ce_irq_disable(ab); + + ret = ath11k_hif_suspend(ab); + if (ret) { + ath11k_warn(ab, "failed to suspend hif: %d\n", ret); + return ret; + } return 0; } EXPORT_SYMBOL(ath11k_core_suspend); -int ath11k_core_suspend_late(struct ath11k_base *ab) -{ - struct ath11k_pdev *pdev; - struct ath11k *ar; - - if (!ab->hw_params.supports_suspend) - return -EOPNOTSUPP; - - /* so far single_pdev_only chips have supports_suspend as true - * and only the first pdev is valid. - */ - pdev = ath11k_core_get_single_pdev(ab); - ar = pdev->ar; - if (!ar || ar->state != ATH11K_STATE_OFF) - return 0; - - ath11k_hif_irq_disable(ab); - ath11k_hif_ce_irq_disable(ab); - - ath11k_hif_power_down(ab, true); - - return 0; -} -EXPORT_SYMBOL(ath11k_core_suspend_late); - -int ath11k_core_resume_early(struct ath11k_base *ab) -{ - int ret; - struct ath11k_pdev *pdev; - struct ath11k *ar; - - if (!ab->hw_params.supports_suspend) - return -EOPNOTSUPP; - - /* so far single_pdev_only chips have supports_suspend as true - * and only the first pdev is valid. - */ - pdev = ath11k_core_get_single_pdev(ab); - ar = pdev->ar; - if (!ar || ar->state != ATH11K_STATE_OFF) - return 0; - - reinit_completion(&ab->restart_completed); - ret = ath11k_hif_power_up(ab); - if (ret) - ath11k_warn(ab, "failed to power up hif during resume: %d\n", ret); - - return ret; -} -EXPORT_SYMBOL(ath11k_core_resume_early); - int ath11k_core_resume(struct ath11k_base *ab) { int ret; struct ath11k_pdev *pdev; struct ath11k *ar; - long time_left; if (!ab->hw_params.supports_suspend) return -EOPNOTSUPP; - /* so far single_pdev_only chips have supports_suspend as true + /* so far signle_pdev_only chips have supports_suspend as true * and only the first pdev is valid. */ pdev = ath11k_core_get_single_pdev(ab); @@ -1002,19 +952,29 @@ int ath11k_core_resume(struct ath11k_base *ab) if (!ar || ar->state != ATH11K_STATE_OFF) return 0; - time_left = wait_for_completion_timeout(&ab->restart_completed, - ATH11K_RESET_TIMEOUT_HZ); - if (time_left == 0) { - ath11k_warn(ab, "timeout while waiting for restart complete"); - return -ETIMEDOUT; + ret = ath11k_hif_resume(ab); + if (ret) { + ath11k_warn(ab, "failed to resume hif during resume: %d\n", ret); + return ret; } + ath11k_hif_ce_irq_enable(ab); + ath11k_hif_irq_enable(ab); + ret = ath11k_dp_rx_pktlog_start(ab); - if (ret) + if (ret) { ath11k_warn(ab, "failed to start rx pktlog during resume: %d\n", ret); + return ret; + } - return ret; + ret = ath11k_wow_wakeup(ab); + if (ret) { + ath11k_warn(ab, "failed to wakeup wow during resume: %d\n", ret); + return ret; + } + + return 0; } EXPORT_SYMBOL(ath11k_core_resume); @@ -2109,8 +2069,6 @@ static void ath11k_core_restart(struct work_struct *work) if (!ab->is_reset) ath11k_core_post_reconfigure_recovery(ab); - - complete(&ab->restart_completed); } static void ath11k_core_reset(struct work_struct *work) @@ -2180,7 +2138,7 @@ static void ath11k_core_reset(struct work_struct *work) ath11k_hif_irq_disable(ab); ath11k_hif_ce_irq_disable(ab); - ath11k_hif_power_down(ab, false); + ath11k_hif_power_down(ab); ath11k_hif_power_up(ab); ath11k_dbg(ab, ATH11K_DBG_BOOT, "reset started\n"); @@ -2253,7 +2211,7 @@ void ath11k_core_deinit(struct ath11k_base *ab) mutex_unlock(&ab->core_lock); - ath11k_hif_power_down(ab, false); + ath11k_hif_power_down(ab); ath11k_mac_destroy(ab); ath11k_core_soc_destroy(ab); ath11k_fw_destroy(ab); @@ -2306,7 +2264,6 @@ struct ath11k_base *ath11k_core_alloc(struct device *dev, size_t priv_size, timer_setup(&ab->rx_replenish_retry, ath11k_ce_rx_replenish_retry, 0); init_completion(&ab->htc_suspend); init_completion(&ab->wow.wakeup_completed); - init_completion(&ab->restart_completed); ab->dev = dev; ab->hif.bus = bus; diff --git a/drivers/net/wireless/ath/ath11k/core.h b/drivers/net/wireless/ath/ath11k/core.h index df24f0e409af..b655967a465b 100644 --- a/drivers/net/wireless/ath/ath11k/core.h +++ b/drivers/net/wireless/ath/ath11k/core.h @@ -1036,8 +1036,6 @@ struct ath11k_base { DECLARE_BITMAP(fw_features, ATH11K_FW_FEATURE_COUNT); } fw; - struct completion restart_completed; - #ifdef CONFIG_NL80211_TESTMODE struct { u32 data_pos; @@ -1237,10 +1235,8 @@ void ath11k_core_free_bdf(struct ath11k_base *ab, struct ath11k_board_data *bd); int ath11k_core_check_dt(struct ath11k_base *ath11k); int ath11k_core_check_smbios(struct ath11k_base *ab); void ath11k_core_halt(struct ath11k *ar); -int ath11k_core_resume_early(struct ath11k_base *ab); int ath11k_core_resume(struct ath11k_base *ab); int ath11k_core_suspend(struct ath11k_base *ab); -int ath11k_core_suspend_late(struct ath11k_base *ab); void ath11k_core_pre_reconfigure_recovery(struct ath11k_base *ab); bool ath11k_core_coldboot_cal_support(struct ath11k_base *ab); diff --git a/drivers/net/wireless/ath/ath11k/hif.h b/drivers/net/wireless/ath/ath11k/hif.h index c4c6cc09c7c1..674ff772b181 100644 --- a/drivers/net/wireless/ath/ath11k/hif.h +++ b/drivers/net/wireless/ath/ath11k/hif.h @@ -18,7 +18,7 @@ struct ath11k_hif_ops { int (*start)(struct ath11k_base *ab); void (*stop)(struct ath11k_base *ab); int (*power_up)(struct ath11k_base *ab); - void (*power_down)(struct ath11k_base *ab, bool is_suspend); + void (*power_down)(struct ath11k_base *ab); int (*suspend)(struct ath11k_base *ab); int (*resume)(struct ath11k_base *ab); int (*map_service_to_pipe)(struct ath11k_base *ab, u16 service_id, @@ -67,18 +67,12 @@ static inline void ath11k_hif_irq_disable(struct ath11k_base *ab) static inline int ath11k_hif_power_up(struct ath11k_base *ab) { - if (!ab->hif.ops->power_up) - return -EOPNOTSUPP; - return ab->hif.ops->power_up(ab); } -static inline void ath11k_hif_power_down(struct ath11k_base *ab, bool is_suspend) +static inline void ath11k_hif_power_down(struct ath11k_base *ab) { - if (!ab->hif.ops->power_down) - return; - - ab->hif.ops->power_down(ab, is_suspend); + ab->hif.ops->power_down(ab); } static inline int ath11k_hif_suspend(struct ath11k_base *ab) diff --git a/drivers/net/wireless/ath/ath11k/mhi.c b/drivers/net/wireless/ath/ath11k/mhi.c index ab182690aed3..6974a551883f 100644 --- a/drivers/net/wireless/ath/ath11k/mhi.c +++ b/drivers/net/wireless/ath/ath11k/mhi.c @@ -453,17 +453,9 @@ int ath11k_mhi_start(struct ath11k_pci *ab_pci) return 0; } -void ath11k_mhi_stop(struct ath11k_pci *ab_pci, bool is_suspend) +void ath11k_mhi_stop(struct ath11k_pci *ab_pci) { - /* During suspend we need to use mhi_power_down_keep_dev() - * workaround, otherwise ath11k_core_resume() will timeout - * during resume. - */ - if (is_suspend) - mhi_power_down_keep_dev(ab_pci->mhi_ctrl, true); - else - mhi_power_down(ab_pci->mhi_ctrl, true); - + mhi_power_down(ab_pci->mhi_ctrl, true); mhi_unprepare_after_power_down(ab_pci->mhi_ctrl); } diff --git a/drivers/net/wireless/ath/ath11k/mhi.h b/drivers/net/wireless/ath/ath11k/mhi.h index 2d567705e732..a682aad52fc5 100644 --- a/drivers/net/wireless/ath/ath11k/mhi.h +++ b/drivers/net/wireless/ath/ath11k/mhi.h @@ -18,7 +18,7 @@ #define MHICTRL_RESET_MASK 0x2 int ath11k_mhi_start(struct ath11k_pci *ar_pci); -void ath11k_mhi_stop(struct ath11k_pci *ar_pci, bool is_suspend); +void ath11k_mhi_stop(struct ath11k_pci *ar_pci); int ath11k_mhi_register(struct ath11k_pci *ar_pci); void ath11k_mhi_unregister(struct ath11k_pci *ar_pci); void ath11k_mhi_set_mhictrl_reset(struct ath11k_base *ab); @@ -26,4 +26,5 @@ void ath11k_mhi_clear_vector(struct ath11k_base *ab); int ath11k_mhi_suspend(struct ath11k_pci *ar_pci); int ath11k_mhi_resume(struct ath11k_pci *ar_pci); + #endif diff --git a/drivers/net/wireless/ath/ath11k/pci.c b/drivers/net/wireless/ath/ath11k/pci.c index 8d63b84d1261..be9d2c69cc41 100644 --- a/drivers/net/wireless/ath/ath11k/pci.c +++ b/drivers/net/wireless/ath/ath11k/pci.c @@ -638,7 +638,7 @@ static int ath11k_pci_power_up(struct ath11k_base *ab) return 0; } -static void ath11k_pci_power_down(struct ath11k_base *ab, bool is_suspend) +static void ath11k_pci_power_down(struct ath11k_base *ab) { struct ath11k_pci *ab_pci = ath11k_pci_priv(ab); @@ -649,7 +649,7 @@ static void ath11k_pci_power_down(struct ath11k_base *ab, bool is_suspend) ath11k_pci_msi_disable(ab_pci); - ath11k_mhi_stop(ab_pci, is_suspend); + ath11k_mhi_stop(ab_pci); clear_bit(ATH11K_FLAG_DEVICE_INIT_DONE, &ab->dev_flags); ath11k_pci_sw_reset(ab_pci->ab, false); } @@ -970,7 +970,7 @@ static void ath11k_pci_remove(struct pci_dev *pdev) ath11k_pci_set_irq_affinity_hint(ab_pci, NULL); if (test_bit(ATH11K_FLAG_QMI_FAIL, &ab->dev_flags)) { - ath11k_pci_power_down(ab, false); + ath11k_pci_power_down(ab); ath11k_debugfs_soc_destroy(ab); ath11k_qmi_deinit_service(ab); goto qmi_fail; @@ -998,7 +998,7 @@ static void ath11k_pci_shutdown(struct pci_dev *pdev) struct ath11k_pci *ab_pci = ath11k_pci_priv(ab); ath11k_pci_set_irq_affinity_hint(ab_pci, NULL); - ath11k_pci_power_down(ab, false); + ath11k_pci_power_down(ab); } static __maybe_unused int ath11k_pci_pm_suspend(struct device *dev) @@ -1035,39 +1035,9 @@ static __maybe_unused int ath11k_pci_pm_resume(struct device *dev) return ret; } -static __maybe_unused int ath11k_pci_pm_suspend_late(struct device *dev) -{ - struct ath11k_base *ab = dev_get_drvdata(dev); - int ret; - - ret = ath11k_core_suspend_late(ab); - if (ret) - ath11k_warn(ab, "failed to late suspend core: %d\n", ret); - - /* Similar to ath11k_pci_pm_suspend(), we return success here - * even error happens, to allow system suspend/hibernation survive. - */ - return 0; -} - -static __maybe_unused int ath11k_pci_pm_resume_early(struct device *dev) -{ - struct ath11k_base *ab = dev_get_drvdata(dev); - int ret; - - ret = ath11k_core_resume_early(ab); - if (ret) - ath11k_warn(ab, "failed to early resume core: %d\n", ret); - - return ret; -} - -static const struct dev_pm_ops __maybe_unused ath11k_pci_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(ath11k_pci_pm_suspend, - ath11k_pci_pm_resume) - SET_LATE_SYSTEM_SLEEP_PM_OPS(ath11k_pci_pm_suspend_late, - ath11k_pci_pm_resume_early) -}; +static SIMPLE_DEV_PM_OPS(ath11k_pci_pm_ops, + ath11k_pci_pm_suspend, + ath11k_pci_pm_resume); static struct pci_driver ath11k_pci_driver = { .name = "ath11k_pci", diff --git a/drivers/net/wireless/ath/ath11k/qmi.c b/drivers/net/wireless/ath/ath11k/qmi.c index 1bc648920ab6..f477afd325de 100644 --- a/drivers/net/wireless/ath/ath11k/qmi.c +++ b/drivers/net/wireless/ath/ath11k/qmi.c @@ -2877,7 +2877,7 @@ int ath11k_qmi_fwreset_from_cold_boot(struct ath11k_base *ab) } /* reset the firmware */ - ath11k_hif_power_down(ab, false); + ath11k_hif_power_down(ab); ath11k_hif_power_up(ab); ath11k_dbg(ab, ATH11K_DBG_QMI, "exit wait for cold boot done\n"); return 0; From 5e24db550bd6f484d2c7687ee488708260e1f84a Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Thu, 29 Aug 2024 15:03:19 +0300 Subject: [PATCH 31/61] net: ethernet: ti: am65-cpsw: fix XDP_DROP, XDP_TX and XDP_REDIRECT The following XDP_DROP test from [1] stalls the interface after 250 packets. ~# xdb-bench drop -m native eth0 This is because new RX requests are never queued. Fix that. The below XDP_TX test from [1] fails with a warning [ 499.947381] XDP_WARN: xdp_update_frame_from_buff(line:277): Driver BUG: missing reserved tailroom ~# xdb-bench tx -m native eth0 Fix that by using PAGE_SIZE during xdp_init_buf(). In XDP_REDIRECT case only 1 packet was processed in rx_poll. Fix it to process up to budget packets. Fix all XDP error cases to call trace_xdp_exception() and drop the packet in am65_cpsw_run_xdp(). [1] xdp-tools suite https://github.com/xdp-project/xdp-tools Fixes: 8acacc40f733 ("net: ethernet: ti: am65-cpsw: Add minimal XDP support") Signed-off-by: Roger Quadros Reviewed-by: Jacob Keller Acked-by: Julien Panis Reviewed-by: MD Danish Anwar Signed-off-by: Paolo Abeni --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 62 +++++++++++++----------- 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 81d9f21086ec..9fd2ba26716c 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -156,12 +156,13 @@ #define AM65_CPSW_CPPI_TX_PKT_TYPE 0x7 /* XDP */ -#define AM65_CPSW_XDP_CONSUMED 2 -#define AM65_CPSW_XDP_REDIRECT 1 +#define AM65_CPSW_XDP_CONSUMED BIT(1) +#define AM65_CPSW_XDP_REDIRECT BIT(0) #define AM65_CPSW_XDP_PASS 0 /* Include headroom compatible with both skb and xdpf */ -#define AM65_CPSW_HEADROOM (max(NET_SKB_PAD, XDP_PACKET_HEADROOM) + NET_IP_ALIGN) +#define AM65_CPSW_HEADROOM_NA (max(NET_SKB_PAD, XDP_PACKET_HEADROOM) + NET_IP_ALIGN) +#define AM65_CPSW_HEADROOM ALIGN(AM65_CPSW_HEADROOM_NA, sizeof(long)) static void am65_cpsw_port_set_sl_mac(struct am65_cpsw_port *slave, const u8 *dev_addr) @@ -933,7 +934,7 @@ static int am65_cpsw_xdp_tx_frame(struct net_device *ndev, host_desc = k3_cppi_desc_pool_alloc(tx_chn->desc_pool); if (unlikely(!host_desc)) { ndev->stats.tx_dropped++; - return -ENOMEM; + return AM65_CPSW_XDP_CONSUMED; /* drop */ } am65_cpsw_nuss_set_buf_type(tx_chn, host_desc, buf_type); @@ -942,7 +943,7 @@ static int am65_cpsw_xdp_tx_frame(struct net_device *ndev, pkt_len, DMA_TO_DEVICE); if (unlikely(dma_mapping_error(tx_chn->dma_dev, dma_buf))) { ndev->stats.tx_dropped++; - ret = -ENOMEM; + ret = AM65_CPSW_XDP_CONSUMED; /* drop */ goto pool_free; } @@ -977,6 +978,7 @@ static int am65_cpsw_xdp_tx_frame(struct net_device *ndev, /* Inform BQL */ netdev_tx_completed_queue(netif_txq, 1, pkt_len); ndev->stats.tx_errors++; + ret = AM65_CPSW_XDP_CONSUMED; /* drop */ goto dma_unmap; } @@ -1004,6 +1006,7 @@ static int am65_cpsw_run_xdp(struct am65_cpsw_common *common, struct bpf_prog *prog; struct page *page; u32 act; + int err; prog = READ_ONCE(port->xdp_prog); if (!prog) @@ -1023,14 +1026,14 @@ static int am65_cpsw_run_xdp(struct am65_cpsw_common *common, xdpf = xdp_convert_buff_to_frame(xdp); if (unlikely(!xdpf)) - break; + goto drop; __netif_tx_lock(netif_txq, cpu); - ret = am65_cpsw_xdp_tx_frame(ndev, tx_chn, xdpf, + err = am65_cpsw_xdp_tx_frame(ndev, tx_chn, xdpf, AM65_CPSW_TX_BUF_TYPE_XDP_TX); __netif_tx_unlock(netif_txq); - if (ret) - break; + if (err) + goto drop; ndev->stats.rx_bytes += *len; ndev->stats.rx_packets++; @@ -1038,7 +1041,7 @@ static int am65_cpsw_run_xdp(struct am65_cpsw_common *common, goto out; case XDP_REDIRECT: if (unlikely(xdp_do_redirect(ndev, xdp, prog))) - break; + goto drop; ndev->stats.rx_bytes += *len; ndev->stats.rx_packets++; @@ -1048,6 +1051,7 @@ static int am65_cpsw_run_xdp(struct am65_cpsw_common *common, bpf_warn_invalid_xdp_action(ndev, prog, act); fallthrough; case XDP_ABORTED: +drop: trace_xdp_exception(ndev, prog, act); fallthrough; case XDP_DROP: @@ -1056,7 +1060,6 @@ static int am65_cpsw_run_xdp(struct am65_cpsw_common *common, page = virt_to_head_page(xdp->data); am65_cpsw_put_page(rx_chn, page, true, desc_idx); - out: return ret; } @@ -1095,7 +1098,7 @@ static void am65_cpsw_nuss_rx_csum(struct sk_buff *skb, u32 csum_info) } static int am65_cpsw_nuss_rx_packets(struct am65_cpsw_common *common, - u32 flow_idx, int cpu) + u32 flow_idx, int cpu, int *xdp_state) { struct am65_cpsw_rx_chn *rx_chn = &common->rx_chns; u32 buf_dma_len, pkt_len, port_id = 0, csum_info; @@ -1114,6 +1117,7 @@ static int am65_cpsw_nuss_rx_packets(struct am65_cpsw_common *common, void **swdata; u32 *psdata; + *xdp_state = AM65_CPSW_XDP_PASS; ret = k3_udma_glue_pop_rx_chn(rx_chn->rx_chn, flow_idx, &desc_dma); if (ret) { if (ret != -ENODATA) @@ -1161,15 +1165,13 @@ static int am65_cpsw_nuss_rx_packets(struct am65_cpsw_common *common, } if (port->xdp_prog) { - xdp_init_buff(&xdp, AM65_CPSW_MAX_PACKET_SIZE, &port->xdp_rxq); - - xdp_prepare_buff(&xdp, page_addr, skb_headroom(skb), + xdp_init_buff(&xdp, PAGE_SIZE, &port->xdp_rxq); + xdp_prepare_buff(&xdp, page_addr, AM65_CPSW_HEADROOM, pkt_len, false); - - ret = am65_cpsw_run_xdp(common, port, &xdp, desc_idx, - cpu, &pkt_len); - if (ret != AM65_CPSW_XDP_PASS) - return ret; + *xdp_state = am65_cpsw_run_xdp(common, port, &xdp, desc_idx, + cpu, &pkt_len); + if (*xdp_state != AM65_CPSW_XDP_PASS) + goto allocate; /* Compute additional headroom to be reserved */ headroom = (xdp.data - xdp.data_hard_start) - skb_headroom(skb); @@ -1193,9 +1195,13 @@ static int am65_cpsw_nuss_rx_packets(struct am65_cpsw_common *common, stats->rx_bytes += pkt_len; u64_stats_update_end(&stats->syncp); +allocate: new_page = page_pool_dev_alloc_pages(rx_chn->page_pool); - if (unlikely(!new_page)) + if (unlikely(!new_page)) { + dev_err(dev, "page alloc failed\n"); return -ENOMEM; + } + rx_chn->pages[desc_idx] = new_page; if (netif_dormant(ndev)) { @@ -1229,8 +1235,9 @@ static int am65_cpsw_nuss_rx_poll(struct napi_struct *napi_rx, int budget) struct am65_cpsw_common *common = am65_cpsw_napi_to_common(napi_rx); int flow = AM65_CPSW_MAX_RX_FLOWS; int cpu = smp_processor_id(); - bool xdp_redirect = false; + int xdp_state_or = 0; int cur_budget, ret; + int xdp_state; int num_rx = 0; /* process every flow */ @@ -1238,12 +1245,11 @@ static int am65_cpsw_nuss_rx_poll(struct napi_struct *napi_rx, int budget) cur_budget = budget - num_rx; while (cur_budget--) { - ret = am65_cpsw_nuss_rx_packets(common, flow, cpu); - if (ret) { - if (ret == AM65_CPSW_XDP_REDIRECT) - xdp_redirect = true; + ret = am65_cpsw_nuss_rx_packets(common, flow, cpu, + &xdp_state); + xdp_state_or |= xdp_state; + if (ret) break; - } num_rx++; } @@ -1251,7 +1257,7 @@ static int am65_cpsw_nuss_rx_poll(struct napi_struct *napi_rx, int budget) break; } - if (xdp_redirect) + if (xdp_state_or & AM65_CPSW_XDP_REDIRECT) xdp_do_flush(); dev_dbg(common->dev, "%s num_rx:%d %d\n", __func__, num_rx, budget); From 0a50c35277f96481a5a6ed5faf347f282040c57d Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Thu, 29 Aug 2024 15:03:20 +0300 Subject: [PATCH 32/61] net: ethernet: ti: am65-cpsw: Fix NULL dereference on XDP_TX If number of TX queues are set to 1 we get a NULL pointer dereference during XDP_TX. ~# ethtool -L eth0 tx 1 ~# ./xdp-trafficgen udp -A -a eth0 -t 2 Transmitting on eth0 (ifindex 2) [ 241.135257] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000030 Fix this by using actual TX queues instead of max TX queues when picking the TX channel in am65_cpsw_ndo_xdp_xmit(). Fixes: 8acacc40f733 ("net: ethernet: ti: am65-cpsw: Add minimal XDP support") Signed-off-by: Roger Quadros Reviewed-by: Jacob Keller Acked-by: Julien Panis Reviewed-by: MD Danish Anwar Signed-off-by: Paolo Abeni --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 9fd2ba26716c..03577a008df2 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -1924,12 +1924,13 @@ static int am65_cpsw_ndo_bpf(struct net_device *ndev, struct netdev_bpf *bpf) static int am65_cpsw_ndo_xdp_xmit(struct net_device *ndev, int n, struct xdp_frame **frames, u32 flags) { + struct am65_cpsw_common *common = am65_ndev_to_common(ndev); struct am65_cpsw_tx_chn *tx_chn; struct netdev_queue *netif_txq; int cpu = smp_processor_id(); int i, nxmit = 0; - tx_chn = &am65_ndev_to_common(ndev)->tx_chns[cpu % AM65_CPSW_MAX_TX_QUEUES]; + tx_chn = &common->tx_chns[cpu % common->tx_ch_num]; netif_txq = netdev_get_tx_queue(ndev, tx_chn->id); __netif_tx_lock(netif_txq, cpu); From 624d3291484f9cada10660f820db926c0bce7741 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Thu, 29 Aug 2024 15:03:21 +0300 Subject: [PATCH 33/61] net: ethernet: ti: am65-cpsw: Fix RX statistics for XDP_TX and XDP_REDIRECT We are not using ndev->stats for rx_packets and rx_bytes anymore. Instead, we use per CPU stats which are collated in am65_cpsw_nuss_ndo_get_stats(). Fix RX statistics for XDP_TX and XDP_REDIRECT cases. Fixes: 8acacc40f733 ("net: ethernet: ti: am65-cpsw: Add minimal XDP support") Signed-off-by: Roger Quadros Reviewed-by: Jacob Keller Acked-by: Julien Panis Reviewed-by: MD Danish Anwar Signed-off-by: Paolo Abeni --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 03577a008df2..b06b8872b4eb 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -998,7 +998,9 @@ static int am65_cpsw_run_xdp(struct am65_cpsw_common *common, int desc_idx, int cpu, int *len) { struct am65_cpsw_rx_chn *rx_chn = &common->rx_chns; + struct am65_cpsw_ndev_priv *ndev_priv; struct net_device *ndev = port->ndev; + struct am65_cpsw_ndev_stats *stats; int ret = AM65_CPSW_XDP_CONSUMED; struct am65_cpsw_tx_chn *tx_chn; struct netdev_queue *netif_txq; @@ -1016,6 +1018,9 @@ static int am65_cpsw_run_xdp(struct am65_cpsw_common *common, /* XDP prog might have changed packet data and boundaries */ *len = xdp->data_end - xdp->data; + ndev_priv = netdev_priv(ndev); + stats = this_cpu_ptr(ndev_priv->stats); + switch (act) { case XDP_PASS: ret = AM65_CPSW_XDP_PASS; @@ -1035,16 +1040,20 @@ static int am65_cpsw_run_xdp(struct am65_cpsw_common *common, if (err) goto drop; - ndev->stats.rx_bytes += *len; - ndev->stats.rx_packets++; + u64_stats_update_begin(&stats->syncp); + stats->rx_bytes += *len; + stats->rx_packets++; + u64_stats_update_end(&stats->syncp); ret = AM65_CPSW_XDP_CONSUMED; goto out; case XDP_REDIRECT: if (unlikely(xdp_do_redirect(ndev, xdp, prog))) goto drop; - ndev->stats.rx_bytes += *len; - ndev->stats.rx_packets++; + u64_stats_update_begin(&stats->syncp); + stats->rx_bytes += *len; + stats->rx_packets++; + u64_stats_update_end(&stats->syncp); ret = AM65_CPSW_XDP_REDIRECT; goto out; default: From 2560db6ede1aaf162a73b2df43e0b6c5ed8819f7 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Fri, 30 Aug 2024 10:20:25 +0800 Subject: [PATCH 34/61] net: phy: Fix missing of_node_put() for leds The call of of_get_child_by_name() will cause refcount incremented for leds, if it succeeds, it should call of_node_put() to decrease it, fix it. Fixes: 01e5b728e9e4 ("net: phy: Add a binding for PHY LEDs") Reviewed-by: Jonathan Cameron Signed-off-by: Jinjie Ruan Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20240830022025.610844-1-ruanjinjie@huawei.com Signed-off-by: Paolo Abeni --- drivers/net/phy/phy_device.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 7752e9386b40..6bb2793de0a9 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -3347,11 +3347,13 @@ static int of_phy_leds(struct phy_device *phydev) err = of_phy_led(phydev, led); if (err) { of_node_put(led); + of_node_put(leds); phy_leds_unregister(phydev); return err; } } + of_node_put(leds); return 0; } From d7875b4b078f7e2d862e88aed99c3ea0381aa189 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Thu, 29 Aug 2024 11:36:01 -0700 Subject: [PATCH 35/61] ptp: ocp: convert serial ports to array Simplify serial port management code by using array of ports and helpers to get the name of the port. This change is needed to make the next patch simplier. Signed-off-by: Vadim Fedorenko Reviewed-by: Greg Kroah-Hartman Signed-off-by: Paolo Abeni --- drivers/ptp/ptp_ocp.c | 120 ++++++++++++++++++++---------------------- 1 file changed, 57 insertions(+), 63 deletions(-) diff --git a/drivers/ptp/ptp_ocp.c b/drivers/ptp/ptp_ocp.c index ee2ced88ab34..46369de8e30b 100644 --- a/drivers/ptp/ptp_ocp.c +++ b/drivers/ptp/ptp_ocp.c @@ -316,6 +316,15 @@ struct ptp_ocp_serial_port { #define OCP_SERIAL_LEN 6 #define OCP_SMA_NUM 4 +enum { + PORT_GNSS, + PORT_GNSS2, + PORT_MAC, /* miniature atomic clock */ + PORT_NMEA, + + __PORT_COUNT, +}; + struct ptp_ocp { struct pci_dev *pdev; struct device dev; @@ -357,10 +366,7 @@ struct ptp_ocp { struct delayed_work sync_work; int id; int n_irqs; - struct ptp_ocp_serial_port gnss_port; - struct ptp_ocp_serial_port gnss2_port; - struct ptp_ocp_serial_port mac_port; /* miniature atomic clock */ - struct ptp_ocp_serial_port nmea_port; + struct ptp_ocp_serial_port port[__PORT_COUNT]; bool fw_loader; u8 fw_tag; u16 fw_version; @@ -655,28 +661,28 @@ static struct ocp_resource ocp_fb_resource[] = { }, }, { - OCP_SERIAL_RESOURCE(gnss_port), + OCP_SERIAL_RESOURCE(port[PORT_GNSS]), .offset = 0x00160000 + 0x1000, .irq_vec = 3, .extra = &(struct ptp_ocp_serial_port) { .baud = 115200, }, }, { - OCP_SERIAL_RESOURCE(gnss2_port), + OCP_SERIAL_RESOURCE(port[PORT_GNSS2]), .offset = 0x00170000 + 0x1000, .irq_vec = 4, .extra = &(struct ptp_ocp_serial_port) { .baud = 115200, }, }, { - OCP_SERIAL_RESOURCE(mac_port), + OCP_SERIAL_RESOURCE(port[PORT_MAC]), .offset = 0x00180000 + 0x1000, .irq_vec = 5, .extra = &(struct ptp_ocp_serial_port) { .baud = 57600, }, }, { - OCP_SERIAL_RESOURCE(nmea_port), + OCP_SERIAL_RESOURCE(port[PORT_NMEA]), .offset = 0x00190000 + 0x1000, .irq_vec = 10, }, { @@ -740,7 +746,7 @@ static struct ocp_resource ocp_art_resource[] = { .offset = 0x01000000, .size = 0x10000, }, { - OCP_SERIAL_RESOURCE(gnss_port), + OCP_SERIAL_RESOURCE(port[PORT_GNSS]), .offset = 0x00160000 + 0x1000, .irq_vec = 3, .extra = &(struct ptp_ocp_serial_port) { .baud = 115200, @@ -839,7 +845,7 @@ static struct ocp_resource ocp_art_resource[] = { }, }, { - OCP_SERIAL_RESOURCE(mac_port), + OCP_SERIAL_RESOURCE(port[PORT_MAC]), .offset = 0x00190000, .irq_vec = 7, .extra = &(struct ptp_ocp_serial_port) { .baud = 9600, @@ -950,14 +956,14 @@ static struct ocp_resource ocp_adva_resource[] = { .offset = 0x00220000, .size = 0x1000, }, { - OCP_SERIAL_RESOURCE(gnss_port), + OCP_SERIAL_RESOURCE(port[PORT_GNSS]), .offset = 0x00160000 + 0x1000, .irq_vec = 3, .extra = &(struct ptp_ocp_serial_port) { .baud = 9600, }, }, { - OCP_SERIAL_RESOURCE(mac_port), + OCP_SERIAL_RESOURCE(port[PORT_MAC]), .offset = 0x00180000 + 0x1000, .irq_vec = 5, .extra = &(struct ptp_ocp_serial_port) { .baud = 115200, @@ -1649,6 +1655,15 @@ ptp_ocp_tod_gnss_name(int idx) return gnss_name[idx]; } +static const char * +ptp_ocp_tty_port_name(int idx) +{ + static const char * const tty_name[] = { + "GNSS", "GNSS2", "MAC", "NMEA" + }; + return tty_name[idx]; +} + struct ptp_ocp_nvmem_match_info { struct ptp_ocp *bp; const void * const tag; @@ -3960,16 +3975,11 @@ ptp_ocp_summary_show(struct seq_file *s, void *data) bp = dev_get_drvdata(dev); seq_printf(s, "%7s: /dev/ptp%d\n", "PTP", ptp_clock_index(bp->ptp)); - if (bp->gnss_port.line != -1) - seq_printf(s, "%7s: /dev/ttyS%d\n", "GNSS1", - bp->gnss_port.line); - if (bp->gnss2_port.line != -1) - seq_printf(s, "%7s: /dev/ttyS%d\n", "GNSS2", - bp->gnss2_port.line); - if (bp->mac_port.line != -1) - seq_printf(s, "%7s: /dev/ttyS%d\n", "MAC", bp->mac_port.line); - if (bp->nmea_port.line != -1) - seq_printf(s, "%7s: /dev/ttyS%d\n", "NMEA", bp->nmea_port.line); + for (i = 0; i < __PORT_COUNT; i++) { + if (bp->port[i].line != -1) + seq_printf(s, "%7s: /dev/ttyS%d\n", ptp_ocp_tty_port_name(i), + bp->port[i].line); + } memset(sma_val, 0xff, sizeof(sma_val)); if (bp->sma_map1) { @@ -4279,7 +4289,7 @@ ptp_ocp_dev_release(struct device *dev) static int ptp_ocp_device_init(struct ptp_ocp *bp, struct pci_dev *pdev) { - int err; + int i, err; mutex_lock(&ptp_ocp_lock); err = idr_alloc(&ptp_ocp_idr, bp, 0, 0, GFP_KERNEL); @@ -4292,10 +4302,10 @@ ptp_ocp_device_init(struct ptp_ocp *bp, struct pci_dev *pdev) bp->ptp_info = ptp_ocp_clock_info; spin_lock_init(&bp->lock); - bp->gnss_port.line = -1; - bp->gnss2_port.line = -1; - bp->mac_port.line = -1; - bp->nmea_port.line = -1; + + for (i = 0; i < __PORT_COUNT; i++) + bp->port[i].line = -1; + bp->pdev = pdev; device_initialize(&bp->dev); @@ -4351,23 +4361,15 @@ ptp_ocp_complete(struct ptp_ocp *bp) { struct pps_device *pps; char buf[32]; + int i; - if (bp->gnss_port.line != -1) { - sprintf(buf, "ttyS%d", bp->gnss_port.line); - ptp_ocp_link_child(bp, buf, "ttyGNSS"); - } - if (bp->gnss2_port.line != -1) { - sprintf(buf, "ttyS%d", bp->gnss2_port.line); - ptp_ocp_link_child(bp, buf, "ttyGNSS2"); - } - if (bp->mac_port.line != -1) { - sprintf(buf, "ttyS%d", bp->mac_port.line); - ptp_ocp_link_child(bp, buf, "ttyMAC"); - } - if (bp->nmea_port.line != -1) { - sprintf(buf, "ttyS%d", bp->nmea_port.line); - ptp_ocp_link_child(bp, buf, "ttyNMEA"); + for (i = 0; i < __PORT_COUNT; i++) { + if (bp->port[i].line != -1) { + sprintf(buf, "ttyS%d", bp->port[i].line); + ptp_ocp_link_child(bp, buf, ptp_ocp_tty_port_name(i)); + } } + sprintf(buf, "ptp%d", ptp_clock_index(bp->ptp)); ptp_ocp_link_child(bp, buf, "ptp"); @@ -4416,23 +4418,20 @@ ptp_ocp_info(struct ptp_ocp *bp) }; struct device *dev = &bp->pdev->dev; u32 reg; + int i; ptp_ocp_phc_info(bp); - ptp_ocp_serial_info(dev, "GNSS", bp->gnss_port.line, - bp->gnss_port.baud); - ptp_ocp_serial_info(dev, "GNSS2", bp->gnss2_port.line, - bp->gnss2_port.baud); - ptp_ocp_serial_info(dev, "MAC", bp->mac_port.line, bp->mac_port.baud); - if (bp->nmea_out && bp->nmea_port.line != -1) { - bp->nmea_port.baud = -1; + for (i = 0; i < __PORT_COUNT; i++) { + if (i == PORT_NMEA && bp->nmea_out && bp->port[PORT_NMEA].line != -1) { + bp->port[PORT_NMEA].baud = -1; - reg = ioread32(&bp->nmea_out->uart_baud); - if (reg < ARRAY_SIZE(nmea_baud)) - bp->nmea_port.baud = nmea_baud[reg]; - - ptp_ocp_serial_info(dev, "NMEA", bp->nmea_port.line, - bp->nmea_port.baud); + reg = ioread32(&bp->nmea_out->uart_baud); + if (reg < ARRAY_SIZE(nmea_baud)) + bp->port[PORT_NMEA].baud = nmea_baud[reg]; + } + ptp_ocp_serial_info(dev, ptp_ocp_tty_port_name(i), bp->port[i].line, + bp->port[i].baud); } } @@ -4473,14 +4472,9 @@ ptp_ocp_detach(struct ptp_ocp *bp) for (i = 0; i < 4; i++) if (bp->signal_out[i]) ptp_ocp_unregister_ext(bp->signal_out[i]); - if (bp->gnss_port.line != -1) - serial8250_unregister_port(bp->gnss_port.line); - if (bp->gnss2_port.line != -1) - serial8250_unregister_port(bp->gnss2_port.line); - if (bp->mac_port.line != -1) - serial8250_unregister_port(bp->mac_port.line); - if (bp->nmea_port.line != -1) - serial8250_unregister_port(bp->nmea_port.line); + for (i = 0; i < __PORT_COUNT; i++) + if (bp->port[i].line != -1) + serial8250_unregister_port(bp->port[i].line); platform_device_unregister(bp->spi_flash); platform_device_unregister(bp->i2c_ctrl); if (bp->i2c_clk) From 82ace0c8fe9b025eaa273365e27057402cdaeb02 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Thu, 29 Aug 2024 11:36:02 -0700 Subject: [PATCH 36/61] ptp: ocp: adjust sysfs entries to expose tty information Implement additional attribute group to expose serial port information. Fixes tag points to the commit which introduced the change in serial port subsystem and made it impossible to use symlinks. Fixes: b286f4e87e32 ("serial: core: Move tty and serdev to be children of serial core port device") Signed-off-by: Vadim Fedorenko Reviewed-by: Greg Kroah-Hartman Signed-off-by: Paolo Abeni --- drivers/ptp/ptp_ocp.c | 62 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 51 insertions(+), 11 deletions(-) diff --git a/drivers/ptp/ptp_ocp.c b/drivers/ptp/ptp_ocp.c index 46369de8e30b..e7479b9b90cb 100644 --- a/drivers/ptp/ptp_ocp.c +++ b/drivers/ptp/ptp_ocp.c @@ -3361,6 +3361,54 @@ static EXT_ATTR_RO(freq, frequency, 1); static EXT_ATTR_RO(freq, frequency, 2); static EXT_ATTR_RO(freq, frequency, 3); +static ssize_t +ptp_ocp_tty_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct dev_ext_attribute *ea = to_ext_attr(attr); + struct ptp_ocp *bp = dev_get_drvdata(dev); + + return sysfs_emit(buf, "ttyS%d", bp->port[(uintptr_t)ea->var].line); +} + +static umode_t +ptp_ocp_timecard_tty_is_visible(struct kobject *kobj, struct attribute *attr, int n) +{ + struct ptp_ocp *bp = dev_get_drvdata(kobj_to_dev(kobj)); + struct ptp_ocp_serial_port *port; + struct device_attribute *dattr; + struct dev_ext_attribute *ea; + + if (strncmp(attr->name, "tty", 3)) + return attr->mode; + + dattr = container_of(attr, struct device_attribute, attr); + ea = container_of(dattr, struct dev_ext_attribute, attr); + port = &bp->port[(uintptr_t)ea->var]; + return port->line == -1 ? 0 : 0444; +} + +#define EXT_TTY_ATTR_RO(_name, _val) \ + struct dev_ext_attribute dev_attr_tty##_name = \ + { __ATTR(tty##_name, 0444, ptp_ocp_tty_show, NULL), (void *)_val } + +static EXT_TTY_ATTR_RO(GNSS, PORT_GNSS); +static EXT_TTY_ATTR_RO(GNSS2, PORT_GNSS2); +static EXT_TTY_ATTR_RO(MAC, PORT_MAC); +static EXT_TTY_ATTR_RO(NMEA, PORT_NMEA); +static struct attribute *ptp_ocp_timecard_tty_attrs[] = { + &dev_attr_ttyGNSS.attr.attr, + &dev_attr_ttyGNSS2.attr.attr, + &dev_attr_ttyMAC.attr.attr, + &dev_attr_ttyNMEA.attr.attr, + NULL, +}; + +static const struct attribute_group ptp_ocp_timecard_tty_group = { + .name = "tty", + .attrs = ptp_ocp_timecard_tty_attrs, + .is_visible = ptp_ocp_timecard_tty_is_visible, +}; + static ssize_t serialnum_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -3790,6 +3838,7 @@ static const struct attribute_group fb_timecard_group = { static const struct ocp_attr_group fb_timecard_groups[] = { { .cap = OCP_CAP_BASIC, .group = &fb_timecard_group }, + { .cap = OCP_CAP_BASIC, .group = &ptp_ocp_timecard_tty_group }, { .cap = OCP_CAP_SIGNAL, .group = &fb_timecard_signal0_group }, { .cap = OCP_CAP_SIGNAL, .group = &fb_timecard_signal1_group }, { .cap = OCP_CAP_SIGNAL, .group = &fb_timecard_signal2_group }, @@ -3829,6 +3878,7 @@ static const struct attribute_group art_timecard_group = { static const struct ocp_attr_group art_timecard_groups[] = { { .cap = OCP_CAP_BASIC, .group = &art_timecard_group }, + { .cap = OCP_CAP_BASIC, .group = &ptp_ocp_timecard_tty_group }, { }, }; @@ -3856,6 +3906,7 @@ static const struct attribute_group adva_timecard_group = { static const struct ocp_attr_group adva_timecard_groups[] = { { .cap = OCP_CAP_BASIC, .group = &adva_timecard_group }, + { .cap = OCP_CAP_BASIC, .group = &ptp_ocp_timecard_tty_group }, { .cap = OCP_CAP_SIGNAL, .group = &fb_timecard_signal0_group }, { .cap = OCP_CAP_SIGNAL, .group = &fb_timecard_signal1_group }, { .cap = OCP_CAP_FREQ, .group = &fb_timecard_freq0_group }, @@ -4361,14 +4412,6 @@ ptp_ocp_complete(struct ptp_ocp *bp) { struct pps_device *pps; char buf[32]; - int i; - - for (i = 0; i < __PORT_COUNT; i++) { - if (bp->port[i].line != -1) { - sprintf(buf, "ttyS%d", bp->port[i].line); - ptp_ocp_link_child(bp, buf, ptp_ocp_tty_port_name(i)); - } - } sprintf(buf, "ptp%d", ptp_clock_index(bp->ptp)); ptp_ocp_link_child(bp, buf, "ptp"); @@ -4440,9 +4483,6 @@ ptp_ocp_detach_sysfs(struct ptp_ocp *bp) { struct device *dev = &bp->dev; - sysfs_remove_link(&dev->kobj, "ttyGNSS"); - sysfs_remove_link(&dev->kobj, "ttyGNSS2"); - sysfs_remove_link(&dev->kobj, "ttyMAC"); sysfs_remove_link(&dev->kobj, "ptp"); sysfs_remove_link(&dev->kobj, "pps"); } From 40bec579d4c718dabc3e3baf7d84c93a89e6bcce Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Thu, 29 Aug 2024 11:36:03 -0700 Subject: [PATCH 37/61] docs: ABI: update OCP TimeCard sysfs entries Update documentation according to the changes in the driver. New attributes group tty is exposed and ttyGNSS, ttyGNSS2, ttyMAC and ttyNMEA are moved to this group. Also, these attributes are no more links to the devices but rather simple text files containing names of tty devices. Signed-off-by: Vadim Fedorenko Reviewed-by: Greg Kroah-Hartman Signed-off-by: Paolo Abeni --- Documentation/ABI/testing/sysfs-timecard | 33 ++++++++++++++---------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-timecard b/Documentation/ABI/testing/sysfs-timecard index 220478156297..3ae41b7634ac 100644 --- a/Documentation/ABI/testing/sysfs-timecard +++ b/Documentation/ABI/testing/sysfs-timecard @@ -258,24 +258,29 @@ Description: (RW) When retrieving the PHC with the PTP SYS_OFFSET_EXTENDED the estimated point where the FPGA latches the PHC time. This value may be changed by writing an unsigned integer. -What: /sys/class/timecard/ocpN/ttyGNSS -What: /sys/class/timecard/ocpN/ttyGNSS2 -Date: September 2021 -Contact: Jonathan Lemon -Description: These optional attributes link to the TTY serial ports - associated with the GNSS devices. +What: /sys/class/timecard/ocpN/tty +Date: August 2024 +Contact: Vadim Fedorenko +Description: (RO) Directory containing the sysfs nodes for TTY attributes -What: /sys/class/timecard/ocpN/ttyMAC -Date: September 2021 +What: /sys/class/timecard/ocpN/tty/ttyGNSS +What: /sys/class/timecard/ocpN/tty/ttyGNSS2 +Date: August 2024 Contact: Jonathan Lemon -Description: This optional attribute links to the TTY serial port - associated with the Miniature Atomic Clock. +Description: (RO) These optional attributes contain names of the TTY serial + ports associated with the GNSS devices. -What: /sys/class/timecard/ocpN/ttyNMEA -Date: September 2021 +What: /sys/class/timecard/ocpN/tty/ttyMAC +Date: August 2024 Contact: Jonathan Lemon -Description: This optional attribute links to the TTY serial port - which outputs the PHC time in NMEA ZDA format. +Description: (RO) This optional attribute contains name of the TTY serial + port associated with the Miniature Atomic Clock. + +What: /sys/class/timecard/ocpN/tty/ttyNMEA +Date: August 2024 +Contact: Jonathan Lemon +Description: (RO) This optional attribute contains name of the TTY serial + port which outputs the PHC time in NMEA ZDA format. What: /sys/class/timecard/ocpN/utc_tai_offset Date: September 2021 From 2a5dc090b92cfa5270e20056074241c6db5c9cdd Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Fri, 23 Aug 2024 11:59:26 +0200 Subject: [PATCH 38/61] ice: move netif_queue_set_napi to rtnl-protected sections Currently, netif_queue_set_napi() is called from ice_vsi_rebuild() that is not rtnl-locked when called from the reset. This creates the need to take the rtnl_lock just for a single function and complicates the synchronization with .ndo_bpf. At the same time, there no actual need to fill napi-to-queue information at this exact point. Fill napi-to-queue information when opening the VSI and clear it when the VSI is being closed. Those routines are already rtnl-locked. Also, rewrite napi-to-queue assignment in a way that prevents inclusion of XDP queues, as this leads to out-of-bounds writes, such as one below. [ +0.000004] BUG: KASAN: slab-out-of-bounds in netif_queue_set_napi+0x1c2/0x1e0 [ +0.000012] Write of size 8 at addr ffff889881727c80 by task bash/7047 [ +0.000006] CPU: 24 PID: 7047 Comm: bash Not tainted 6.10.0-rc2+ #2 [ +0.000004] Hardware name: Intel Corporation S2600WFT/S2600WFT, BIOS SE5C620.86B.02.01.0014.082620210524 08/26/2021 [ +0.000003] Call Trace: [ +0.000003] [ +0.000002] dump_stack_lvl+0x60/0x80 [ +0.000007] print_report+0xce/0x630 [ +0.000007] ? __pfx__raw_spin_lock_irqsave+0x10/0x10 [ +0.000007] ? __virt_addr_valid+0x1c9/0x2c0 [ +0.000005] ? netif_queue_set_napi+0x1c2/0x1e0 [ +0.000003] kasan_report+0xe9/0x120 [ +0.000004] ? netif_queue_set_napi+0x1c2/0x1e0 [ +0.000004] netif_queue_set_napi+0x1c2/0x1e0 [ +0.000005] ice_vsi_close+0x161/0x670 [ice] [ +0.000114] ice_dis_vsi+0x22f/0x270 [ice] [ +0.000095] ice_pf_dis_all_vsi.constprop.0+0xae/0x1c0 [ice] [ +0.000086] ice_prepare_for_reset+0x299/0x750 [ice] [ +0.000087] pci_dev_save_and_disable+0x82/0xd0 [ +0.000006] pci_reset_function+0x12d/0x230 [ +0.000004] reset_store+0xa0/0x100 [ +0.000006] ? __pfx_reset_store+0x10/0x10 [ +0.000002] ? __pfx_mutex_lock+0x10/0x10 [ +0.000004] ? __check_object_size+0x4c1/0x640 [ +0.000007] kernfs_fop_write_iter+0x30b/0x4a0 [ +0.000006] vfs_write+0x5d6/0xdf0 [ +0.000005] ? fd_install+0x180/0x350 [ +0.000005] ? __pfx_vfs_write+0x10/0xA10 [ +0.000004] ? do_fcntl+0x52c/0xcd0 [ +0.000004] ? kasan_save_track+0x13/0x60 [ +0.000003] ? kasan_save_free_info+0x37/0x60 [ +0.000006] ksys_write+0xfa/0x1d0 [ +0.000003] ? __pfx_ksys_write+0x10/0x10 [ +0.000002] ? __x64_sys_fcntl+0x121/0x180 [ +0.000004] ? _raw_spin_lock+0x87/0xe0 [ +0.000005] do_syscall_64+0x80/0x170 [ +0.000007] ? _raw_spin_lock+0x87/0xe0 [ +0.000004] ? __pfx__raw_spin_lock+0x10/0x10 [ +0.000003] ? file_close_fd_locked+0x167/0x230 [ +0.000005] ? syscall_exit_to_user_mode+0x7d/0x220 [ +0.000005] ? do_syscall_64+0x8c/0x170 [ +0.000004] ? do_syscall_64+0x8c/0x170 [ +0.000003] ? do_syscall_64+0x8c/0x170 [ +0.000003] ? fput+0x1a/0x2c0 [ +0.000004] ? filp_close+0x19/0x30 [ +0.000004] ? do_dup2+0x25a/0x4c0 [ +0.000004] ? __x64_sys_dup2+0x6e/0x2e0 [ +0.000002] ? syscall_exit_to_user_mode+0x7d/0x220 [ +0.000004] ? do_syscall_64+0x8c/0x170 [ +0.000003] ? __count_memcg_events+0x113/0x380 [ +0.000005] ? handle_mm_fault+0x136/0x820 [ +0.000005] ? do_user_addr_fault+0x444/0xa80 [ +0.000004] ? clear_bhb_loop+0x25/0x80 [ +0.000004] ? clear_bhb_loop+0x25/0x80 [ +0.000002] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ +0.000005] RIP: 0033:0x7f2033593154 Fixes: 080b0c8d6d26 ("ice: Fix ASSERT_RTNL() warning during certain scenarios") Fixes: 91fdbce7e8d6 ("ice: Add support in the driver for associating queue with napi") Reviewed-by: Wojciech Drewek Reviewed-by: Jacob Keller Reviewed-by: Amritha Nambiar Signed-off-by: Larysa Zaremba Reviewed-by: Maciej Fijalkowski Tested-by: George Kuruvinakunnel Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_base.c | 11 +- drivers/net/ethernet/intel/ice/ice_lib.c | 151 ++++++---------------- drivers/net/ethernet/intel/ice/ice_lib.h | 10 +- drivers/net/ethernet/intel/ice/ice_main.c | 17 ++- 4 files changed, 60 insertions(+), 129 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c index f448d3a84564..c158749a80e0 100644 --- a/drivers/net/ethernet/intel/ice/ice_base.c +++ b/drivers/net/ethernet/intel/ice/ice_base.c @@ -190,16 +190,11 @@ static void ice_free_q_vector(struct ice_vsi *vsi, int v_idx) } q_vector = vsi->q_vectors[v_idx]; - ice_for_each_tx_ring(tx_ring, q_vector->tx) { - ice_queue_set_napi(vsi, tx_ring->q_index, NETDEV_QUEUE_TYPE_TX, - NULL); + ice_for_each_tx_ring(tx_ring, vsi->q_vectors[v_idx]->tx) tx_ring->q_vector = NULL; - } - ice_for_each_rx_ring(rx_ring, q_vector->rx) { - ice_queue_set_napi(vsi, rx_ring->q_index, NETDEV_QUEUE_TYPE_RX, - NULL); + + ice_for_each_rx_ring(rx_ring, vsi->q_vectors[v_idx]->rx) rx_ring->q_vector = NULL; - } /* only VSI with an associated netdev is set up with NAPI */ if (vsi->netdev) diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index f559e60992fa..6676596df88b 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -2286,9 +2286,6 @@ static int ice_vsi_cfg_def(struct ice_vsi *vsi) ice_vsi_map_rings_to_vectors(vsi); - /* Associate q_vector rings to napi */ - ice_vsi_set_napi_queues(vsi); - vsi->stat_offsets_loaded = false; /* ICE_VSI_CTRL does not need RSS so skip RSS processing */ @@ -2628,6 +2625,7 @@ void ice_vsi_close(struct ice_vsi *vsi) if (!test_and_set_bit(ICE_VSI_DOWN, vsi->state)) ice_down(vsi); + ice_vsi_clear_napi_queues(vsi); ice_vsi_free_irq(vsi); ice_vsi_free_tx_rings(vsi); ice_vsi_free_rx_rings(vsi); @@ -2694,120 +2692,55 @@ void ice_dis_vsi(struct ice_vsi *vsi, bool locked) } /** - * __ice_queue_set_napi - Set the napi instance for the queue - * @dev: device to which NAPI and queue belong - * @queue_index: Index of queue - * @type: queue type as RX or TX - * @napi: NAPI context - * @locked: is the rtnl_lock already held - * - * Set the napi instance for the queue. Caller indicates the lock status. - */ -static void -__ice_queue_set_napi(struct net_device *dev, unsigned int queue_index, - enum netdev_queue_type type, struct napi_struct *napi, - bool locked) -{ - if (!locked) - rtnl_lock(); - netif_queue_set_napi(dev, queue_index, type, napi); - if (!locked) - rtnl_unlock(); -} - -/** - * ice_queue_set_napi - Set the napi instance for the queue - * @vsi: VSI being configured - * @queue_index: Index of queue - * @type: queue type as RX or TX - * @napi: NAPI context - * - * Set the napi instance for the queue. The rtnl lock state is derived from the - * execution path. - */ -void -ice_queue_set_napi(struct ice_vsi *vsi, unsigned int queue_index, - enum netdev_queue_type type, struct napi_struct *napi) -{ - struct ice_pf *pf = vsi->back; - - if (!vsi->netdev) - return; - - if (current_work() == &pf->serv_task || - test_bit(ICE_PREPARED_FOR_RESET, pf->state) || - test_bit(ICE_DOWN, pf->state) || - test_bit(ICE_SUSPENDED, pf->state)) - __ice_queue_set_napi(vsi->netdev, queue_index, type, napi, - false); - else - __ice_queue_set_napi(vsi->netdev, queue_index, type, napi, - true); -} - -/** - * __ice_q_vector_set_napi_queues - Map queue[s] associated with the napi - * @q_vector: q_vector pointer - * @locked: is the rtnl_lock already held - * - * Associate the q_vector napi with all the queue[s] on the vector. - * Caller indicates the lock status. - */ -void __ice_q_vector_set_napi_queues(struct ice_q_vector *q_vector, bool locked) -{ - struct ice_rx_ring *rx_ring; - struct ice_tx_ring *tx_ring; - - ice_for_each_rx_ring(rx_ring, q_vector->rx) - __ice_queue_set_napi(q_vector->vsi->netdev, rx_ring->q_index, - NETDEV_QUEUE_TYPE_RX, &q_vector->napi, - locked); - - ice_for_each_tx_ring(tx_ring, q_vector->tx) - __ice_queue_set_napi(q_vector->vsi->netdev, tx_ring->q_index, - NETDEV_QUEUE_TYPE_TX, &q_vector->napi, - locked); - /* Also set the interrupt number for the NAPI */ - netif_napi_set_irq(&q_vector->napi, q_vector->irq.virq); -} - -/** - * ice_q_vector_set_napi_queues - Map queue[s] associated with the napi - * @q_vector: q_vector pointer - * - * Associate the q_vector napi with all the queue[s] on the vector - */ -void ice_q_vector_set_napi_queues(struct ice_q_vector *q_vector) -{ - struct ice_rx_ring *rx_ring; - struct ice_tx_ring *tx_ring; - - ice_for_each_rx_ring(rx_ring, q_vector->rx) - ice_queue_set_napi(q_vector->vsi, rx_ring->q_index, - NETDEV_QUEUE_TYPE_RX, &q_vector->napi); - - ice_for_each_tx_ring(tx_ring, q_vector->tx) - ice_queue_set_napi(q_vector->vsi, tx_ring->q_index, - NETDEV_QUEUE_TYPE_TX, &q_vector->napi); - /* Also set the interrupt number for the NAPI */ - netif_napi_set_irq(&q_vector->napi, q_vector->irq.virq); -} - -/** - * ice_vsi_set_napi_queues + * ice_vsi_set_napi_queues - associate netdev queues with napi * @vsi: VSI pointer * - * Associate queue[s] with napi for all vectors + * Associate queue[s] with napi for all vectors. + * The caller must hold rtnl_lock. */ void ice_vsi_set_napi_queues(struct ice_vsi *vsi) { - int i; + struct net_device *netdev = vsi->netdev; + int q_idx, v_idx; - if (!vsi->netdev) + if (!netdev) return; - ice_for_each_q_vector(vsi, i) - ice_q_vector_set_napi_queues(vsi->q_vectors[i]); + ice_for_each_rxq(vsi, q_idx) + netif_queue_set_napi(netdev, q_idx, NETDEV_QUEUE_TYPE_RX, + &vsi->rx_rings[q_idx]->q_vector->napi); + + ice_for_each_txq(vsi, q_idx) + netif_queue_set_napi(netdev, q_idx, NETDEV_QUEUE_TYPE_TX, + &vsi->tx_rings[q_idx]->q_vector->napi); + /* Also set the interrupt number for the NAPI */ + ice_for_each_q_vector(vsi, v_idx) { + struct ice_q_vector *q_vector = vsi->q_vectors[v_idx]; + + netif_napi_set_irq(&q_vector->napi, q_vector->irq.virq); + } +} + +/** + * ice_vsi_clear_napi_queues - dissociate netdev queues from napi + * @vsi: VSI pointer + * + * Clear the association between all VSI queues queue[s] and napi. + * The caller must hold rtnl_lock. + */ +void ice_vsi_clear_napi_queues(struct ice_vsi *vsi) +{ + struct net_device *netdev = vsi->netdev; + int q_idx; + + if (!netdev) + return; + + ice_for_each_txq(vsi, q_idx) + netif_queue_set_napi(netdev, q_idx, NETDEV_QUEUE_TYPE_TX, NULL); + + ice_for_each_rxq(vsi, q_idx) + netif_queue_set_napi(netdev, q_idx, NETDEV_QUEUE_TYPE_RX, NULL); } /** diff --git a/drivers/net/ethernet/intel/ice/ice_lib.h b/drivers/net/ethernet/intel/ice/ice_lib.h index 94ce8964dda6..36d86535695d 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.h +++ b/drivers/net/ethernet/intel/ice/ice_lib.h @@ -44,16 +44,10 @@ void ice_vsi_cfg_netdev_tc(struct ice_vsi *vsi, u8 ena_tc); struct ice_vsi * ice_vsi_setup(struct ice_pf *pf, struct ice_vsi_cfg_params *params); -void -ice_queue_set_napi(struct ice_vsi *vsi, unsigned int queue_index, - enum netdev_queue_type type, struct napi_struct *napi); - -void __ice_q_vector_set_napi_queues(struct ice_q_vector *q_vector, bool locked); - -void ice_q_vector_set_napi_queues(struct ice_q_vector *q_vector); - void ice_vsi_set_napi_queues(struct ice_vsi *vsi); +void ice_vsi_clear_napi_queues(struct ice_vsi *vsi); + int ice_vsi_release(struct ice_vsi *vsi); void ice_vsi_close(struct ice_vsi *vsi); diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 46d3c5a34d6a..263833346d3a 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -3558,11 +3558,9 @@ static void ice_napi_add(struct ice_vsi *vsi) if (!vsi->netdev) return; - ice_for_each_q_vector(vsi, v_idx) { + ice_for_each_q_vector(vsi, v_idx) netif_napi_add(vsi->netdev, &vsi->q_vectors[v_idx]->napi, ice_napi_poll); - __ice_q_vector_set_napi_queues(vsi->q_vectors[v_idx], false); - } } /** @@ -5540,7 +5538,9 @@ static int ice_reinit_interrupt_scheme(struct ice_pf *pf) if (ret) goto err_reinit; ice_vsi_map_rings_to_vectors(pf->vsi[v]); + rtnl_lock(); ice_vsi_set_napi_queues(pf->vsi[v]); + rtnl_unlock(); } ret = ice_req_irq_msix_misc(pf); @@ -5554,8 +5554,12 @@ static int ice_reinit_interrupt_scheme(struct ice_pf *pf) err_reinit: while (v--) - if (pf->vsi[v]) + if (pf->vsi[v]) { + rtnl_lock(); + ice_vsi_clear_napi_queues(pf->vsi[v]); + rtnl_unlock(); ice_vsi_free_q_vectors(pf->vsi[v]); + } return ret; } @@ -5620,6 +5624,9 @@ static int ice_suspend(struct device *dev) ice_for_each_vsi(pf, v) { if (!pf->vsi[v]) continue; + rtnl_lock(); + ice_vsi_clear_napi_queues(pf->vsi[v]); + rtnl_unlock(); ice_vsi_free_q_vectors(pf->vsi[v]); } ice_clear_interrupt_scheme(pf); @@ -7455,6 +7462,8 @@ int ice_vsi_open(struct ice_vsi *vsi) err = netif_set_real_num_rx_queues(vsi->netdev, vsi->num_rxq); if (err) goto err_set_qs; + + ice_vsi_set_napi_queues(vsi); } err = ice_up_complete(vsi); From 2504b8405768a57a71e660dbfd5abd59f679a03f Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Fri, 23 Aug 2024 11:59:27 +0200 Subject: [PATCH 39/61] ice: protect XDP configuration with a mutex The main threat to data consistency in ice_xdp() is a possible asynchronous PF reset. It can be triggered by a user or by TX timeout handler. XDP setup and PF reset code access the same resources in the following sections: * ice_vsi_close() in ice_prepare_for_reset() - already rtnl-locked * ice_vsi_rebuild() for the PF VSI - not protected * ice_vsi_open() - already rtnl-locked With an unfortunate timing, such accesses can result in a crash such as the one below: [ +1.999878] ice 0000:b1:00.0: Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring 14 [ +2.002992] ice 0000:b1:00.0: Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring 18 [Mar15 18:17] ice 0000:b1:00.0 ens801f0np0: NETDEV WATCHDOG: CPU: 38: transmit queue 14 timed out 80692736 ms [ +0.000093] ice 0000:b1:00.0 ens801f0np0: tx_timeout: VSI_num: 6, Q 14, NTC: 0x0, HW_HEAD: 0x0, NTU: 0x0, INT: 0x4000001 [ +0.000012] ice 0000:b1:00.0 ens801f0np0: tx_timeout recovery level 1, txqueue 14 [ +0.394718] ice 0000:b1:00.0: PTP reset successful [ +0.006184] BUG: kernel NULL pointer dereference, address: 0000000000000098 [ +0.000045] #PF: supervisor read access in kernel mode [ +0.000023] #PF: error_code(0x0000) - not-present page [ +0.000023] PGD 0 P4D 0 [ +0.000018] Oops: 0000 [#1] PREEMPT SMP NOPTI [ +0.000023] CPU: 38 PID: 7540 Comm: kworker/38:1 Not tainted 6.8.0-rc7 #1 [ +0.000031] Hardware name: Intel Corporation S2600WFT/S2600WFT, BIOS SE5C620.86B.02.01.0014.082620210524 08/26/2021 [ +0.000036] Workqueue: ice ice_service_task [ice] [ +0.000183] RIP: 0010:ice_clean_tx_ring+0xa/0xd0 [ice] [...] [ +0.000013] Call Trace: [ +0.000016] [ +0.000014] ? __die+0x1f/0x70 [ +0.000029] ? page_fault_oops+0x171/0x4f0 [ +0.000029] ? schedule+0x3b/0xd0 [ +0.000027] ? exc_page_fault+0x7b/0x180 [ +0.000022] ? asm_exc_page_fault+0x22/0x30 [ +0.000031] ? ice_clean_tx_ring+0xa/0xd0 [ice] [ +0.000194] ice_free_tx_ring+0xe/0x60 [ice] [ +0.000186] ice_destroy_xdp_rings+0x157/0x310 [ice] [ +0.000151] ice_vsi_decfg+0x53/0xe0 [ice] [ +0.000180] ice_vsi_rebuild+0x239/0x540 [ice] [ +0.000186] ice_vsi_rebuild_by_type+0x76/0x180 [ice] [ +0.000145] ice_rebuild+0x18c/0x840 [ice] [ +0.000145] ? delay_tsc+0x4a/0xc0 [ +0.000022] ? delay_tsc+0x92/0xc0 [ +0.000020] ice_do_reset+0x140/0x180 [ice] [ +0.000886] ice_service_task+0x404/0x1030 [ice] [ +0.000824] process_one_work+0x171/0x340 [ +0.000685] worker_thread+0x277/0x3a0 [ +0.000675] ? preempt_count_add+0x6a/0xa0 [ +0.000677] ? _raw_spin_lock_irqsave+0x23/0x50 [ +0.000679] ? __pfx_worker_thread+0x10/0x10 [ +0.000653] kthread+0xf0/0x120 [ +0.000635] ? __pfx_kthread+0x10/0x10 [ +0.000616] ret_from_fork+0x2d/0x50 [ +0.000612] ? __pfx_kthread+0x10/0x10 [ +0.000604] ret_from_fork_asm+0x1b/0x30 [ +0.000604] The previous way of handling this through returning -EBUSY is not viable, particularly when destroying AF_XDP socket, because the kernel proceeds with removal anyway. There is plenty of code between those calls and there is no need to create a large critical section that covers all of them, same as there is no need to protect ice_vsi_rebuild() with rtnl_lock(). Add xdp_state_lock mutex to protect ice_vsi_rebuild() and ice_xdp(). Leaving unprotected sections in between would result in two states that have to be considered: 1. when the VSI is closed, but not yet rebuild 2. when VSI is already rebuild, but not yet open The latter case is actually already handled through !netif_running() case, we just need to adjust flag checking a little. The former one is not as trivial, because between ice_vsi_close() and ice_vsi_rebuild(), a lot of hardware interaction happens, this can make adding/deleting rings exit with an error. Luckily, VSI rebuild is pending and can apply new configuration for us in a managed fashion. Therefore, add an additional VSI state flag ICE_VSI_REBUILD_PENDING to indicate that ice_xdp() can just hot-swap the program. Also, as ice_vsi_rebuild() flow is touched in this patch, make it more consistent by deconfiguring VSI when coalesce allocation fails. Fixes: 2d4238f55697 ("ice: Add support for AF_XDP") Fixes: efc2214b6047 ("ice: Add support for XDP") Reviewed-by: Wojciech Drewek Reviewed-by: Jacob Keller Tested-by: Chandan Kumar Rout Signed-off-by: Larysa Zaremba Reviewed-by: Maciej Fijalkowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice.h | 2 ++ drivers/net/ethernet/intel/ice/ice_lib.c | 34 ++++++++++++++--------- drivers/net/ethernet/intel/ice/ice_main.c | 19 +++++++++---- drivers/net/ethernet/intel/ice/ice_xsk.c | 3 +- 4 files changed, 39 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index caaa10157909..ce8b5505b16d 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -318,6 +318,7 @@ enum ice_vsi_state { ICE_VSI_UMAC_FLTR_CHANGED, ICE_VSI_MMAC_FLTR_CHANGED, ICE_VSI_PROMISC_CHANGED, + ICE_VSI_REBUILD_PENDING, ICE_VSI_STATE_NBITS /* must be last */ }; @@ -411,6 +412,7 @@ struct ice_vsi { struct ice_tx_ring **xdp_rings; /* XDP ring array */ u16 num_xdp_txq; /* Used XDP queues */ u8 xdp_mapping_mode; /* ICE_MAP_MODE_[CONTIG|SCATTER] */ + struct mutex xdp_state_lock; struct net_device **target_netdevs; diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index 6676596df88b..c1c1b63d9701 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -447,6 +447,7 @@ static void ice_vsi_free(struct ice_vsi *vsi) ice_vsi_free_stats(vsi); ice_vsi_free_arrays(vsi); + mutex_destroy(&vsi->xdp_state_lock); mutex_unlock(&pf->sw_mutex); devm_kfree(dev, vsi); } @@ -626,6 +627,8 @@ static struct ice_vsi *ice_vsi_alloc(struct ice_pf *pf) pf->next_vsi = ice_get_free_slot(pf->vsi, pf->num_alloc_vsi, pf->next_vsi); + mutex_init(&vsi->xdp_state_lock); + unlock_pf: mutex_unlock(&pf->sw_mutex); return vsi; @@ -2972,19 +2975,23 @@ int ice_vsi_rebuild(struct ice_vsi *vsi, u32 vsi_flags) if (WARN_ON(vsi->type == ICE_VSI_VF && !vsi->vf)) return -EINVAL; + mutex_lock(&vsi->xdp_state_lock); + ret = ice_vsi_realloc_stat_arrays(vsi); if (ret) - goto err_vsi_cfg; + goto unlock; ice_vsi_decfg(vsi); ret = ice_vsi_cfg_def(vsi); if (ret) - goto err_vsi_cfg; + goto unlock; coalesce = kcalloc(vsi->num_q_vectors, sizeof(struct ice_coalesce_stored), GFP_KERNEL); - if (!coalesce) - return -ENOMEM; + if (!coalesce) { + ret = -ENOMEM; + goto decfg; + } prev_num_q_vectors = ice_vsi_rebuild_get_coalesce(vsi, coalesce); @@ -2992,22 +2999,23 @@ int ice_vsi_rebuild(struct ice_vsi *vsi, u32 vsi_flags) if (ret) { if (vsi_flags & ICE_VSI_FLAG_INIT) { ret = -EIO; - goto err_vsi_cfg_tc_lan; + goto free_coalesce; } - kfree(coalesce); - return ice_schedule_reset(pf, ICE_RESET_PFR); + ret = ice_schedule_reset(pf, ICE_RESET_PFR); + goto free_coalesce; } ice_vsi_rebuild_set_coalesce(vsi, coalesce, prev_num_q_vectors); - kfree(coalesce); + clear_bit(ICE_VSI_REBUILD_PENDING, vsi->state); - return 0; - -err_vsi_cfg_tc_lan: - ice_vsi_decfg(vsi); +free_coalesce: kfree(coalesce); -err_vsi_cfg: +decfg: + if (ret) + ice_vsi_decfg(vsi); +unlock: + mutex_unlock(&vsi->xdp_state_lock); return ret; } diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 263833346d3a..4edaddcba3b4 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -616,6 +616,7 @@ skip: /* clear SW filtering DB */ ice_clear_hw_tbls(hw); /* disable the VSIs and their queues that are not already DOWN */ + set_bit(ICE_VSI_REBUILD_PENDING, ice_get_main_vsi(pf)->state); ice_pf_dis_all_vsi(pf, false); if (test_bit(ICE_FLAG_PTP_SUPPORTED, pf->flags)) @@ -3016,7 +3017,8 @@ ice_xdp_setup_prog(struct ice_vsi *vsi, struct bpf_prog *prog, } /* hot swap progs and avoid toggling link */ - if (ice_is_xdp_ena_vsi(vsi) == !!prog) { + if (ice_is_xdp_ena_vsi(vsi) == !!prog || + test_bit(ICE_VSI_REBUILD_PENDING, vsi->state)) { ice_vsi_assign_bpf_prog(vsi, prog); return 0; } @@ -3088,21 +3090,28 @@ static int ice_xdp(struct net_device *dev, struct netdev_bpf *xdp) { struct ice_netdev_priv *np = netdev_priv(dev); struct ice_vsi *vsi = np->vsi; + int ret; if (vsi->type != ICE_VSI_PF) { NL_SET_ERR_MSG_MOD(xdp->extack, "XDP can be loaded only on PF VSI"); return -EINVAL; } + mutex_lock(&vsi->xdp_state_lock); + switch (xdp->command) { case XDP_SETUP_PROG: - return ice_xdp_setup_prog(vsi, xdp->prog, xdp->extack); + ret = ice_xdp_setup_prog(vsi, xdp->prog, xdp->extack); + break; case XDP_SETUP_XSK_POOL: - return ice_xsk_pool_setup(vsi, xdp->xsk.pool, - xdp->xsk.queue_id); + ret = ice_xsk_pool_setup(vsi, xdp->xsk.pool, xdp->xsk.queue_id); + break; default: - return -EINVAL; + ret = -EINVAL; } + + mutex_unlock(&vsi->xdp_state_lock); + return ret; } /** diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 240a7bec242b..a659951fa987 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -390,7 +390,8 @@ int ice_xsk_pool_setup(struct ice_vsi *vsi, struct xsk_buff_pool *pool, u16 qid) goto failure; } - if_running = netif_running(vsi->netdev) && ice_is_xdp_ena_vsi(vsi); + if_running = !test_bit(ICE_VSI_DOWN, vsi->state) && + ice_is_xdp_ena_vsi(vsi); if (if_running) { struct ice_rx_ring *rx_ring = vsi->rx_rings[qid]; From f50c68763436bc8f805712a7c5ceaf58cfcf5f07 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Fri, 23 Aug 2024 11:59:28 +0200 Subject: [PATCH 40/61] ice: check for XDP rings instead of bpf program when unconfiguring If VSI rebuild is pending, .ndo_bpf() can attach/detach the XDP program on VSI without applying new ring configuration. When unconfiguring the VSI, we can encounter the state in which there is an XDP program but no XDP rings to destroy or there will be XDP rings that need to be destroyed, but no XDP program to indicate their presence. When unconfiguring, rely on the presence of XDP rings rather then XDP program, as they better represent the current state that has to be destroyed. Reviewed-by: Wojciech Drewek Reviewed-by: Jacob Keller Tested-by: Chandan Kumar Rout Acked-by: Maciej Fijalkowski Signed-off-by: Larysa Zaremba Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_lib.c | 4 ++-- drivers/net/ethernet/intel/ice/ice_main.c | 4 ++-- drivers/net/ethernet/intel/ice/ice_xsk.c | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index c1c1b63d9701..3dccfaba024c 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -2426,7 +2426,7 @@ void ice_vsi_decfg(struct ice_vsi *vsi) dev_err(ice_pf_to_dev(pf), "Failed to remove RDMA scheduler config for VSI %u, err %d\n", vsi->vsi_num, err); - if (ice_is_xdp_ena_vsi(vsi)) + if (vsi->xdp_rings) /* return value check can be skipped here, it always returns * 0 if reset is in progress */ @@ -2528,7 +2528,7 @@ static void ice_vsi_release_msix(struct ice_vsi *vsi) for (q = 0; q < q_vector->num_ring_tx; q++) { ice_write_itr(&q_vector->tx, 0); wr32(hw, QINT_TQCTL(vsi->txq_map[txq]), 0); - if (ice_is_xdp_ena_vsi(vsi)) { + if (vsi->xdp_rings) { u32 xdp_txq = txq + vsi->num_xdp_txq; wr32(hw, QINT_TQCTL(vsi->txq_map[xdp_txq]), 0); diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 4edaddcba3b4..22b8ef5faf8d 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -7249,7 +7249,7 @@ int ice_down(struct ice_vsi *vsi) if (tx_err) netdev_err(vsi->netdev, "Failed stop Tx rings, VSI %d error %d\n", vsi->vsi_num, tx_err); - if (!tx_err && ice_is_xdp_ena_vsi(vsi)) { + if (!tx_err && vsi->xdp_rings) { tx_err = ice_vsi_stop_xdp_tx_rings(vsi); if (tx_err) netdev_err(vsi->netdev, "Failed stop XDP rings, VSI %d error %d\n", @@ -7266,7 +7266,7 @@ int ice_down(struct ice_vsi *vsi) ice_for_each_txq(vsi, i) ice_clean_tx_ring(vsi->tx_rings[i]); - if (ice_is_xdp_ena_vsi(vsi)) + if (vsi->xdp_rings) ice_for_each_xdp_txq(vsi, i) ice_clean_tx_ring(vsi->xdp_rings[i]); diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index a659951fa987..8693509efbe7 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -39,7 +39,7 @@ static void ice_qp_reset_stats(struct ice_vsi *vsi, u16 q_idx) sizeof(vsi_stat->rx_ring_stats[q_idx]->rx_stats)); memset(&vsi_stat->tx_ring_stats[q_idx]->stats, 0, sizeof(vsi_stat->tx_ring_stats[q_idx]->stats)); - if (ice_is_xdp_ena_vsi(vsi)) + if (vsi->xdp_rings) memset(&vsi->xdp_rings[q_idx]->ring_stats->stats, 0, sizeof(vsi->xdp_rings[q_idx]->ring_stats->stats)); } @@ -52,7 +52,7 @@ static void ice_qp_reset_stats(struct ice_vsi *vsi, u16 q_idx) static void ice_qp_clean_rings(struct ice_vsi *vsi, u16 q_idx) { ice_clean_tx_ring(vsi->tx_rings[q_idx]); - if (ice_is_xdp_ena_vsi(vsi)) + if (vsi->xdp_rings) ice_clean_tx_ring(vsi->xdp_rings[q_idx]); ice_clean_rx_ring(vsi->rx_rings[q_idx]); } @@ -194,7 +194,7 @@ static int ice_qp_dis(struct ice_vsi *vsi, u16 q_idx) err = ice_vsi_stop_tx_ring(vsi, ICE_NO_RESET, 0, tx_ring, &txq_meta); if (!fail) fail = err; - if (ice_is_xdp_ena_vsi(vsi)) { + if (vsi->xdp_rings) { struct ice_tx_ring *xdp_ring = vsi->xdp_rings[q_idx]; memset(&txq_meta, 0, sizeof(txq_meta)); From d8c40b9d3a6cef61eb5a0c58c34a3090ea938d89 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Fri, 23 Aug 2024 11:59:29 +0200 Subject: [PATCH 41/61] ice: check ICE_VSI_DOWN under rtnl_lock when preparing for reset Consider the following scenario: .ndo_bpf() | ice_prepare_for_reset() | ________________________|_______________________________________| rtnl_lock() | | ice_down() | | | test_bit(ICE_VSI_DOWN) - true | | ice_dis_vsi() returns | ice_up() | | | proceeds to rebuild a running VSI | .ndo_bpf() is not the only rtnl-locked callback that toggles the interface to apply new configuration. Another example is .set_channels(). To avoid the race condition above, act only after reading ICE_VSI_DOWN under rtnl_lock. Fixes: 0f9d5027a749 ("ice: Refactor VSI allocation, deletion and rebuild flow") Reviewed-by: Wojciech Drewek Reviewed-by: Jacob Keller Tested-by: Chandan Kumar Rout Signed-off-by: Larysa Zaremba Reviewed-by: Maciej Fijalkowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_lib.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index 3dccfaba024c..737c00b02dd0 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -2672,8 +2672,7 @@ int ice_ena_vsi(struct ice_vsi *vsi, bool locked) */ void ice_dis_vsi(struct ice_vsi *vsi, bool locked) { - if (test_bit(ICE_VSI_DOWN, vsi->state)) - return; + bool already_down = test_bit(ICE_VSI_DOWN, vsi->state); set_bit(ICE_VSI_NEEDS_RESTART, vsi->state); @@ -2681,15 +2680,16 @@ void ice_dis_vsi(struct ice_vsi *vsi, bool locked) if (netif_running(vsi->netdev)) { if (!locked) rtnl_lock(); - - ice_vsi_close(vsi); + already_down = test_bit(ICE_VSI_DOWN, vsi->state); + if (!already_down) + ice_vsi_close(vsi); if (!locked) rtnl_unlock(); - } else { + } else if (!already_down) { ice_vsi_close(vsi); } - } else if (vsi->type == ICE_VSI_CTRL) { + } else if (vsi->type == ICE_VSI_CTRL && !already_down) { ice_vsi_close(vsi); } } From 7e3b407ccbea3259b8583ccc34807622025e390f Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Fri, 23 Aug 2024 11:59:30 +0200 Subject: [PATCH 42/61] ice: remove ICE_CFG_BUSY locking from AF_XDP code Locking used in ice_qp_ena() and ice_qp_dis() does pretty much nothing, because ICE_CFG_BUSY is a state flag that is supposed to be set in a PF state, not VSI one. Therefore it does not protect the queue pair from e.g. reset. Remove ICE_CFG_BUSY locking from ice_qp_dis() and ice_qp_ena(). Fixes: 2d4238f55697 ("ice: Add support for AF_XDP") Reviewed-by: Wojciech Drewek Reviewed-by: Jacob Keller Tested-by: Chandan Kumar Rout Reviewed-by: Maciej Fijalkowski Signed-off-by: Larysa Zaremba Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_xsk.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 8693509efbe7..5dee829bfc47 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -165,7 +165,6 @@ static int ice_qp_dis(struct ice_vsi *vsi, u16 q_idx) struct ice_q_vector *q_vector; struct ice_tx_ring *tx_ring; struct ice_rx_ring *rx_ring; - int timeout = 50; int fail = 0; int err; @@ -176,13 +175,6 @@ static int ice_qp_dis(struct ice_vsi *vsi, u16 q_idx) rx_ring = vsi->rx_rings[q_idx]; q_vector = rx_ring->q_vector; - while (test_and_set_bit(ICE_CFG_BUSY, vsi->state)) { - timeout--; - if (!timeout) - return -EBUSY; - usleep_range(1000, 2000); - } - synchronize_net(); netif_carrier_off(vsi->netdev); netif_tx_stop_queue(netdev_get_tx_queue(vsi->netdev, q_idx)); @@ -261,7 +253,6 @@ static int ice_qp_ena(struct ice_vsi *vsi, u16 q_idx) netif_tx_start_queue(netdev_get_tx_queue(vsi->netdev, q_idx)); netif_carrier_on(vsi->netdev); } - clear_bit(ICE_CFG_BUSY, vsi->state); return fail; } From 04c7e14e5b0b6227e7b00d7a96ca2f2426ab9171 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Fri, 23 Aug 2024 11:59:31 +0200 Subject: [PATCH 43/61] ice: do not bring the VSI up, if it was down before the XDP setup After XDP configuration is completed, we bring the interface up unconditionally, regardless of its state before the call to .ndo_bpf(). Preserve the information whether the interface had to be brought down and later bring it up only in such case. Fixes: efc2214b6047 ("ice: Add support for XDP") Reviewed-by: Wojciech Drewek Reviewed-by: Jacob Keller Tested-by: Chandan Kumar Rout Acked-by: Maciej Fijalkowski Signed-off-by: Larysa Zaremba Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_main.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 22b8ef5faf8d..c7db88b517da 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -3005,8 +3005,8 @@ ice_xdp_setup_prog(struct ice_vsi *vsi, struct bpf_prog *prog, struct netlink_ext_ack *extack) { unsigned int frame_size = vsi->netdev->mtu + ICE_ETH_PKT_HDR_PAD; - bool if_running = netif_running(vsi->netdev); int ret = 0, xdp_ring_err = 0; + bool if_running; if (prog && !prog->aux->xdp_has_frags) { if (frame_size > ice_max_xdp_frame_size(vsi)) { @@ -3023,8 +3023,11 @@ ice_xdp_setup_prog(struct ice_vsi *vsi, struct bpf_prog *prog, return 0; } + if_running = netif_running(vsi->netdev) && + !test_and_set_bit(ICE_VSI_DOWN, vsi->state); + /* need to stop netdev while setting up the program for Rx rings */ - if (if_running && !test_and_set_bit(ICE_VSI_DOWN, vsi->state)) { + if (if_running) { ret = ice_down(vsi); if (ret) { NL_SET_ERR_MSG_MOD(extack, "Preparing device for XDP attach failed"); From 5872b47ce18efad5862b74ad334cbdfffa7f8a0c Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 2 Sep 2024 10:09:37 +0100 Subject: [PATCH 44/61] MAINTAINERS: wifi: cw1200: add net-cw1200.h This is part of an effort [1] to assign a section in MAINTAINERS to header files that relate to Networking. In this case the files with "net" in their name. [1] https://lore.kernel.org/netdev/20240821-net-mnt-v2-0-59a5af38e69d@kernel.org/ It seems that net-cw1200.h is part of the CW1200 WLAN driver and this it is appropriate to add it to the section for that driver. Signed-off-by: Simon Horman Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20240902-wifi-mnt-v2-1-f5ad1f36e993@kernel.org --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 8766f3e5e87e..d705e22f1d28 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5951,6 +5951,7 @@ F: Documentation/process/cve.rst CW1200 WLAN driver S: Orphan F: drivers/net/wireless/st/cw1200/ +F: include/linux/platform_data/net-cw1200.h CX18 VIDEO4LINUX DRIVER M: Andy Walls From bab8eb0dd4cb995caa4a0529d5655531c2ec5e8e Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Thu, 29 Aug 2024 19:50:55 +0200 Subject: [PATCH 45/61] usbnet: modern method to get random MAC The driver generates a random MAC once on load and uses it over and over, including on two devices needing a random MAC at the same time. Jakub suggested revamping the driver to the modern API for setting a random MAC rather than fixing the old stuff. The bug is as old as the driver. Signed-off-by: Oliver Neukum Reviewed-by: Simon Horman Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Link: https://patch.msgid.link/20240829175201.670718-1-oneukum@suse.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/usbnet.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 9fd516e8bb10..18eb5ba436df 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -61,9 +61,6 @@ /*-------------------------------------------------------------------------*/ -// randomly generated ethernet address -static u8 node_id [ETH_ALEN]; - /* use ethtool to change the level for any given device */ static int msg_level = -1; module_param (msg_level, int, 0); @@ -1725,7 +1722,6 @@ usbnet_probe (struct usb_interface *udev, const struct usb_device_id *prod) dev->net = net; strscpy(net->name, "usb%d", sizeof(net->name)); - eth_hw_addr_set(net, node_id); /* rx and tx sides can use different message sizes; * bind() should set rx_urb_size in that case. @@ -1801,9 +1797,9 @@ usbnet_probe (struct usb_interface *udev, const struct usb_device_id *prod) goto out4; } - /* let userspace know we have a random address */ - if (ether_addr_equal(net->dev_addr, node_id)) - net->addr_assign_type = NET_ADDR_RANDOM; + /* this flags the device for user space */ + if (!is_valid_ether_addr(net->dev_addr)) + eth_hw_addr_random(net); if ((dev->driver_info->flags & FLAG_WLAN) != 0) SET_NETDEV_DEVTYPE(net, &wlan_type); @@ -2211,7 +2207,6 @@ static int __init usbnet_init(void) BUILD_BUG_ON( sizeof_field(struct sk_buff, cb) < sizeof(struct skb_data)); - eth_random_addr(node_id); return 0; } module_init(usbnet_init); From 3b3a2a9c6349e25a025d2330f479bc33a6ccb54a Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sun, 1 Sep 2024 11:16:07 -0700 Subject: [PATCH 46/61] sch/netem: fix use after free in netem_dequeue If netem_dequeue() enqueues packet to inner qdisc and that qdisc returns __NET_XMIT_STOLEN. The packet is dropped but qdisc_tree_reduce_backlog() is not called to update the parent's q.qlen, leading to the similar use-after-free as Commit e04991a48dbaf382 ("netem: fix return value if duplicate enqueue fails") Commands to trigger KASAN UaF: ip link add type dummy ip link set lo up ip link set dummy0 up tc qdisc add dev lo parent root handle 1: drr tc filter add dev lo parent 1: basic classid 1:1 tc class add dev lo classid 1:1 drr tc qdisc add dev lo parent 1:1 handle 2: netem tc qdisc add dev lo parent 2: handle 3: drr tc filter add dev lo parent 3: basic classid 3:1 action mirred egress redirect dev dummy0 tc class add dev lo classid 3:1 drr ping -c1 -W0.01 localhost # Trigger bug tc class del dev lo classid 1:1 tc class add dev lo classid 1:1 drr ping -c1 -W0.01 localhost # UaF Fixes: 50612537e9ab ("netem: fix classful handling") Reported-by: Budimir Markovic Signed-off-by: Stephen Hemminger Link: https://patch.msgid.link/20240901182438.4992-1-stephen@networkplumber.org Signed-off-by: Jakub Kicinski --- net/sched/sch_netem.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 0f8d581438c3..39382ee1e331 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -742,11 +742,10 @@ deliver: err = qdisc_enqueue(skb, q->qdisc, &to_free); kfree_skb_list(to_free); - if (err != NET_XMIT_SUCCESS && - net_xmit_drop_count(err)) { - qdisc_qstats_drop(sch); - qdisc_tree_reduce_backlog(sch, 1, - pkt_len); + if (err != NET_XMIT_SUCCESS) { + if (net_xmit_drop_count(err)) + qdisc_qstats_drop(sch); + qdisc_tree_reduce_backlog(sch, 1, pkt_len); } goto tfifo_dequeue; } From 77461c10819103eaee7b33c744174b32a8c78b40 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 2 Sep 2024 03:17:30 -0700 Subject: [PATCH 47/61] net: dqs: Do not use extern for unused dql_group When CONFIG_DQL is not enabled, dql_group should be treated as a dead declaration. However, its current extern declaration assumes the linker will ignore it, which is generally true across most compiler and architecture combinations. But in certain cases, the linker still attempts to resolve the extern struct, even when the associated code is dead, resulting in a linking error. For instance the following error in loongarch64: >> loongarch64-linux-ld: net-sysfs.c:(.text+0x589c): undefined reference to `dql_group' Modify the declaration of the dead object to be an empty declaration instead of an extern. This change will prevent the linker from attempting to resolve an undefined reference. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202409012047.eCaOdfQJ-lkp@intel.com/ Fixes: 74293ea1c4db ("net: sysfs: Do not create sysfs for non BQL device") Signed-off-by: Breno Leitao Reviewed-by: Simon Horman Tested-by: Simon Horman # build-tested Link: https://patch.msgid.link/20240902101734.3260455-1-leitao@debian.org Signed-off-by: Jakub Kicinski --- net/core/net-sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 444f23e74f8e..291fdf4a328b 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1524,7 +1524,7 @@ static const struct attribute_group dql_group = { }; #else /* Fake declaration, all the code using it should be dead */ -extern const struct attribute_group dql_group; +static const struct attribute_group dql_group = {}; #endif /* CONFIG_BQL */ #ifdef CONFIG_XPS From 33f339a1ba54e56bba57ee9a77c71e385ab4825c Mon Sep 17 00:00:00 2001 From: Tze-nan Wu Date: Fri, 30 Aug 2024 16:25:17 +0800 Subject: [PATCH 48/61] bpf, net: Fix a potential race in do_sock_getsockopt() There's a potential race when `cgroup_bpf_enabled(CGROUP_GETSOCKOPT)` is false during the execution of `BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN`, but becomes true when `BPF_CGROUP_RUN_PROG_GETSOCKOPT` is called. This inconsistency can lead to `BPF_CGROUP_RUN_PROG_GETSOCKOPT` receiving an "-EFAULT" from `__cgroup_bpf_run_filter_getsockopt(max_optlen=0)`. Scenario shown as below: `process A` `process B` ----------- ------------ BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN enable CGROUP_GETSOCKOPT BPF_CGROUP_RUN_PROG_GETSOCKOPT (-EFAULT) To resolve this, remove the `BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN` macro and directly uses `copy_from_sockptr` to ensure that `max_optlen` is always set before `BPF_CGROUP_RUN_PROG_GETSOCKOPT` is invoked. Fixes: 0d01da6afc54 ("bpf: implement getsockopt and setsockopt hooks") Co-developed-by: Yanghui Li Signed-off-by: Yanghui Li Co-developed-by: Cheng-Jui Wang Signed-off-by: Cheng-Jui Wang Signed-off-by: Tze-nan Wu Acked-by: Stanislav Fomichev Acked-by: Alexei Starovoitov Link: https://patch.msgid.link/20240830082518.23243-1-Tze-nan.Wu@mediatek.com Signed-off-by: Jakub Kicinski --- include/linux/bpf-cgroup.h | 9 --------- net/socket.c | 4 ++-- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index fb3c3e7181e6..ce91d9b2acb9 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -390,14 +390,6 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk, __ret; \ }) -#define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) \ -({ \ - int __ret = 0; \ - if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT)) \ - copy_from_sockptr(&__ret, optlen, sizeof(int)); \ - __ret; \ -}) - #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, optlen, \ max_optlen, retval) \ ({ \ @@ -518,7 +510,6 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(atype, major, minor, access) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos) ({ 0; }) -#define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \ optlen, max_optlen, retval) ({ retval; }) #define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval, \ diff --git a/net/socket.c b/net/socket.c index fcbdd5bc47ac..0a2bd22ec105 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2362,7 +2362,7 @@ INDIRECT_CALLABLE_DECLARE(bool tcp_bpf_bypass_getsockopt(int level, int do_sock_getsockopt(struct socket *sock, bool compat, int level, int optname, sockptr_t optval, sockptr_t optlen) { - int max_optlen __maybe_unused; + int max_optlen __maybe_unused = 0; const struct proto_ops *ops; int err; @@ -2371,7 +2371,7 @@ int do_sock_getsockopt(struct socket *sock, bool compat, int level, return err; if (!compat) - max_optlen = BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen); + copy_from_sockptr(&max_optlen, optlen, sizeof(int)); ops = READ_ONCE(sock->ops); if (level == SOL_SOCKET) { From b6ecc662037694488bfff7c9fd21c405df8411f2 Mon Sep 17 00:00:00 2001 From: Souradeep Chakrabarti Date: Mon, 2 Sep 2024 05:43:47 -0700 Subject: [PATCH 49/61] net: mana: Fix error handling in mana_create_txq/rxq's NAPI cleanup Currently napi_disable() gets called during rxq and txq cleanup, even before napi is enabled and hrtimer is initialized. It causes kernel panic. ? page_fault_oops+0x136/0x2b0 ? page_counter_cancel+0x2e/0x80 ? do_user_addr_fault+0x2f2/0x640 ? refill_obj_stock+0xc4/0x110 ? exc_page_fault+0x71/0x160 ? asm_exc_page_fault+0x27/0x30 ? __mmdrop+0x10/0x180 ? __mmdrop+0xec/0x180 ? hrtimer_active+0xd/0x50 hrtimer_try_to_cancel+0x2c/0xf0 hrtimer_cancel+0x15/0x30 napi_disable+0x65/0x90 mana_destroy_rxq+0x4c/0x2f0 mana_create_rxq.isra.0+0x56c/0x6d0 ? mana_uncfg_vport+0x50/0x50 mana_alloc_queues+0x21b/0x320 ? skb_dequeue+0x5f/0x80 Cc: stable@vger.kernel.org Fixes: e1b5683ff62e ("net: mana: Move NAPI from EQ to CQ") Signed-off-by: Souradeep Chakrabarti Reviewed-by: Haiyang Zhang Reviewed-by: Shradha Gupta Signed-off-by: David S. Miller --- drivers/net/ethernet/microsoft/mana/mana_en.c | 22 +++++++++++-------- include/net/mana/mana.h | 2 ++ 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index 39f56973746d..3d151700f658 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -1872,10 +1872,12 @@ static void mana_destroy_txq(struct mana_port_context *apc) for (i = 0; i < apc->num_queues; i++) { napi = &apc->tx_qp[i].tx_cq.napi; - napi_synchronize(napi); - napi_disable(napi); - netif_napi_del(napi); - + if (apc->tx_qp[i].txq.napi_initialized) { + napi_synchronize(napi); + napi_disable(napi); + netif_napi_del(napi); + apc->tx_qp[i].txq.napi_initialized = false; + } mana_destroy_wq_obj(apc, GDMA_SQ, apc->tx_qp[i].tx_object); mana_deinit_cq(apc, &apc->tx_qp[i].tx_cq); @@ -1931,6 +1933,7 @@ static int mana_create_txq(struct mana_port_context *apc, txq->ndev = net; txq->net_txq = netdev_get_tx_queue(net, i); txq->vp_offset = apc->tx_vp_offset; + txq->napi_initialized = false; skb_queue_head_init(&txq->pending_skbs); memset(&spec, 0, sizeof(spec)); @@ -1997,6 +2000,7 @@ static int mana_create_txq(struct mana_port_context *apc, netif_napi_add_tx(net, &cq->napi, mana_poll); napi_enable(&cq->napi); + txq->napi_initialized = true; mana_gd_ring_cq(cq->gdma_cq, SET_ARM_BIT); } @@ -2008,7 +2012,7 @@ out: } static void mana_destroy_rxq(struct mana_port_context *apc, - struct mana_rxq *rxq, bool validate_state) + struct mana_rxq *rxq, bool napi_initialized) { struct gdma_context *gc = apc->ac->gdma_dev->gdma_context; @@ -2023,15 +2027,15 @@ static void mana_destroy_rxq(struct mana_port_context *apc, napi = &rxq->rx_cq.napi; - if (validate_state) + if (napi_initialized) { napi_synchronize(napi); - napi_disable(napi); + napi_disable(napi); + netif_napi_del(napi); + } xdp_rxq_info_unreg(&rxq->xdp_rxq); - netif_napi_del(napi); - mana_destroy_wq_obj(apc, GDMA_RQ, rxq->rxobj); mana_deinit_cq(apc, &rxq->rx_cq); diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 7caa334f4888..b8a6c7504ee1 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -98,6 +98,8 @@ struct mana_txq { atomic_t pending_sends; + bool napi_initialized; + struct mana_stats_tx stats; }; From 4963d2343af81f493519f9c3ea9f2169eaa7353a Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 30 Aug 2024 17:31:07 +0200 Subject: [PATCH 50/61] bareudp: Fix device stats updates. Bareudp devices update their stats concurrently. Therefore they need proper atomic increments. Fixes: 571912c69f0e ("net: UDP tunnel encapsulation module for tunnelling different protocols like MPLS, IP, NSH etc.") Signed-off-by: Guillaume Nault Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/04b7b9d0b480158eb3ab4366ec80aa2ab7e41fcb.1725031794.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/bareudp.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c index d5c56ca91b77..7aca0544fb29 100644 --- a/drivers/net/bareudp.c +++ b/drivers/net/bareudp.c @@ -83,7 +83,7 @@ static int bareudp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) if (skb_copy_bits(skb, BAREUDP_BASE_HLEN, &ipversion, sizeof(ipversion))) { - bareudp->dev->stats.rx_dropped++; + DEV_STATS_INC(bareudp->dev, rx_dropped); goto drop; } ipversion >>= 4; @@ -93,7 +93,7 @@ static int bareudp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) } else if (ipversion == 6 && bareudp->multi_proto_mode) { proto = htons(ETH_P_IPV6); } else { - bareudp->dev->stats.rx_dropped++; + DEV_STATS_INC(bareudp->dev, rx_dropped); goto drop; } } else if (bareudp->ethertype == htons(ETH_P_MPLS_UC)) { @@ -107,7 +107,7 @@ static int bareudp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) ipv4_is_multicast(tunnel_hdr->daddr)) { proto = htons(ETH_P_MPLS_MC); } else { - bareudp->dev->stats.rx_dropped++; + DEV_STATS_INC(bareudp->dev, rx_dropped); goto drop; } } else { @@ -123,7 +123,7 @@ static int bareudp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) (addr_type & IPV6_ADDR_MULTICAST)) { proto = htons(ETH_P_MPLS_MC); } else { - bareudp->dev->stats.rx_dropped++; + DEV_STATS_INC(bareudp->dev, rx_dropped); goto drop; } } @@ -135,7 +135,7 @@ static int bareudp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) proto, !net_eq(bareudp->net, dev_net(bareudp->dev)))) { - bareudp->dev->stats.rx_dropped++; + DEV_STATS_INC(bareudp->dev, rx_dropped); goto drop; } @@ -143,7 +143,7 @@ static int bareudp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) tun_dst = udp_tun_rx_dst(skb, family, key, 0, 0); if (!tun_dst) { - bareudp->dev->stats.rx_dropped++; + DEV_STATS_INC(bareudp->dev, rx_dropped); goto drop; } skb_dst_set(skb, &tun_dst->dst); @@ -169,8 +169,8 @@ static int bareudp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) &((struct ipv6hdr *)oiph)->saddr); } if (err > 1) { - ++bareudp->dev->stats.rx_frame_errors; - ++bareudp->dev->stats.rx_errors; + DEV_STATS_INC(bareudp->dev, rx_frame_errors); + DEV_STATS_INC(bareudp->dev, rx_errors); goto drop; } } @@ -467,11 +467,11 @@ tx_error: dev_kfree_skb(skb); if (err == -ELOOP) - dev->stats.collisions++; + DEV_STATS_INC(dev, collisions); else if (err == -ENETUNREACH) - dev->stats.tx_carrier_errors++; + DEV_STATS_INC(dev, tx_carrier_errors); - dev->stats.tx_errors++; + DEV_STATS_INC(dev, tx_errors); return NETDEV_TX_OK; } From 7e4196935069947d8b70b09c1660b67b067e75cb Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 2 Sep 2024 10:39:27 -0700 Subject: [PATCH 51/61] fou: Fix null-ptr-deref in GRO. We observed a null-ptr-deref in fou_gro_receive() while shutting down a host. [0] The NULL pointer is sk->sk_user_data, and the offset 8 is of protocol in struct fou. When fou_release() is called due to netns dismantle or explicit tunnel teardown, udp_tunnel_sock_release() sets NULL to sk->sk_user_data. Then, the tunnel socket is destroyed after a single RCU grace period. So, in-flight udp4_gro_receive() could find the socket and execute the FOU GRO handler, where sk->sk_user_data could be NULL. Let's use rcu_dereference_sk_user_data() in fou_from_sock() and add NULL checks in FOU GRO handlers. [0]: BUG: kernel NULL pointer dereference, address: 0000000000000008 PF: supervisor read access in kernel mode PF: error_code(0x0000) - not-present page PGD 80000001032f4067 P4D 80000001032f4067 PUD 103240067 PMD 0 SMP PTI CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.10.216-204.855.amzn2.x86_64 #1 Hardware name: Amazon EC2 c5.large/, BIOS 1.0 10/16/2017 RIP: 0010:fou_gro_receive (net/ipv4/fou.c:233) [fou] Code: 41 5f c3 cc cc cc cc e8 e7 2e 69 f4 0f 1f 80 00 00 00 00 0f 1f 44 00 00 49 89 f8 41 54 48 89 f7 48 89 d6 49 8b 80 88 02 00 00 <0f> b6 48 08 0f b7 42 4a 66 25 fd fd 80 cc 02 66 89 42 4a 0f b6 42 RSP: 0018:ffffa330c0003d08 EFLAGS: 00010297 RAX: 0000000000000000 RBX: ffff93d9e3a6b900 RCX: 0000000000000010 RDX: ffff93d9e3a6b900 RSI: ffff93d9e3a6b900 RDI: ffff93dac2e24d08 RBP: ffff93d9e3a6b900 R08: ffff93dacbce6400 R09: 0000000000000002 R10: 0000000000000000 R11: ffffffffb5f369b0 R12: ffff93dacbce6400 R13: ffff93dac2e24d08 R14: 0000000000000000 R15: ffffffffb4edd1c0 FS: 0000000000000000(0000) GS:ffff93daee800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000008 CR3: 0000000102140001 CR4: 00000000007706f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: ? show_trace_log_lvl (arch/x86/kernel/dumpstack.c:259) ? __die_body.cold (arch/x86/kernel/dumpstack.c:478 arch/x86/kernel/dumpstack.c:420) ? no_context (arch/x86/mm/fault.c:752) ? exc_page_fault (arch/x86/include/asm/irqflags.h:49 arch/x86/include/asm/irqflags.h:89 arch/x86/mm/fault.c:1435 arch/x86/mm/fault.c:1483) ? asm_exc_page_fault (arch/x86/include/asm/idtentry.h:571) ? fou_gro_receive (net/ipv4/fou.c:233) [fou] udp_gro_receive (include/linux/netdevice.h:2552 net/ipv4/udp_offload.c:559) udp4_gro_receive (net/ipv4/udp_offload.c:604) inet_gro_receive (net/ipv4/af_inet.c:1549 (discriminator 7)) dev_gro_receive (net/core/dev.c:6035 (discriminator 4)) napi_gro_receive (net/core/dev.c:6170) ena_clean_rx_irq (drivers/amazon/net/ena/ena_netdev.c:1558) [ena] ena_io_poll (drivers/amazon/net/ena/ena_netdev.c:1742) [ena] napi_poll (net/core/dev.c:6847) net_rx_action (net/core/dev.c:6917) __do_softirq (arch/x86/include/asm/jump_label.h:25 include/linux/jump_label.h:200 include/trace/events/irq.h:142 kernel/softirq.c:299) asm_call_irq_on_stack (arch/x86/entry/entry_64.S:809) do_softirq_own_stack (arch/x86/include/asm/irq_stack.h:27 arch/x86/include/asm/irq_stack.h:77 arch/x86/kernel/irq_64.c:77) irq_exit_rcu (kernel/softirq.c:393 kernel/softirq.c:423 kernel/softirq.c:435) common_interrupt (arch/x86/kernel/irq.c:239) asm_common_interrupt (arch/x86/include/asm/idtentry.h:626) RIP: 0010:acpi_idle_do_entry (arch/x86/include/asm/irqflags.h:49 arch/x86/include/asm/irqflags.h:89 drivers/acpi/processor_idle.c:114 drivers/acpi/processor_idle.c:575) Code: 8b 15 d1 3c c4 02 ed c3 cc cc cc cc 65 48 8b 04 25 40 ef 01 00 48 8b 00 a8 08 75 eb 0f 1f 44 00 00 0f 00 2d d5 09 55 00 fb f4 c3 cc cc cc cc e9 be fc ff ff 66 66 2e 0f 1f 84 00 00 00 00 00 RSP: 0018:ffffffffb5603e58 EFLAGS: 00000246 RAX: 0000000000004000 RBX: ffff93dac0929c00 RCX: ffff93daee833900 RDX: ffff93daee800000 RSI: ffff93daee87dc00 RDI: ffff93daee87dc64 RBP: 0000000000000001 R08: ffffffffb5e7b6c0 R09: 0000000000000044 R10: ffff93daee831b04 R11: 00000000000001cd R12: 0000000000000001 R13: ffffffffb5e7b740 R14: 0000000000000001 R15: 0000000000000000 ? sched_clock_cpu (kernel/sched/clock.c:371) acpi_idle_enter (drivers/acpi/processor_idle.c:712 (discriminator 3)) cpuidle_enter_state (drivers/cpuidle/cpuidle.c:237) cpuidle_enter (drivers/cpuidle/cpuidle.c:353) cpuidle_idle_call (kernel/sched/idle.c:158 kernel/sched/idle.c:239) do_idle (kernel/sched/idle.c:302) cpu_startup_entry (kernel/sched/idle.c:395 (discriminator 1)) start_kernel (init/main.c:1048) secondary_startup_64_no_verify (arch/x86/kernel/head_64.S:310) Modules linked in: udp_diag tcp_diag inet_diag nft_nat ipip tunnel4 dummy fou ip_tunnel nft_masq nft_chain_nat nf_nat wireguard nft_ct curve25519_x86_64 libcurve25519_generic nf_conntrack libchacha20poly1305 nf_defrag_ipv6 nf_defrag_ipv4 nft_objref chacha_x86_64 nft_counter nf_tables nfnetlink poly1305_x86_64 ip6_udp_tunnel udp_tunnel libchacha crc32_pclmul ghash_clmulni_intel aesni_intel crypto_simd cryptd glue_helper mousedev psmouse button ena ptp pps_core crc32c_intel CR2: 0000000000000008 Fixes: d92283e338f6 ("fou: change to use UDP socket GRO") Reported-by: Alphonse Kurian Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20240902173927.62706-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv4/fou_core.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/net/ipv4/fou_core.c b/net/ipv4/fou_core.c index 0abbc413e0fe..78b869b31492 100644 --- a/net/ipv4/fou_core.c +++ b/net/ipv4/fou_core.c @@ -50,7 +50,7 @@ struct fou_net { static inline struct fou *fou_from_sock(struct sock *sk) { - return sk->sk_user_data; + return rcu_dereference_sk_user_data(sk); } static int fou_recv_pull(struct sk_buff *skb, struct fou *fou, size_t len) @@ -233,9 +233,15 @@ static struct sk_buff *fou_gro_receive(struct sock *sk, struct sk_buff *skb) { const struct net_offload __rcu **offloads; - u8 proto = fou_from_sock(sk)->protocol; + struct fou *fou = fou_from_sock(sk); const struct net_offload *ops; struct sk_buff *pp = NULL; + u8 proto; + + if (!fou) + goto out; + + proto = fou->protocol; /* We can clear the encap_mark for FOU as we are essentially doing * one of two possible things. We are either adding an L4 tunnel @@ -263,14 +269,24 @@ static int fou_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) { const struct net_offload __rcu **offloads; - u8 proto = fou_from_sock(sk)->protocol; + struct fou *fou = fou_from_sock(sk); const struct net_offload *ops; - int err = -ENOSYS; + u8 proto; + int err; + + if (!fou) { + err = -ENOENT; + goto out; + } + + proto = fou->protocol; offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; ops = rcu_dereference(offloads[proto]); - if (WARN_ON(!ops || !ops->callbacks.gro_complete)) + if (WARN_ON(!ops || !ops->callbacks.gro_complete)) { + err = -ENOSYS; goto out; + } err = ops->callbacks.gro_complete(skb, nhoff); @@ -320,6 +336,9 @@ static struct sk_buff *gue_gro_receive(struct sock *sk, struct gro_remcsum grc; u8 proto; + if (!fou) + goto out; + skb_gro_remcsum_init(&grc); off = skb_gro_offset(skb); From 8487b4af59d4d7feda4b119dc2d92c67ca25c27e Mon Sep 17 00:00:00 2001 From: Hayes Wang Date: Tue, 3 Sep 2024 14:33:33 +0800 Subject: [PATCH 52/61] r8152: fix the firmware doesn't work generic_ocp_write() asks the parameter "size" must be 4 bytes align. Therefore, write the bp would fail, if the mac->bp_num is odd. Align the size to 4 for fixing it. The way may write an extra bp, but the rtl8152_is_fw_mac_ok() makes sure the value must be 0 for the bp whose index is more than mac->bp_num. That is, there is no influence for the firmware. Besides, I check the return value of generic_ocp_write() to make sure everything is correct. Fixes: e5c266a61186 ("r8152: set bp in bulk") Signed-off-by: Hayes Wang Link: https://patch.msgid.link/20240903063333.4502-1-hayeswang@realtek.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/r8152.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 15e12f46d0ea..a5612c799f5e 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -5178,14 +5178,23 @@ static void rtl8152_fw_mac_apply(struct r8152 *tp, struct fw_mac *mac) data = (u8 *)mac; data += __le16_to_cpu(mac->fw_offset); - generic_ocp_write(tp, __le16_to_cpu(mac->fw_reg), 0xff, length, data, - type); + if (generic_ocp_write(tp, __le16_to_cpu(mac->fw_reg), 0xff, length, + data, type) < 0) { + dev_err(&tp->intf->dev, "Write %s fw fail\n", + type ? "PLA" : "USB"); + return; + } ocp_write_word(tp, type, __le16_to_cpu(mac->bp_ba_addr), __le16_to_cpu(mac->bp_ba_value)); - generic_ocp_write(tp, __le16_to_cpu(mac->bp_start), BYTE_EN_DWORD, - __le16_to_cpu(mac->bp_num) << 1, mac->bp, type); + if (generic_ocp_write(tp, __le16_to_cpu(mac->bp_start), BYTE_EN_DWORD, + ALIGN(__le16_to_cpu(mac->bp_num) << 1, 4), + mac->bp, type) < 0) { + dev_err(&tp->intf->dev, "Write %s bp fail\n", + type ? "PLA" : "USB"); + return; + } bp_en_addr = __le16_to_cpu(mac->bp_en_addr); if (bp_en_addr) From bee2ef946d3184e99077be526567d791c473036f Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Tue, 3 Sep 2024 10:19:57 +0200 Subject: [PATCH 53/61] net: bridge: br_fdb_external_learn_add(): always set EXT_LEARN When userspace wants to take over a fdb entry by setting it as EXTERN_LEARNED, we set both flags BR_FDB_ADDED_BY_EXT_LEARN and BR_FDB_ADDED_BY_USER in br_fdb_external_learn_add(). If the bridge updates the entry later because its port changed, we clear the BR_FDB_ADDED_BY_EXT_LEARN flag, but leave the BR_FDB_ADDED_BY_USER flag set. If userspace then wants to take over the entry again, br_fdb_external_learn_add() sees that BR_FDB_ADDED_BY_USER and skips setting the BR_FDB_ADDED_BY_EXT_LEARN flags, thus silently ignores the update. Fix this by always allowing to set BR_FDB_ADDED_BY_EXT_LEARN regardless if this was a user fdb entry or not. Fixes: 710ae7287737 ("net: bridge: Mark FDB entries that were added by user as such") Signed-off-by: Jonas Gorski Acked-by: Nikolay Aleksandrov Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20240903081958.29951-1-jonas.gorski@bisdn.de Signed-off-by: Jakub Kicinski --- net/bridge/br_fdb.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index c77591e63841..ad7a42b505ef 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -1469,12 +1469,10 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, modified = true; } - if (test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags)) { + if (test_and_set_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags)) { /* Refresh entry */ fdb->used = jiffies; - } else if (!test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags)) { - /* Take over SW learned entry */ - set_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags); + } else { modified = true; } From 858430db28a5f5a11f8faa3a6fa805438e6f0851 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Tue, 3 Sep 2024 13:51:41 -0400 Subject: [PATCH 54/61] net: xilinx: axienet: Fix race in axienet_stop axienet_dma_err_handler can race with axienet_stop in the following manner: CPU 1 CPU 2 ====================== ================== axienet_stop() napi_disable() axienet_dma_stop() axienet_dma_err_handler() napi_disable() axienet_dma_stop() axienet_dma_start() napi_enable() cancel_work_sync() free_irq() Fix this by setting a flag in axienet_stop telling axienet_dma_err_handler not to bother doing anything. I chose not to use disable_work_sync to allow for easier backporting. Signed-off-by: Sean Anderson Fixes: 8a3b7a252dca ("drivers/net/ethernet/xilinx: added Xilinx AXI Ethernet driver") Link: https://patch.msgid.link/20240903175141.4132898-1-sean.anderson@linux.dev Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/xilinx/xilinx_axienet.h | 3 +++ drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 8 ++++++++ 2 files changed, 11 insertions(+) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet.h b/drivers/net/ethernet/xilinx/xilinx_axienet.h index 09c9f9787180..1223fcc1a8da 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet.h +++ b/drivers/net/ethernet/xilinx/xilinx_axienet.h @@ -436,6 +436,8 @@ struct skbuf_dma_descriptor { * @tx_bytes: TX byte count for statistics * @tx_stat_sync: Synchronization object for TX stats * @dma_err_task: Work structure to process Axi DMA errors + * @stopping: Set when @dma_err_task shouldn't do anything because we are + * about to stop the device. * @tx_irq: Axidma TX IRQ number * @rx_irq: Axidma RX IRQ number * @eth_irq: Ethernet core IRQ number @@ -507,6 +509,7 @@ struct axienet_local { struct u64_stats_sync tx_stat_sync; struct work_struct dma_err_task; + bool stopping; int tx_irq; int rx_irq; diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index 9aeb7b9f3ae4..9eb300fc3590 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -1460,6 +1460,7 @@ static int axienet_init_legacy_dma(struct net_device *ndev) struct axienet_local *lp = netdev_priv(ndev); /* Enable worker thread for Axi DMA error handling */ + lp->stopping = false; INIT_WORK(&lp->dma_err_task, axienet_dma_err_handler); napi_enable(&lp->napi_rx); @@ -1580,6 +1581,9 @@ static int axienet_stop(struct net_device *ndev) dev_dbg(&ndev->dev, "axienet_close()\n"); if (!lp->use_dmaengine) { + WRITE_ONCE(lp->stopping, true); + flush_work(&lp->dma_err_task); + napi_disable(&lp->napi_tx); napi_disable(&lp->napi_rx); } @@ -2154,6 +2158,10 @@ static void axienet_dma_err_handler(struct work_struct *work) dma_err_task); struct net_device *ndev = lp->ndev; + /* Don't bother if we are going to stop anyway */ + if (READ_ONCE(lp->stopping)) + return; + napi_disable(&lp->napi_tx); napi_disable(&lp->napi_rx); From c82299fbbccecf5866bdc3fa9cc46d5c6f5005ad Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 30 Aug 2024 10:14:42 -0700 Subject: [PATCH 55/61] docs: netdev: document guidance on cleanup.h Document what was discussed multiple times on list and various virtual / in-person conversations. guard() being okay in functions <= 20 LoC is a bit of my own invention. If the function is trivial it should be fine, but feel free to disagree :) We'll obviously revisit this guidance as time passes and we and other subsystems get more experience. Reviewed-by: Eric Dumazet Reviewed-by: Nikolay Aleksandrov Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20240830171443.3532077-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- Documentation/process/maintainer-netdev.rst | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/Documentation/process/maintainer-netdev.rst b/Documentation/process/maintainer-netdev.rst index 30d24eecdaaa..c9edf9e7362d 100644 --- a/Documentation/process/maintainer-netdev.rst +++ b/Documentation/process/maintainer-netdev.rst @@ -375,6 +375,22 @@ When working in existing code which uses nonstandard formatting make your code follow the most recent guidelines, so that eventually all code in the domain of netdev is in the preferred format. +Using device-managed and cleanup.h constructs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Netdev remains skeptical about promises of all "auto-cleanup" APIs, +including even ``devm_`` helpers, historically. They are not the preferred +style of implementation, merely an acceptable one. + +Use of ``guard()`` is discouraged within any function longer than 20 lines, +``scoped_guard()`` is considered more readable. Using normal lock/unlock is +still (weakly) preferred. + +Low level cleanup constructs (such as ``__free()``) can be used when building +APIs and helpers, especially scoped iterators. However, direct use of +``__free()`` within networking core and drivers is discouraged. +Similar guidance applies to declaring variables mid-function. + Resending after review ~~~~~~~~~~~~~~~~~~~~~~ From 546ea84d07e3e324644025e2aae2d12ea4c5896e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Tue, 3 Sep 2024 18:08:45 +0200 Subject: [PATCH 56/61] sched: sch_cake: fix bulk flow accounting logic for host fairness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In sch_cake, we keep track of the count of active bulk flows per host, when running in dst/src host fairness mode, which is used as the round-robin weight when iterating through flows. The count of active bulk flows is updated whenever a flow changes state. This has a peculiar interaction with the hash collision handling: when a hash collision occurs (after the set-associative hashing), the state of the hash bucket is simply updated to match the new packet that collided, and if host fairness is enabled, that also means assigning new per-host state to the flow. For this reason, the bulk flow counters of the host(s) assigned to the flow are decremented, before new state is assigned (and the counters, which may not belong to the same host anymore, are incremented again). Back when this code was introduced, the host fairness mode was always enabled, so the decrement was unconditional. When the configuration flags were introduced the *increment* was made conditional, but the *decrement* was not. Which of course can lead to a spurious decrement (and associated wrap-around to U16_MAX). AFAICT, when host fairness is disabled, the decrement and wrap-around happens as soon as a hash collision occurs (which is not that common in itself, due to the set-associative hashing). However, in most cases this is harmless, as the value is only used when host fairness mode is enabled. So in order to trigger an array overflow, sch_cake has to first be configured with host fairness disabled, and while running in this mode, a hash collision has to occur to cause the overflow. Then, the qdisc has to be reconfigured to enable host fairness, which leads to the array out-of-bounds because the wrapped-around value is retained and used as an array index. It seems that syzbot managed to trigger this, which is quite impressive in its own right. This patch fixes the issue by introducing the same conditional check on decrement as is used on increment. The original bug predates the upstreaming of cake, but the commit listed in the Fixes tag touched that code, meaning that this patch won't apply before that. Fixes: 712639929912 ("sch_cake: Make the dual modes fairer") Reported-by: syzbot+7fe7b81d602cc1e6b94d@syzkaller.appspotmail.com Signed-off-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20240903160846.20909-1-toke@redhat.com Signed-off-by: Paolo Abeni --- net/sched/sch_cake.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 9602dafe32e6..d2f49db70523 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -786,12 +786,15 @@ skip_hash: * queue, accept the collision, update the host tags. */ q->way_collisions++; - if (q->flows[outer_hash + k].set == CAKE_SET_BULK) { - q->hosts[q->flows[reduced_hash].srchost].srchost_bulk_flow_count--; - q->hosts[q->flows[reduced_hash].dsthost].dsthost_bulk_flow_count--; - } allocate_src = cake_dsrc(flow_mode); allocate_dst = cake_ddst(flow_mode); + + if (q->flows[outer_hash + k].set == CAKE_SET_BULK) { + if (allocate_src) + q->hosts[q->flows[reduced_hash].srchost].srchost_bulk_flow_count--; + if (allocate_dst) + q->hosts[q->flows[reduced_hash].dsthost].dsthost_bulk_flow_count--; + } found: /* reserve queue for future packets in same flow */ reduced_hash = outer_hash + k; From 8e69c96df771ab469cec278edb47009351de4da6 Mon Sep 17 00:00:00 2001 From: Pawel Dembicki Date: Tue, 3 Sep 2024 22:33:41 +0200 Subject: [PATCH 57/61] net: dsa: vsc73xx: fix possible subblocks range of CAPT block CAPT block (CPU Capture Buffer) have 7 sublocks: 0-3, 4, 6, 7. Function 'vsc73xx_is_addr_valid' allows to use only block 0 at this moment. This patch fix it. Fixes: 05bd97fc559d ("net: dsa: Add Vitesse VSC73xx DSA router driver") Signed-off-by: Pawel Dembicki Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20240903203340.1518789-1-paweldembicki@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/dsa/vitesse-vsc73xx-core.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/vitesse-vsc73xx-core.c b/drivers/net/dsa/vitesse-vsc73xx-core.c index e3f95d2cc2c1..212421e9d42e 100644 --- a/drivers/net/dsa/vitesse-vsc73xx-core.c +++ b/drivers/net/dsa/vitesse-vsc73xx-core.c @@ -36,7 +36,7 @@ #define VSC73XX_BLOCK_ANALYZER 0x2 /* Only subblock 0 */ #define VSC73XX_BLOCK_MII 0x3 /* Subblocks 0 and 1 */ #define VSC73XX_BLOCK_MEMINIT 0x3 /* Only subblock 2 */ -#define VSC73XX_BLOCK_CAPTURE 0x4 /* Only subblock 2 */ +#define VSC73XX_BLOCK_CAPTURE 0x4 /* Subblocks 0-4, 6, 7 */ #define VSC73XX_BLOCK_ARBITER 0x5 /* Only subblock 0 */ #define VSC73XX_BLOCK_SYSTEM 0x7 /* Only subblock 0 */ @@ -410,13 +410,19 @@ int vsc73xx_is_addr_valid(u8 block, u8 subblock) break; case VSC73XX_BLOCK_MII: - case VSC73XX_BLOCK_CAPTURE: case VSC73XX_BLOCK_ARBITER: switch (subblock) { case 0 ... 1: return 1; } break; + case VSC73XX_BLOCK_CAPTURE: + switch (subblock) { + case 0 ... 4: + case 6 ... 7: + return 1; + } + break; } return 0; From e4af74a53b7aa865e7fcc104630ebb7a9129b71f Mon Sep 17 00:00:00 2001 From: Jamie Bainbridge Date: Wed, 4 Sep 2024 16:12:26 +1000 Subject: [PATCH 58/61] selftests: net: enable bind tests bind_wildcard is compiled but not run, bind_timewait is not compiled. These two tests complete in a very short time, use the test harness properly, and seem reasonable to enable. The author of the tests confirmed via email that these were intended to be run. Enable these two tests. Fixes: 13715acf8ab5 ("selftest: Add test for bind() conflicts.") Fixes: 2c042e8e54ef ("tcp: Add selftest for bind() and TIME_WAIT.") Signed-off-by: Jamie Bainbridge Reviewed-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/5a009b26cf5fb1ad1512d89c61b37e2fac702323.1725430322.git.jamie.bainbridge@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 8eaffd7a641c..9d5aa817411b 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -85,7 +85,8 @@ TEST_GEN_PROGS += so_incoming_cpu TEST_PROGS += sctp_vrf.sh TEST_GEN_FILES += sctp_hello TEST_GEN_FILES += ip_local_port_range -TEST_GEN_FILES += bind_wildcard +TEST_GEN_PROGS += bind_wildcard +TEST_GEN_PROGS += bind_timewait TEST_PROGS += test_vxlan_mdb.sh TEST_PROGS += test_bridge_neigh_suppress.sh TEST_PROGS += test_vxlan_nolocalbypass.sh From 20d664ebd212a85ad9c134e557619402bee6987f Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Wed, 4 Sep 2024 13:18:55 +0000 Subject: [PATCH 59/61] MAINTAINERS: fix ptp ocp driver maintainers address While checking the latest series for ptp_ocp driver I realised that MAINTAINERS file has wrong item about email on linux.dev domain. Fixes: 795fd9342c62 ("ptp_ocp: adjust MAINTAINERS and mailmap") Signed-off-by: Vadim Fedorenko Reviewed-by: Simon Horman Link: https://patch.msgid.link/20240904131855.559078-1-vadim.fedorenko@linux.dev Signed-off-by: Jakub Kicinski --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 4502a16844e4..e6534d938c53 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17119,7 +17119,7 @@ F: include/dt-bindings/ OPENCOMPUTE PTP CLOCK DRIVER M: Jonathan Lemon -M: Vadim Fedorenko +M: Vadim Fedorenko L: netdev@vger.kernel.org S: Maintained F: drivers/ptp/ptp_ocp.c From 6fda63c45fe8a0870226c13dcce1cc21b7c4d508 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kubalewski Date: Wed, 4 Sep 2024 15:50:34 +0200 Subject: [PATCH 60/61] tools/net/ynl: fix cli.py --subscribe feature Execution of command: ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/dpll.yaml / --subscribe "monitor" --sleep 10 fails with: File "/repo/./tools/net/ynl/cli.py", line 109, in main ynl.check_ntf() File "/repo/tools/net/ynl/lib/ynl.py", line 924, in check_ntf op = self.rsp_by_value[nl_msg.cmd()] KeyError: 19 Parsing Generic Netlink notification messages performs lookup for op in the message. The message was not yet decoded, and is not yet considered GenlMsg, thus msg.cmd() returns Generic Netlink family id (19) instead of proper notification command id (i.e.: DPLL_CMD_PIN_CHANGE_NTF=13). Allow the op to be obtained within NetlinkProtocol.decode(..) itself if the op was not passed to the decode function, thus allow parsing of Generic Netlink notifications without causing the failure. Suggested-by: Donald Hunter Link: https://lore.kernel.org/netdev/m2le0n5xpn.fsf@gmail.com/ Fixes: 0a966d606c68 ("tools/net/ynl: Fix extack decoding for directional ops") Signed-off-by: Arkadiusz Kubalewski Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20240904135034.316033-1-arkadiusz.kubalewski@intel.com Signed-off-by: Jakub Kicinski --- tools/net/ynl/lib/ynl.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/net/ynl/lib/ynl.py b/tools/net/ynl/lib/ynl.py index d42c1d605969..c22c22bf2cb7 100644 --- a/tools/net/ynl/lib/ynl.py +++ b/tools/net/ynl/lib/ynl.py @@ -388,6 +388,8 @@ class NetlinkProtocol: def decode(self, ynl, nl_msg, op): msg = self._decode(nl_msg) + if op is None: + op = ynl.rsp_by_value[msg.cmd()] fixed_header_size = ynl._struct_size(op.fixed_header) msg.raw_attrs = NlAttrs(msg.raw, fixed_header_size) return msg @@ -921,8 +923,7 @@ class YnlFamily(SpecFamily): print("Netlink done while checking for ntf!?") continue - op = self.rsp_by_value[nl_msg.cmd()] - decoded = self.nlproto.decode(self, nl_msg, op) + decoded = self.nlproto.decode(self, nl_msg, None) if decoded.cmd() not in self.async_msg_ids: print("Unexpected msg id done while checking for ntf", decoded) continue @@ -980,7 +981,7 @@ class YnlFamily(SpecFamily): if nl_msg.extack: self._decode_extack(req_msg, op, nl_msg.extack) else: - op = self.rsp_by_value[nl_msg.cmd()] + op = None req_flags = [] if nl_msg.error: From 031ae72825cef43e4650140b800ad58bf7a6a466 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Sep 2024 14:44:18 +0000 Subject: [PATCH 61/61] ila: call nf_unregister_net_hooks() sooner syzbot found an use-after-free Read in ila_nf_input [1] Issue here is that ila_xlat_exit_net() frees the rhashtable, then call nf_unregister_net_hooks(). It should be done in the reverse way, with a synchronize_rcu(). This is a good match for a pre_exit() method. [1] BUG: KASAN: use-after-free in rht_key_hashfn include/linux/rhashtable.h:159 [inline] BUG: KASAN: use-after-free in __rhashtable_lookup include/linux/rhashtable.h:604 [inline] BUG: KASAN: use-after-free in rhashtable_lookup include/linux/rhashtable.h:646 [inline] BUG: KASAN: use-after-free in rhashtable_lookup_fast+0x77a/0x9b0 include/linux/rhashtable.h:672 Read of size 4 at addr ffff888064620008 by task ksoftirqd/0/16 CPU: 0 UID: 0 PID: 16 Comm: ksoftirqd/0 Not tainted 6.11.0-rc4-syzkaller-00238-g2ad6d23f465a #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 08/06/2024 Call Trace: __dump_stack lib/dump_stack.c:93 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:119 print_address_description mm/kasan/report.c:377 [inline] print_report+0x169/0x550 mm/kasan/report.c:488 kasan_report+0x143/0x180 mm/kasan/report.c:601 rht_key_hashfn include/linux/rhashtable.h:159 [inline] __rhashtable_lookup include/linux/rhashtable.h:604 [inline] rhashtable_lookup include/linux/rhashtable.h:646 [inline] rhashtable_lookup_fast+0x77a/0x9b0 include/linux/rhashtable.h:672 ila_lookup_wildcards net/ipv6/ila/ila_xlat.c:132 [inline] ila_xlat_addr net/ipv6/ila/ila_xlat.c:652 [inline] ila_nf_input+0x1fe/0x3c0 net/ipv6/ila/ila_xlat.c:190 nf_hook_entry_hookfn include/linux/netfilter.h:154 [inline] nf_hook_slow+0xc3/0x220 net/netfilter/core.c:626 nf_hook include/linux/netfilter.h:269 [inline] NF_HOOK+0x29e/0x450 include/linux/netfilter.h:312 __netif_receive_skb_one_core net/core/dev.c:5661 [inline] __netif_receive_skb+0x1ea/0x650 net/core/dev.c:5775 process_backlog+0x662/0x15b0 net/core/dev.c:6108 __napi_poll+0xcb/0x490 net/core/dev.c:6772 napi_poll net/core/dev.c:6841 [inline] net_rx_action+0x89b/0x1240 net/core/dev.c:6963 handle_softirqs+0x2c4/0x970 kernel/softirq.c:554 run_ksoftirqd+0xca/0x130 kernel/softirq.c:928 smpboot_thread_fn+0x544/0xa30 kernel/smpboot.c:164 kthread+0x2f0/0x390 kernel/kthread.c:389 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 The buggy address belongs to the physical page: page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x64620 flags: 0xfff00000000000(node=0|zone=1|lastcpupid=0x7ff) page_type: 0xbfffffff(buddy) raw: 00fff00000000000 ffffea0000959608 ffffea00019d9408 0000000000000000 raw: 0000000000000000 0000000000000003 00000000bfffffff 0000000000000000 page dumped because: kasan: bad access detected page_owner tracks the page as freed page last allocated via order 3, migratetype Unmovable, gfp_mask 0x52dc0(GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP|__GFP_ZERO), pid 5242, tgid 5242 (syz-executor), ts 73611328570, free_ts 618981657187 set_page_owner include/linux/page_owner.h:32 [inline] post_alloc_hook+0x1f3/0x230 mm/page_alloc.c:1493 prep_new_page mm/page_alloc.c:1501 [inline] get_page_from_freelist+0x2e4c/0x2f10 mm/page_alloc.c:3439 __alloc_pages_noprof+0x256/0x6c0 mm/page_alloc.c:4695 __alloc_pages_node_noprof include/linux/gfp.h:269 [inline] alloc_pages_node_noprof include/linux/gfp.h:296 [inline] ___kmalloc_large_node+0x8b/0x1d0 mm/slub.c:4103 __kmalloc_large_node_noprof+0x1a/0x80 mm/slub.c:4130 __do_kmalloc_node mm/slub.c:4146 [inline] __kmalloc_node_noprof+0x2d2/0x440 mm/slub.c:4164 __kvmalloc_node_noprof+0x72/0x190 mm/util.c:650 bucket_table_alloc lib/rhashtable.c:186 [inline] rhashtable_init_noprof+0x534/0xa60 lib/rhashtable.c:1071 ila_xlat_init_net+0xa0/0x110 net/ipv6/ila/ila_xlat.c:613 ops_init+0x359/0x610 net/core/net_namespace.c:139 setup_net+0x515/0xca0 net/core/net_namespace.c:343 copy_net_ns+0x4e2/0x7b0 net/core/net_namespace.c:508 create_new_namespaces+0x425/0x7b0 kernel/nsproxy.c:110 unshare_nsproxy_namespaces+0x124/0x180 kernel/nsproxy.c:228 ksys_unshare+0x619/0xc10 kernel/fork.c:3328 __do_sys_unshare kernel/fork.c:3399 [inline] __se_sys_unshare kernel/fork.c:3397 [inline] __x64_sys_unshare+0x38/0x40 kernel/fork.c:3397 page last free pid 11846 tgid 11846 stack trace: reset_page_owner include/linux/page_owner.h:25 [inline] free_pages_prepare mm/page_alloc.c:1094 [inline] free_unref_page+0xd22/0xea0 mm/page_alloc.c:2612 __folio_put+0x2c8/0x440 mm/swap.c:128 folio_put include/linux/mm.h:1486 [inline] free_large_kmalloc+0x105/0x1c0 mm/slub.c:4565 kfree+0x1c4/0x360 mm/slub.c:4588 rhashtable_free_and_destroy+0x7c6/0x920 lib/rhashtable.c:1169 ila_xlat_exit_net+0x55/0x110 net/ipv6/ila/ila_xlat.c:626 ops_exit_list net/core/net_namespace.c:173 [inline] cleanup_net+0x802/0xcc0 net/core/net_namespace.c:640 process_one_work kernel/workqueue.c:3231 [inline] process_scheduled_works+0xa2c/0x1830 kernel/workqueue.c:3312 worker_thread+0x86d/0xd40 kernel/workqueue.c:3390 kthread+0x2f0/0x390 kernel/kthread.c:389 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Memory state around the buggy address: ffff88806461ff00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff88806461ff80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff888064620000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ^ ffff888064620080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff888064620100: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff Fixes: 7f00feaf1076 ("ila: Add generic ILA translation facility") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Tom Herbert Reviewed-by: Florian Westphal Link: https://patch.msgid.link/20240904144418.1162839-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/ila/ila.h | 1 + net/ipv6/ila/ila_main.c | 6 ++++++ net/ipv6/ila/ila_xlat.c | 13 +++++++++---- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/net/ipv6/ila/ila.h b/net/ipv6/ila/ila.h index ad5f6f6ba333..85b92917849b 100644 --- a/net/ipv6/ila/ila.h +++ b/net/ipv6/ila/ila.h @@ -108,6 +108,7 @@ int ila_lwt_init(void); void ila_lwt_fini(void); int ila_xlat_init_net(struct net *net); +void ila_xlat_pre_exit_net(struct net *net); void ila_xlat_exit_net(struct net *net); int ila_xlat_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info); diff --git a/net/ipv6/ila/ila_main.c b/net/ipv6/ila/ila_main.c index 69caed07315f..976c78efbae1 100644 --- a/net/ipv6/ila/ila_main.c +++ b/net/ipv6/ila/ila_main.c @@ -71,6 +71,11 @@ ila_xlat_init_fail: return err; } +static __net_exit void ila_pre_exit_net(struct net *net) +{ + ila_xlat_pre_exit_net(net); +} + static __net_exit void ila_exit_net(struct net *net) { ila_xlat_exit_net(net); @@ -78,6 +83,7 @@ static __net_exit void ila_exit_net(struct net *net) static struct pernet_operations ila_net_ops = { .init = ila_init_net, + .pre_exit = ila_pre_exit_net, .exit = ila_exit_net, .id = &ila_net_id, .size = sizeof(struct ila_net), diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 67e8c9440977..534a4498e280 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -619,6 +619,15 @@ int ila_xlat_init_net(struct net *net) return 0; } +void ila_xlat_pre_exit_net(struct net *net) +{ + struct ila_net *ilan = net_generic(net, ila_net_id); + + if (ilan->xlat.hooks_registered) + nf_unregister_net_hooks(net, ila_nf_hook_ops, + ARRAY_SIZE(ila_nf_hook_ops)); +} + void ila_xlat_exit_net(struct net *net) { struct ila_net *ilan = net_generic(net, ila_net_id); @@ -626,10 +635,6 @@ void ila_xlat_exit_net(struct net *net) rhashtable_free_and_destroy(&ilan->xlat.rhash_table, ila_free_cb, NULL); free_bucket_spinlocks(ilan->xlat.locks); - - if (ilan->xlat.hooks_registered) - nf_unregister_net_hooks(net, ila_nf_hook_ops, - ARRAY_SIZE(ila_nf_hook_ops)); } static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila)