From 92c6058024e87087cf1b99b0389d67c0a886360e Mon Sep 17 00:00:00 2001 From: Slawomir Laba Date: Thu, 10 Sep 2020 07:57:04 +0000 Subject: [PATCH 01/55] i40e: Fix flow for IPv6 next header (extension header) When a packet contains an IPv6 header with next header which is an extension header and not a protocol one, the kernel function skb_transport_header called with such sk_buff will return a pointer to the extension header and not to the TCP one. The above explained call caused a problem with packet processing for skb with encapsulation for tunnel with I40E_TX_CTX_EXT_IP_IPV6. The extension header was not skipped at all. The ipv6_skip_exthdr function does check if next header of the IPV6 header is an extension header and doesn't modify the l4_proto pointer if it points to a protocol header value so its safe to omit the comparison of exthdr and l4.hdr pointers. The ipv6_skip_exthdr can return value -1. This means that the skipping process failed and there is something wrong with the packet so it will be dropped. Fixes: a3fd9d8876a5 ("i40e/i40evf: Handle IPv6 extension headers in checksum offload") Signed-off-by: Slawomir Laba Signed-off-by: Przemyslaw Patynowski Reviewed-by: Aleksandr Loktionov Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 4aca637d4a23..32d97315f3f5 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -3113,13 +3113,16 @@ static int i40e_tx_enable_csum(struct sk_buff *skb, u32 *tx_flags, l4_proto = ip.v4->protocol; } else if (*tx_flags & I40E_TX_FLAGS_IPV6) { + int ret; + tunnel |= I40E_TX_CTX_EXT_IP_IPV6; exthdr = ip.hdr + sizeof(*ip.v6); l4_proto = ip.v6->nexthdr; - if (l4.hdr != exthdr) - ipv6_skip_exthdr(skb, exthdr - skb->data, - &l4_proto, &frag_off); + ret = ipv6_skip_exthdr(skb, exthdr - skb->data, + &l4_proto, &frag_off); + if (ret < 0) + return -1; } /* define outer transport */ From 58cab46c622d6324e47bd1c533693c94498e4172 Mon Sep 17 00:00:00 2001 From: Keita Suzuki Date: Fri, 30 Oct 2020 07:14:30 +0000 Subject: [PATCH 02/55] i40e: Fix memory leak in i40e_probe Struct i40e_veb is allocated in function i40e_setup_pf_switch, and stored to an array field veb inside struct i40e_pf. However when i40e_setup_misc_vector fails, this memory leaks. Fix this by calling exit and teardown functions. Signed-off-by: Keita Suzuki Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 1db482d310c2..84916261f5df 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -15122,6 +15122,8 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (err) { dev_info(&pdev->dev, "setup of misc vector failed: %d\n", err); + i40e_cloud_filter_exit(pf); + i40e_fdir_teardown(pf); goto err_vsis; } } From d2c788f739b6f68090e968a2ee31b543701e795f Mon Sep 17 00:00:00 2001 From: Mateusz Palczewski Date: Fri, 20 Nov 2020 10:35:37 +0000 Subject: [PATCH 03/55] i40e: Add zero-initialization of AQ command structures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Zero-initialize AQ command data structures to comply with API specifications. Fixes: 2f4b411a3d67 ("i40e: Enable cloud filters via tc-flower") Fixes: f4492db16df8 ("i40e: Add NPAR BW get and set functions") Signed-off-by: Andrzej Sawuła Signed-off-by: Mateusz Palczewski Reviewed-by: Arkadiusz Kubalewski Reviewed-by: Aleksandr Loktionov Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_main.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 84916261f5df..90c6c991aebc 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -7667,6 +7667,8 @@ int i40e_add_del_cloud_filter(struct i40e_vsi *vsi, if (filter->flags >= ARRAY_SIZE(flag_table)) return I40E_ERR_CONFIG; + memset(&cld_filter, 0, sizeof(cld_filter)); + /* copy element needed to add cloud filter from filter */ i40e_set_cld_element(filter, &cld_filter); @@ -7734,6 +7736,8 @@ int i40e_add_del_cloud_filter_big_buf(struct i40e_vsi *vsi, !ipv6_addr_any(&filter->ip.v6.src_ip6)) return -EOPNOTSUPP; + memset(&cld_filter, 0, sizeof(cld_filter)); + /* copy element needed to add cloud filter from filter */ i40e_set_cld_element(filter, &cld_filter.element); @@ -11709,6 +11713,8 @@ i40e_status i40e_set_partition_bw_setting(struct i40e_pf *pf) struct i40e_aqc_configure_partition_bw_data bw_data; i40e_status status; + memset(&bw_data, 0, sizeof(bw_data)); + /* Set the valid bit for this PF */ bw_data.pf_valid_bits = cpu_to_le16(BIT(pf->hw.pf_id)); bw_data.max_bw[pf->hw.pf_id] = pf->max_bw & I40E_ALT_BW_VALUE_MASK; From 4cdb9f80dcd46aab3c0020b4a6920c22735c5d6e Mon Sep 17 00:00:00 2001 From: Mateusz Palczewski Date: Tue, 24 Nov 2020 15:08:27 +0000 Subject: [PATCH 04/55] i40e: Fix overwriting flow control settings during driver loading During driver loading flow control settings were written to FW using a variable which was always zero, since it was being set only by ethtool. This behavior has been corrected and driver no longer overwrites the default FW/NVM settings. Fixes: 373149fc99a0 ("i40e: Decrease the scope of rtnl lock") Signed-off-by: Dawid Lukwinski Signed-off-by: Mateusz Palczewski Reviewed-by: Aleksandr Loktionov Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_main.c | 27 --------------------- 1 file changed, 27 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 90c6c991aebc..53efb3a53df2 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -10005,7 +10005,6 @@ static void i40e_rebuild(struct i40e_pf *pf, bool reinit, bool lock_acquired) int old_recovery_mode_bit = test_bit(__I40E_RECOVERY_MODE, pf->state); struct i40e_vsi *vsi = pf->vsi[pf->lan_vsi]; struct i40e_hw *hw = &pf->hw; - u8 set_fc_aq_fail = 0; i40e_status ret; u32 val; int v; @@ -10131,13 +10130,6 @@ static void i40e_rebuild(struct i40e_pf *pf, bool reinit, bool lock_acquired) i40e_stat_str(&pf->hw, ret), i40e_aq_str(&pf->hw, pf->hw.aq.asq_last_status)); - /* make sure our flow control settings are restored */ - ret = i40e_set_fc(&pf->hw, &set_fc_aq_fail, true); - if (ret) - dev_dbg(&pf->pdev->dev, "setting flow control: ret = %s last_status = %s\n", - i40e_stat_str(&pf->hw, ret), - i40e_aq_str(&pf->hw, pf->hw.aq.asq_last_status)); - /* Rebuild the VSIs and VEBs that existed before reset. * They are still in our local switch element arrays, so only * need to rebuild the switch model in the HW. @@ -14720,7 +14712,6 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) int err; u32 val; u32 i; - u8 set_fc_aq_fail; err = pci_enable_device_mem(pdev); if (err) @@ -15054,24 +15045,6 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) } INIT_LIST_HEAD(&pf->vsi[pf->lan_vsi]->ch_list); - /* Make sure flow control is set according to current settings */ - err = i40e_set_fc(hw, &set_fc_aq_fail, true); - if (set_fc_aq_fail & I40E_SET_FC_AQ_FAIL_GET) - dev_dbg(&pf->pdev->dev, - "Set fc with err %s aq_err %s on get_phy_cap\n", - i40e_stat_str(hw, err), - i40e_aq_str(hw, hw->aq.asq_last_status)); - if (set_fc_aq_fail & I40E_SET_FC_AQ_FAIL_SET) - dev_dbg(&pf->pdev->dev, - "Set fc with err %s aq_err %s on set_phy_config\n", - i40e_stat_str(hw, err), - i40e_aq_str(hw, hw->aq.asq_last_status)); - if (set_fc_aq_fail & I40E_SET_FC_AQ_FAIL_UPDATE) - dev_dbg(&pf->pdev->dev, - "Set fc with err %s aq_err %s on get_link_info\n", - i40e_stat_str(hw, err), - i40e_aq_str(hw, hw->aq.asq_last_status)); - /* if FDIR VSI was set up, start it now */ for (i = 0; i < pf->num_alloc_vsi; i++) { if (pf->vsi[i] && pf->vsi[i]->type == I40E_VSI_FDIR) { From 28b1208e7a7fa3ddc9345b022bb93e53d9dcc28a Mon Sep 17 00:00:00 2001 From: Mateusz Palczewski Date: Fri, 27 Nov 2020 10:39:03 +0000 Subject: [PATCH 05/55] i40e: Fix addition of RX filters after enabling FW LLDP agent Fix addition of VLAN filter for PF after enabling FW LLDP agent. Changing LLDP Agent causes FW to re-initialize per NVM settings. Remove default PF filter and move "Enable/Disable" to currently used reset flag. Without this patch PF would try to add MAC VLAN filter with default switch filter present. This causes AQ error and sets promiscuous mode on. Fixes: c65e78f87f81 ("i40e: Further implementation of LLDP") Signed-off-by: Przemyslaw Patynowski Signed-off-by: Mateusz Palczewski Reviewed-by: Sylwester Dziedziuch Reviewed-by: Aleksandr Loktionov Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 16 +++++++++------- drivers/net/ethernet/intel/i40e/i40e_main.c | 9 ++++----- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c index 26ba1f3eb2d8..9e81f85ee2d8 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c @@ -4878,7 +4878,7 @@ static int i40e_set_priv_flags(struct net_device *dev, u32 flags) enum i40e_admin_queue_err adq_err; struct i40e_vsi *vsi = np->vsi; struct i40e_pf *pf = vsi->back; - bool is_reset_needed; + u32 reset_needed = 0; i40e_status status; u32 i, j; @@ -4923,9 +4923,11 @@ static int i40e_set_priv_flags(struct net_device *dev, u32 flags) flags_complete: changed_flags = orig_flags ^ new_flags; - is_reset_needed = !!(changed_flags & (I40E_FLAG_VEB_STATS_ENABLED | - I40E_FLAG_LEGACY_RX | I40E_FLAG_SOURCE_PRUNING_DISABLED | - I40E_FLAG_DISABLE_FW_LLDP)); + if (changed_flags & I40E_FLAG_DISABLE_FW_LLDP) + reset_needed = I40E_PF_RESET_AND_REBUILD_FLAG; + if (changed_flags & (I40E_FLAG_VEB_STATS_ENABLED | + I40E_FLAG_LEGACY_RX | I40E_FLAG_SOURCE_PRUNING_DISABLED)) + reset_needed = BIT(__I40E_PF_RESET_REQUESTED); /* Before we finalize any flag changes, we need to perform some * checks to ensure that the changes are supported and safe. @@ -5057,7 +5059,7 @@ flags_complete: case I40E_AQ_RC_EEXIST: dev_warn(&pf->pdev->dev, "FW LLDP agent is already running\n"); - is_reset_needed = false; + reset_needed = 0; break; case I40E_AQ_RC_EPERM: dev_warn(&pf->pdev->dev, @@ -5086,8 +5088,8 @@ flags_complete: /* Issue reset to cause things to take effect, as additional bits * are added we will need to create a mask of bits requiring reset */ - if (is_reset_needed) - i40e_do_reset(pf, BIT(__I40E_PF_RESET_REQUESTED), true); + if (reset_needed) + i40e_do_reset(pf, reset_needed, true); return 0; } diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 53efb3a53df2..3505d641660b 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -8537,11 +8537,6 @@ void i40e_do_reset(struct i40e_pf *pf, u32 reset_flags, bool lock_acquired) dev_dbg(&pf->pdev->dev, "PFR requested\n"); i40e_handle_reset_warning(pf, lock_acquired); - dev_info(&pf->pdev->dev, - pf->flags & I40E_FLAG_DISABLE_FW_LLDP ? - "FW LLDP is disabled\n" : - "FW LLDP is enabled\n"); - } else if (reset_flags & I40E_PF_RESET_AND_REBUILD_FLAG) { /* Request a PF Reset * @@ -8549,6 +8544,10 @@ void i40e_do_reset(struct i40e_pf *pf, u32 reset_flags, bool lock_acquired) */ i40e_prep_for_reset(pf, lock_acquired); i40e_reset_and_rebuild(pf, true, lock_acquired); + dev_info(&pf->pdev->dev, + pf->flags & I40E_FLAG_DISABLE_FW_LLDP ? + "FW LLDP is disabled\n" : + "FW LLDP is enabled\n"); } else if (reset_flags & BIT_ULL(__I40E_REINIT_REQUESTED)) { int v; From dc8812626440fa6a27f1f3f654f6dc435e042e42 Mon Sep 17 00:00:00 2001 From: Sylwester Dziedziuch Date: Fri, 27 Nov 2020 11:23:01 +0000 Subject: [PATCH 06/55] i40e: Fix VFs not created When creating VFs they were sometimes not getting resources. It was caused by not executing i40e_reset_all_vfs due to flag __I40E_VF_DISABLE being set on PF. Because of this IAVF was never able to finish setup sequence never getting reset indication from PF. Changed test_and_set_bit __I40E_VF_DISABLE in i40e_sync_filters_subtask to test_bit and removed clear_bit. This function should not set this bit it should only check if it hasn't been already set. Fixes: a7542b876075 ("i40e: check __I40E_VF_DISABLE bit in i40e_sync_filters_subtask") Signed-off-by: Sylwester Dziedziuch Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 3505d641660b..2e22ab5a0f9a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -2616,7 +2616,7 @@ static void i40e_sync_filters_subtask(struct i40e_pf *pf) return; if (!test_and_clear_bit(__I40E_MACVLAN_SYNC_PENDING, pf->state)) return; - if (test_and_set_bit(__I40E_VF_DISABLE, pf->state)) { + if (test_bit(__I40E_VF_DISABLE, pf->state)) { set_bit(__I40E_MACVLAN_SYNC_PENDING, pf->state); return; } @@ -2634,7 +2634,6 @@ static void i40e_sync_filters_subtask(struct i40e_pf *pf) } } } - clear_bit(__I40E_VF_DISABLE, pf->state); } /** From 61c1e0eb8375def7c891bfe857bb795a57090526 Mon Sep 17 00:00:00 2001 From: Mateusz Palczewski Date: Mon, 28 Dec 2020 11:38:00 +0100 Subject: [PATCH 07/55] i40e: Fix add TC filter for IPv6 Fix insufficient distinction between IPv4 and IPv6 addresses when creating a filter. IPv4 and IPv6 are kept in the same memory area. If IPv6 is added, then it's caught by IPv4 check, which leads to err -95. Fixes: 2f4b411a3d67 ("i40e: Enable cloud filters via tc-flower") Signed-off-by: Grzegorz Szczurek Signed-off-by: Mateusz Palczewski Reviewed-by: Jaroslaw Gawin Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_main.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 2e22ab5a0f9a..3e4a4d6f0419 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -7731,7 +7731,8 @@ int i40e_add_del_cloud_filter_big_buf(struct i40e_vsi *vsi, return -EOPNOTSUPP; /* adding filter using src_port/src_ip is not supported at this stage */ - if (filter->src_port || filter->src_ipv4 || + if (filter->src_port || + (filter->src_ipv4 && filter->n_proto != ETH_P_IPV6) || !ipv6_addr_any(&filter->ip.v6.src_ip6)) return -EOPNOTSUPP; @@ -7760,7 +7761,7 @@ int i40e_add_del_cloud_filter_big_buf(struct i40e_vsi *vsi, cpu_to_le16(I40E_AQC_ADD_CLOUD_FILTER_MAC_VLAN_PORT); } - } else if (filter->dst_ipv4 || + } else if ((filter->dst_ipv4 && filter->n_proto != ETH_P_IPV6) || !ipv6_addr_any(&filter->ip.v6.dst_ip6)) { cld_filter.element.flags = cpu_to_le16(I40E_AQC_ADD_CLOUD_FILTER_IP_PORT); From b32cddd2247cf730731f93f1967d0147a40682c7 Mon Sep 17 00:00:00 2001 From: Norbert Ciosek Date: Fri, 5 Feb 2021 08:48:52 +0000 Subject: [PATCH 08/55] i40e: Fix endianness conversions Fixes the following sparse warnings: i40e_main.c:5953:32: warning: cast from restricted __le16 i40e_main.c:8008:29: warning: incorrect type in assignment (different base types) i40e_main.c:8008:29: expected unsigned int [assigned] [usertype] ipa i40e_main.c:8008:29: got restricted __le32 [usertype] i40e_main.c:8008:29: warning: incorrect type in assignment (different base types) i40e_main.c:8008:29: expected unsigned int [assigned] [usertype] ipa i40e_main.c:8008:29: got restricted __le32 [usertype] i40e_txrx.c:1950:59: warning: incorrect type in initializer (different base types) i40e_txrx.c:1950:59: expected unsigned short [usertype] vlan_tag i40e_txrx.c:1950:59: got restricted __le16 [usertype] l2tag1 i40e_txrx.c:1953:40: warning: cast to restricted __le16 i40e_xsk.c:448:38: warning: invalid assignment: |= i40e_xsk.c:448:38: left side has type restricted __le64 i40e_xsk.c:448:38: right side has type int Fixes: 2f4b411a3d67 ("i40e: Enable cloud filters via tc-flower") Fixes: 2a508c64ad27 ("i40e: fix VLAN.TCI == 0 RX HW offload") Fixes: 3106c580fb7c ("i40e: Use batched xsk Tx interfaces to increase performance") Fixes: 8f88b3034db3 ("i40e: Add infrastructure for queue channel support") Signed-off-by: Norbert Ciosek Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_main.c | 12 ++++++------ drivers/net/ethernet/intel/i40e/i40e_txrx.c | 2 +- drivers/net/ethernet/intel/i40e/i40e_xsk.c | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 3e4a4d6f0419..4a2d03cada01 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -5920,7 +5920,7 @@ static int i40e_add_channel(struct i40e_pf *pf, u16 uplink_seid, ch->enabled_tc = !i40e_is_channel_macvlan(ch) && enabled_tc; ch->seid = ctxt.seid; ch->vsi_number = ctxt.vsi_number; - ch->stat_counter_idx = cpu_to_le16(ctxt.info.stat_counter_idx); + ch->stat_counter_idx = le16_to_cpu(ctxt.info.stat_counter_idx); /* copy just the sections touched not the entire info * since not all sections are valid as returned by @@ -7599,8 +7599,8 @@ static inline void i40e_set_cld_element(struct i40e_cloud_filter *filter, struct i40e_aqc_cloud_filters_element_data *cld) { - int i, j; u32 ipa; + int i; memset(cld, 0, sizeof(*cld)); ether_addr_copy(cld->outer_mac, filter->dst_mac); @@ -7611,14 +7611,14 @@ i40e_set_cld_element(struct i40e_cloud_filter *filter, if (filter->n_proto == ETH_P_IPV6) { #define IPV6_MAX_INDEX (ARRAY_SIZE(filter->dst_ipv6) - 1) - for (i = 0, j = 0; i < ARRAY_SIZE(filter->dst_ipv6); - i++, j += 2) { + for (i = 0; i < ARRAY_SIZE(filter->dst_ipv6); i++) { ipa = be32_to_cpu(filter->dst_ipv6[IPV6_MAX_INDEX - i]); - ipa = cpu_to_le32(ipa); - memcpy(&cld->ipaddr.raw_v6.data[j], &ipa, sizeof(ipa)); + + *(__le32 *)&cld->ipaddr.raw_v6.data[i * 2] = cpu_to_le32(ipa); } } else { ipa = be32_to_cpu(filter->dst_ipv4); + memcpy(&cld->ipaddr.v4.data, &ipa, sizeof(ipa)); } diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 32d97315f3f5..903d4e8cb0a1 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -1793,7 +1793,7 @@ void i40e_process_skb_fields(struct i40e_ring *rx_ring, skb_record_rx_queue(skb, rx_ring->queue_index); if (qword & BIT(I40E_RX_DESC_STATUS_L2TAG1P_SHIFT)) { - u16 vlan_tag = rx_desc->wb.qword0.lo_dword.l2tag1; + __le16 vlan_tag = rx_desc->wb.qword0.lo_dword.l2tag1; __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), le16_to_cpu(vlan_tag)); diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index 492ce213208d..37a21fb99922 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -444,7 +444,7 @@ static void i40e_set_rs_bit(struct i40e_ring *xdp_ring) struct i40e_tx_desc *tx_desc; tx_desc = I40E_TX_DESC(xdp_ring, ntu); - tx_desc->cmd_type_offset_bsz |= (I40E_TX_DESC_CMD_RS << I40E_TXD_QW1_CMD_SHIFT); + tx_desc->cmd_type_offset_bsz |= cpu_to_le64(I40E_TX_DESC_CMD_RS << I40E_TXD_QW1_CMD_SHIFT); } /** From 3a2eb515d1367c0f667b76089a6e727279c688b8 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 19 Feb 2021 12:56:32 +0300 Subject: [PATCH 09/55] octeontx2-af: Fix an off by one in rvu_dbg_qsize_write() This code does not allocate enough memory for the NUL terminator so it ends up putting it one character beyond the end of the buffer. Fixes: 8756828a8148 ("octeontx2-af: Add NPA aura and pool contexts to debugfs") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c index 094124b695dc..aa2ca8780b9c 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c @@ -473,7 +473,7 @@ static ssize_t rvu_dbg_qsize_write(struct file *filp, u16 pcifunc; int ret, lf; - cmd_buf = memdup_user(buffer, count); + cmd_buf = memdup_user(buffer, count + 1); if (IS_ERR(cmd_buf)) return -ENOMEM; From 7dcf7aa01c7b9f18727cbe0f9cb4136f1c6cdcc2 Mon Sep 17 00:00:00 2001 From: Dave Ertman Date: Wed, 2 Sep 2020 08:53:44 -0700 Subject: [PATCH 10/55] ice: report correct max number of TCs In the driver currently, we are reporting max number of TCs to the DCBNL callback as a kernel define set to 8. This is preventing userspace applications performing DCBx to correctly down map the TCs from requested to actual values. Report the actual max TC value to userspace from the capability struct. Fixes: b94b013eb626 ("ice: Implement DCBNL support") Signed-off-by: Dave Ertman Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_dcb_nl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_nl.c b/drivers/net/ethernet/intel/ice/ice_dcb_nl.c index fcfefad00d1c..299bf7edbf82 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb_nl.c +++ b/drivers/net/ethernet/intel/ice/ice_dcb_nl.c @@ -134,7 +134,7 @@ ice_dcbnl_getnumtcs(struct net_device *dev, int __always_unused tcid, u8 *num) if (!test_bit(ICE_FLAG_DCB_CAPABLE, pf->flags)) return -EINVAL; - *num = IEEE_8021QAZ_MAX_TCS; + *num = pf->hw.func_caps.common_cap.maxtc; return 0; } From 37b52be260024069f7f5bdcf304b5d72f77b022a Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Thu, 17 Sep 2020 13:13:37 -0700 Subject: [PATCH 11/55] ice: Set trusted VF as default VSI when setting allmulti on Currently the PF will only set a trusted VF as the default VSI when it requests FLAG_VF_UNICAST_PROMISC over VIRTCHNL. However, when FLAG_VF_MULTICAST_PROMISC is set it's expected that the trusted VF will see multicast packets that don't have a matching destination MAC in the devices internal switch. Fix this by setting the trusted VF as the default VSI if either FLAG_VF_UNICAST_PROMISC or FLAG_VF_MULTICAST_PROMISC is set. Fixes: 01b5e89aab49 ("ice: Add VF promiscuous support") Signed-off-by: Brett Creeley Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c index bf5fd812ea0e..07fae37a78be 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c @@ -2420,7 +2420,7 @@ static int ice_vc_cfg_promiscuous_mode_msg(struct ice_vf *vf, u8 *msg) } if (!test_bit(ICE_FLAG_VF_TRUE_PROMISC_ENA, pf->flags)) { - bool set_dflt_vsi = !!(info->flags & FLAG_VF_UNICAST_PROMISC); + bool set_dflt_vsi = alluni || allmulti; if (set_dflt_vsi && !ice_is_dflt_vsi_in_use(pf->first_sw)) /* only attempt to set the default forwarding VSI if From a6aa7c8f998f4afddd73410aa043dad38162ce9e Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Thu, 17 Sep 2020 13:13:38 -0700 Subject: [PATCH 12/55] ice: Account for port VLAN in VF max packet size calculation Currently if an AVF driver doesn't account for the possibility of a port VLAN when determining its max packet size then packets at MTU will be dropped. It is not the VF driver's responsibility to account for a port VLAN so fix this. To fix this, do the following: 1. Add a function that determines the max packet size a VF is allowed by using the port's max packet size and whether the VF is in a port VLAN. If a port VLAN is configured then a VF's max packet size will always be the port's max packet size minus VLAN_HLEN. Otherwise it will be the port's max packet size. 2. Use this function to verify the max packet size from the VF. 3. If there is a port VLAN configured then add 4 bytes (VLAN_HLEN) to the VF's max packet size configuration. Also, the VIRTCHNL_OP_GET_VF_RESOURCES message provides the capability to communicate a VF's max packet size. Use the new function for this purpose. Fixes: 1071a8358a28 ("ice: Implement virtchnl commands for AVF support") Signed-off-by: Brett Creeley Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- .../net/ethernet/intel/ice/ice_virtchnl_pf.c | 33 ++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c index 07fae37a78be..1f38a8d0c525 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c @@ -1918,6 +1918,29 @@ static int ice_vc_get_ver_msg(struct ice_vf *vf, u8 *msg) sizeof(struct virtchnl_version_info)); } +/** + * ice_vc_get_max_frame_size - get max frame size allowed for VF + * @vf: VF used to determine max frame size + * + * Max frame size is determined based on the current port's max frame size and + * whether a port VLAN is configured on this VF. The VF is not aware whether + * it's in a port VLAN so the PF needs to account for this in max frame size + * checks and sending the max frame size to the VF. + */ +static u16 ice_vc_get_max_frame_size(struct ice_vf *vf) +{ + struct ice_vsi *vsi = vf->pf->vsi[vf->lan_vsi_idx]; + struct ice_port_info *pi = vsi->port_info; + u16 max_frame_size; + + max_frame_size = pi->phy.link_info.max_frame_size; + + if (vf->port_vlan_info) + max_frame_size -= VLAN_HLEN; + + return max_frame_size; +} + /** * ice_vc_get_vf_res_msg * @vf: pointer to the VF info @@ -2000,6 +2023,7 @@ static int ice_vc_get_vf_res_msg(struct ice_vf *vf, u8 *msg) vfres->max_vectors = pf->num_msix_per_vf; vfres->rss_key_size = ICE_VSIQF_HKEY_ARRAY_SIZE; vfres->rss_lut_size = ICE_VSIQF_HLUT_ARRAY_SIZE; + vfres->max_mtu = ice_vc_get_max_frame_size(vf); vfres->vsi_res[0].vsi_id = vf->lan_vsi_num; vfres->vsi_res[0].vsi_type = VIRTCHNL_VSI_SRIOV; @@ -2998,6 +3022,8 @@ static int ice_vc_cfg_qs_msg(struct ice_vf *vf, u8 *msg) /* copy Rx queue info from VF into VSI */ if (qpi->rxq.ring_len > 0) { + u16 max_frame_size = ice_vc_get_max_frame_size(vf); + num_rxq++; vsi->rx_rings[i]->dma = qpi->rxq.dma_ring_addr; vsi->rx_rings[i]->count = qpi->rxq.ring_len; @@ -3010,7 +3036,7 @@ static int ice_vc_cfg_qs_msg(struct ice_vf *vf, u8 *msg) } vsi->rx_buf_len = qpi->rxq.databuffer_size; vsi->rx_rings[i]->rx_buf_len = vsi->rx_buf_len; - if (qpi->rxq.max_pkt_size >= (16 * 1024) || + if (qpi->rxq.max_pkt_size > max_frame_size || qpi->rxq.max_pkt_size < 64) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto error_param; @@ -3018,6 +3044,11 @@ static int ice_vc_cfg_qs_msg(struct ice_vf *vf, u8 *msg) } vsi->max_frame = qpi->rxq.max_pkt_size; + /* add space for the port VLAN since the VF driver is not + * expected to account for it in the MTU calculation + */ + if (vf->port_vlan_info) + vsi->max_frame += VLAN_HLEN; } /* VF can request to configure less than allocated queues or default From 0d4907f65dc8fc5e897ad19956fca1acb3b33bc8 Mon Sep 17 00:00:00 2001 From: Dave Ertman Date: Fri, 20 Nov 2020 16:38:35 -0800 Subject: [PATCH 13/55] ice: Fix state bits on LLDP mode switch DCBX_CAP bits were not being adjusted when switching between SW and FW controlled LLDP. Adjust bits to correctly indicate which mode the LLDP logic is in. Fixes: b94b013eb626 ("ice: Implement DCBNL support") Signed-off-by: Dave Ertman Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice.h | 2 -- drivers/net/ethernet/intel/ice/ice_dcb_nl.c | 4 ++++ drivers/net/ethernet/intel/ice/ice_ethtool.c | 7 +++++++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index dae8280ce17c..357706444dd5 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -454,9 +454,7 @@ struct ice_pf { struct ice_hw_port_stats stats_prev; struct ice_hw hw; u8 stat_prev_loaded:1; /* has previous stats been loaded */ -#ifdef CONFIG_DCB u16 dcbx_cap; -#endif /* CONFIG_DCB */ u32 tx_timeout_count; unsigned long tx_timeout_last_recovery; u32 tx_timeout_recovery_level; diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_nl.c b/drivers/net/ethernet/intel/ice/ice_dcb_nl.c index 299bf7edbf82..468a63f7eff9 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb_nl.c +++ b/drivers/net/ethernet/intel/ice/ice_dcb_nl.c @@ -159,6 +159,10 @@ static u8 ice_dcbnl_setdcbx(struct net_device *netdev, u8 mode) struct ice_pf *pf = ice_netdev_to_pf(netdev); struct ice_qos_cfg *qos_cfg; + /* if FW LLDP agent is running, DCBNL not allowed to change mode */ + if (test_bit(ICE_FLAG_FW_LLDP_AGENT, pf->flags)) + return ICE_DCB_NO_HW_CHG; + /* No support for LLD_MANAGED modes or CEE+IEEE */ if ((mode & DCB_CAP_DCBX_LLD_MANAGED) || ((mode & DCB_CAP_DCBX_VER_IEEE) && (mode & DCB_CAP_DCBX_VER_CEE)) || diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index 5636c9b23896..4001857788f8 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -8,6 +8,7 @@ #include "ice_fltr.h" #include "ice_lib.h" #include "ice_dcb_lib.h" +#include struct ice_stats { char stat_string[ETH_GSTRING_LEN]; @@ -1238,6 +1239,9 @@ static int ice_set_priv_flags(struct net_device *netdev, u32 flags) status = ice_init_pf_dcb(pf, true); if (status) dev_warn(dev, "Fail to init DCB\n"); + + pf->dcbx_cap &= ~DCB_CAP_DCBX_LLD_MANAGED; + pf->dcbx_cap |= DCB_CAP_DCBX_HOST; } else { enum ice_status status; bool dcbx_agent_status; @@ -1280,6 +1284,9 @@ static int ice_set_priv_flags(struct net_device *netdev, u32 flags) if (status) dev_dbg(dev, "Fail to enable MIB change events\n"); + pf->dcbx_cap &= ~DCB_CAP_DCBX_HOST; + pf->dcbx_cap |= DCB_CAP_DCBX_LLD_MANAGED; + ice_nway_reset(netdev); } } From 0393e46ac48a6832b1011c233ebcef84f8dbe4f5 Mon Sep 17 00:00:00 2001 From: Henry Tieman Date: Thu, 3 Dec 2020 09:20:24 -0800 Subject: [PATCH 14/55] ice: update the number of available RSS queues It was possible to have Rx queues that were not available for use by RSS. This would happen when increasing the number of Rx queues while there was a user defined RSS LUT. Always update the number of available RSS queues when changing the number of Rx queues. Fixes: 87324e747fde ("ice: Implement ethtool ops for channels") Signed-off-by: Henry Tieman Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_ethtool.c | 27 ++++++++++++++------ 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index 4001857788f8..2dcfa685b763 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -3328,6 +3328,18 @@ ice_get_channels(struct net_device *dev, struct ethtool_channels *ch) ch->max_other = ch->other_count; } +/** + * ice_get_valid_rss_size - return valid number of RSS queues + * @hw: pointer to the HW structure + * @new_size: requested RSS queues + */ +static int ice_get_valid_rss_size(struct ice_hw *hw, int new_size) +{ + struct ice_hw_common_caps *caps = &hw->func_caps.common_cap; + + return min_t(int, new_size, BIT(caps->rss_table_entry_width)); +} + /** * ice_vsi_set_dflt_rss_lut - set default RSS LUT with requested RSS size * @vsi: VSI to reconfigure RSS LUT on @@ -3355,14 +3367,10 @@ static int ice_vsi_set_dflt_rss_lut(struct ice_vsi *vsi, int req_rss_size) return -ENOMEM; /* set RSS LUT parameters */ - if (!test_bit(ICE_FLAG_RSS_ENA, pf->flags)) { + if (!test_bit(ICE_FLAG_RSS_ENA, pf->flags)) vsi->rss_size = 1; - } else { - struct ice_hw_common_caps *caps = &hw->func_caps.common_cap; - - vsi->rss_size = min_t(int, req_rss_size, - BIT(caps->rss_table_entry_width)); - } + else + vsi->rss_size = ice_get_valid_rss_size(hw, req_rss_size); /* create/set RSS LUT */ ice_fill_rss_lut(lut, vsi->rss_table_size, vsi->rss_size); @@ -3441,9 +3449,12 @@ static int ice_set_channels(struct net_device *dev, struct ethtool_channels *ch) ice_vsi_recfg_qs(vsi, new_rx, new_tx); - if (new_rx && !netif_is_rxfh_configured(dev)) + if (!netif_is_rxfh_configured(dev)) return ice_vsi_set_dflt_rss_lut(vsi, new_rx); + /* Update rss_size due to change in Rx queues */ + vsi->rss_size = ice_get_valid_rss_size(&pf->hw, new_rx); + return 0; } From 04b385f325080157ab1b5f8ce1b1de07ce0d9e27 Mon Sep 17 00:00:00 2001 From: DENG Qingfang Date: Thu, 18 Feb 2021 11:45:14 +0800 Subject: [PATCH 15/55] net: ag71xx: remove unnecessary MTU reservation 2 bytes of the MTU are reserved for Atheros DSA tag, but DSA core has already handled that since commit dc0fe7d47f9f. Remove the unnecessary reservation. Fixes: d51b6ce441d3 ("net: ethernet: add ag71xx driver") Signed-off-by: DENG Qingfang Reviewed-by: Oleksij Rempel Link: https://lore.kernel.org/r/20210218034514.3421-1-dqfext@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/atheros/ag71xx.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/atheros/ag71xx.c b/drivers/net/ethernet/atheros/ag71xx.c index dd5c8a9038bb..a60ce9030581 100644 --- a/drivers/net/ethernet/atheros/ag71xx.c +++ b/drivers/net/ethernet/atheros/ag71xx.c @@ -223,8 +223,6 @@ #define AG71XX_REG_RX_SM 0x01b0 #define AG71XX_REG_TX_SM 0x01b4 -#define ETH_SWITCH_HEADER_LEN 2 - #define AG71XX_DEFAULT_MSG_ENABLE \ (NETIF_MSG_DRV \ | NETIF_MSG_PROBE \ @@ -933,7 +931,7 @@ static void ag71xx_hw_setup(struct ag71xx *ag) static unsigned int ag71xx_max_frame_len(unsigned int mtu) { - return ETH_SWITCH_HEADER_LEN + ETH_HLEN + VLAN_HLEN + mtu + ETH_FCS_LEN; + return ETH_HLEN + VLAN_HLEN + mtu + ETH_FCS_LEN; } static void ag71xx_hw_set_macaddr(struct ag71xx *ag, unsigned char *mac) From 433dfc99aa3e0acbf655b961d98eb690162f758f Mon Sep 17 00:00:00 2001 From: Camelia Groza Date: Thu, 18 Feb 2021 20:21:06 +0200 Subject: [PATCH 16/55] dpaa_eth: fix the access method for the dpaa_napi_portal The current use of container_of is flawed and unnecessary. Obtain the dpaa_napi_portal reference from the private percpu data instead. Fixes: a1e031ffb422 ("dpaa_eth: add XDP_REDIRECT support") Reported-by: Sascha Hauer Signed-off-by: Camelia Groza Acked-by: Madalin Bucur Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/r/20210218182106.22613-1-camelia.groza@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index ccfe52a50a66..720dc99bd1fc 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -2670,7 +2670,6 @@ static enum qman_cb_dqrr_result rx_default_dqrr(struct qman_portal *portal, u32 hash; u64 ns; - np = container_of(&portal, struct dpaa_napi_portal, p); dpaa_fq = container_of(fq, struct dpaa_fq, fq_base); fd_status = be32_to_cpu(fd->status); fd_format = qm_fd_get_format(fd); @@ -2685,6 +2684,7 @@ static enum qman_cb_dqrr_result rx_default_dqrr(struct qman_portal *portal, percpu_priv = this_cpu_ptr(priv->percpu_priv); percpu_stats = &percpu_priv->stats; + np = &percpu_priv->np; if (unlikely(dpaa_eth_napi_schedule(percpu_priv, portal, sched_napi))) return qman_cb_dqrr_stop; From 0a8a800027f124845c3ce0b5c3dfed6f268b13bb Mon Sep 17 00:00:00 2001 From: Stefan Chulski Date: Thu, 18 Feb 2021 14:42:03 +0200 Subject: [PATCH 17/55] net: mvpp2: skip RSS configurations on loopback port PPv2 loopback port doesn't support RSS, so we should skip RSS configurations for this port. Signed-off-by: Stefan Chulski Reviewed-by: Marcin Wojtas Link: https://lore.kernel.org/r/1613652123-19021-1-git-send-email-stefanc@marvell.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/marvell/mvpp2/mvpp2_main.c | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 0507369bb54d..1767c60056c5 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -4699,9 +4699,10 @@ static void mvpp2_irqs_deinit(struct mvpp2_port *port) } } -static bool mvpp22_rss_is_supported(void) +static bool mvpp22_rss_is_supported(struct mvpp2_port *port) { - return queue_mode == MVPP2_QDIST_MULTI_MODE; + return (queue_mode == MVPP2_QDIST_MULTI_MODE) && + !(port->flags & MVPP2_F_LOOPBACK); } static int mvpp2_open(struct net_device *dev) @@ -5513,7 +5514,7 @@ static int mvpp2_ethtool_get_rxnfc(struct net_device *dev, struct mvpp2_port *port = netdev_priv(dev); int ret = 0, i, loc = 0; - if (!mvpp22_rss_is_supported()) + if (!mvpp22_rss_is_supported(port)) return -EOPNOTSUPP; switch (info->cmd) { @@ -5548,7 +5549,7 @@ static int mvpp2_ethtool_set_rxnfc(struct net_device *dev, struct mvpp2_port *port = netdev_priv(dev); int ret = 0; - if (!mvpp22_rss_is_supported()) + if (!mvpp22_rss_is_supported(port)) return -EOPNOTSUPP; switch (info->cmd) { @@ -5569,7 +5570,9 @@ static int mvpp2_ethtool_set_rxnfc(struct net_device *dev, static u32 mvpp2_ethtool_get_rxfh_indir_size(struct net_device *dev) { - return mvpp22_rss_is_supported() ? MVPP22_RSS_TABLE_ENTRIES : 0; + struct mvpp2_port *port = netdev_priv(dev); + + return mvpp22_rss_is_supported(port) ? MVPP22_RSS_TABLE_ENTRIES : 0; } static int mvpp2_ethtool_get_rxfh(struct net_device *dev, u32 *indir, u8 *key, @@ -5578,7 +5581,7 @@ static int mvpp2_ethtool_get_rxfh(struct net_device *dev, u32 *indir, u8 *key, struct mvpp2_port *port = netdev_priv(dev); int ret = 0; - if (!mvpp22_rss_is_supported()) + if (!mvpp22_rss_is_supported(port)) return -EOPNOTSUPP; if (indir) @@ -5596,7 +5599,7 @@ static int mvpp2_ethtool_set_rxfh(struct net_device *dev, const u32 *indir, struct mvpp2_port *port = netdev_priv(dev); int ret = 0; - if (!mvpp22_rss_is_supported()) + if (!mvpp22_rss_is_supported(port)) return -EOPNOTSUPP; if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_CRC32) @@ -5617,7 +5620,7 @@ static int mvpp2_ethtool_get_rxfh_context(struct net_device *dev, u32 *indir, struct mvpp2_port *port = netdev_priv(dev); int ret = 0; - if (!mvpp22_rss_is_supported()) + if (!mvpp22_rss_is_supported(port)) return -EOPNOTSUPP; if (rss_context >= MVPP22_N_RSS_TABLES) return -EINVAL; @@ -5639,7 +5642,7 @@ static int mvpp2_ethtool_set_rxfh_context(struct net_device *dev, struct mvpp2_port *port = netdev_priv(dev); int ret; - if (!mvpp22_rss_is_supported()) + if (!mvpp22_rss_is_supported(port)) return -EOPNOTSUPP; if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_CRC32) @@ -5956,7 +5959,7 @@ static int mvpp2_port_init(struct mvpp2_port *port) mvpp2_cls_oversize_rxq_set(port); mvpp2_cls_port_config(port); - if (mvpp22_rss_is_supported()) + if (mvpp22_rss_is_supported(port)) mvpp22_port_rss_init(port); /* Provide an initial Rx packet size */ @@ -6861,7 +6864,7 @@ static int mvpp2_port_probe(struct platform_device *pdev, dev->hw_features |= features | NETIF_F_RXCSUM | NETIF_F_GRO | NETIF_F_HW_VLAN_CTAG_FILTER; - if (mvpp22_rss_is_supported()) { + if (mvpp22_rss_is_supported(port)) { dev->hw_features |= NETIF_F_RXHASH; dev->features |= NETIF_F_NTUPLE; } From 4e9d9d1f4880ad358a8e5eb6ac4c811fd76dd617 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 19 Feb 2021 13:10:44 +0300 Subject: [PATCH 18/55] net: phy: icplus: call phy_restore_page() when phy_select_page() fails The comments to phy_select_page() say that "phy_restore_page() must always be called after this, irrespective of success or failure of this call." If we don't call phy_restore_page() then we are still holding the phy_lock_mdio_bus() so it eventually leads to a dead lock. Fixes: 32ab60e53920 ("net: phy: icplus: add MDI/MDIX support for IP101A/G") Fixes: f9bc51e6cce2 ("net: phy: icplus: fix paged register access") Signed-off-by: Dan Carpenter Reviewed-by: Michael Walle Reviewed-by: Russell King Link: https://lore.kernel.org/r/YC+OpFGsDPXPnXM5@mwanda Signed-off-by: Jakub Kicinski --- drivers/net/phy/icplus.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/net/phy/icplus.c b/drivers/net/phy/icplus.c index 3e431737c1ba..a00a667454a9 100644 --- a/drivers/net/phy/icplus.c +++ b/drivers/net/phy/icplus.c @@ -239,7 +239,7 @@ static int ip101a_g_config_intr_pin(struct phy_device *phydev) oldpage = phy_select_page(phydev, IP101G_DEFAULT_PAGE); if (oldpage < 0) - return oldpage; + goto out; /* configure the RXER/INTR_32 pin of the 32-pin IP101GR if needed: */ switch (priv->sel_intr32) { @@ -314,7 +314,7 @@ static int ip101a_g_read_status(struct phy_device *phydev) oldpage = phy_select_page(phydev, IP101G_DEFAULT_PAGE); if (oldpage < 0) - return oldpage; + goto out; ret = __phy_read(phydev, IP10XX_SPEC_CTRL_STATUS); if (ret < 0) @@ -349,7 +349,8 @@ out: static int ip101a_g_config_mdix(struct phy_device *phydev) { u16 ctrl = 0, ctrl2 = 0; - int oldpage, ret; + int oldpage; + int ret = 0; switch (phydev->mdix_ctrl) { case ETH_TP_MDI: @@ -367,7 +368,7 @@ static int ip101a_g_config_mdix(struct phy_device *phydev) oldpage = phy_select_page(phydev, IP101G_DEFAULT_PAGE); if (oldpage < 0) - return oldpage; + goto out; ret = __phy_modify(phydev, IP10XX_SPEC_CTRL_STATUS, IP101A_G_AUTO_MDIX_DIS, ctrl); From 94ead4caa0615f4b0719ffcb4dbd0907fe2f9265 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 19 Feb 2021 21:12:21 -0800 Subject: [PATCH 19/55] net: dsa: Fix dependencies with HSR The core DSA framework uses hsr_is_master() which would not resolve to a valid symbol if HSR is built-into the kernel and DSA is a module. Fixes: 18596f504a3e ("net: dsa: add support for offloading HSR") Reported-by: kernel test robot Signed-off-by: Florian Fainelli Reviewed-by: George McCollister Reviewed-by: Vladimir Oltean Tested-by: Vladimir Oltean Link: https://lore.kernel.org/r/20210220051222.15672-1-f.fainelli@gmail.com Signed-off-by: Jakub Kicinski --- net/dsa/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index a45572cfb71a..3589224c8da9 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -9,6 +9,7 @@ menuconfig NET_DSA tristate "Distributed Switch Architecture" depends on HAVE_NET_DSA depends on BRIDGE || BRIDGE=n + depends on HSR || HSR=n select GRO_CELLS select NET_SWITCHDEV select PHYLINK From 341c65242fe18aac8900e4291d472df9f7ba7bc7 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 19 Feb 2021 18:35:37 +0100 Subject: [PATCH 20/55] mptcp: fix DATA_FIN processing for orphaned sockets Currently we move orphaned msk sockets directly from FIN_WAIT2 state to CLOSE, with the rationale that incoming additional data could be just dropped by the TCP stack/TW sockets. Anyhow we miss sending MPTCP-level ack on incoming DATA_FIN, and that may hang the peers. Fixes: e16163b6e2b7 ("mptcp: refactor shutdown and close") Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index a57f3eab7b6a..b1075fc3d3b3 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2264,13 +2264,12 @@ static void mptcp_worker(struct work_struct *work) __mptcp_check_send_data_fin(sk); mptcp_check_data_fin(sk); - /* if the msk data is completely acked, or the socket timedout, - * there is no point in keeping around an orphaned sk + /* There is no point in keeping around an orphaned sk timedout or + * closed, but we need the msk around to reply to incoming DATA_FIN, + * even if it is orphaned and in FIN_WAIT2 state */ if (sock_flag(sk, SOCK_DEAD) && - (mptcp_check_close_timeout(sk) || - (state != sk->sk_state && - ((1 << inet_sk_state_load(sk)) & (TCPF_CLOSE | TCPF_FIN_WAIT2))))) { + (mptcp_check_close_timeout(sk) || sk->sk_state == TCP_CLOSE)) { inet_sk_state_store(sk, TCP_CLOSE); __mptcp_destroy_sock(sk); goto unlock; From d87903b63e3ce1eafaa701aec5cc1d0ecd0d84dc Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 19 Feb 2021 18:35:38 +0100 Subject: [PATCH 21/55] mptcp: fix DATA_FIN generation on early shutdown If the msk is closed before sending or receiving any data, no DATA_FIN is generated, instead an MPC ack packet is crafted out. In the above scenario, the MPTCP protocol creates and sends a pure ack and such packets matches also the criteria for an MPC ack and the protocol tries first to insert MPC options, leading to the described error. This change addresses the issue by avoiding the insertion of an MPC option for DATA_FIN packets or if the sub-flow is not established. To avoid doing multiple times the same test, fetch the data_fin flag in a bool variable and pass it to both the interested helpers. Fixes: 6d0060f600ad ("mptcp: Write MPTCP DSS headers to outgoing data packets") Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index b63574d6b812..444a38681e93 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -411,6 +411,7 @@ static void clear_3rdack_retransmission(struct sock *sk) } static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, + bool snd_data_fin_enable, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) @@ -428,9 +429,10 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, if (!skb) return false; - /* MPC/MPJ needed only on 3rd ack packet */ - if (subflow->fully_established || - subflow->snd_isn != TCP_SKB_CB(skb)->seq) + /* MPC/MPJ needed only on 3rd ack packet, DATA_FIN and TCP shutdown take precedence */ + if (subflow->fully_established || snd_data_fin_enable || + subflow->snd_isn != TCP_SKB_CB(skb)->seq || + sk->sk_state != TCP_ESTABLISHED) return false; if (subflow->mp_capable) { @@ -502,20 +504,20 @@ static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow, } static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, + bool snd_data_fin_enable, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); - u64 snd_data_fin_enable, ack_seq; unsigned int dss_size = 0; struct mptcp_ext *mpext; unsigned int ack_size; bool ret = false; + u64 ack_seq; mpext = skb ? mptcp_get_ext(skb) : NULL; - snd_data_fin_enable = mptcp_data_fin_enabled(msk); if (!skb || (mpext && mpext->use_map) || snd_data_fin_enable) { unsigned int map_size; @@ -717,12 +719,15 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); unsigned int opt_size = 0; + bool snd_data_fin; bool ret = false; opts->suboptions = 0; - if (unlikely(mptcp_check_fallback(sk))) + if (unlikely(__mptcp_check_fallback(msk))) return false; /* prevent adding of any MPTCP related options on reset packet @@ -731,10 +736,10 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) return false; - if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts)) + snd_data_fin = mptcp_data_fin_enabled(msk); + if (mptcp_established_options_mp(sk, skb, snd_data_fin, &opt_size, remaining, opts)) ret = true; - else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining, - opts)) + else if (mptcp_established_options_dss(sk, skb, snd_data_fin, &opt_size, remaining, opts)) ret = true; /* we reserved enough space for the above options, and exceeding the From ad98dd37051e14fa8c785609430d907fcfd518ba Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 19 Feb 2021 18:35:39 +0100 Subject: [PATCH 22/55] mptcp: provide subflow aware release function mptcp re-used inet(6)_release, so the subflow sockets are ignored. Need to invoke ip(v6)_mc_drop_socket function to ensure mcast join resources get free'd. Fixes: 717e79c867ca5 ("mptcp: Add setsockopt()/getsockopt() socket operations") Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/110 Acked-by: Paolo Abeni Signed-off-by: Florian Westphal Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 55 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 2 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index b1075fc3d3b3..c5d5e68940ea 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -19,6 +20,7 @@ #include #if IS_ENABLED(CONFIG_MPTCP_IPV6) #include +#include #endif #include #include @@ -3374,10 +3376,34 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, return mask; } +static int mptcp_release(struct socket *sock) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = sock->sk; + struct mptcp_sock *msk; + + if (!sk) + return 0; + + lock_sock(sk); + + msk = mptcp_sk(sk); + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + ip_mc_drop_socket(ssk); + } + + release_sock(sk); + + return inet_release(sock); +} + static const struct proto_ops mptcp_stream_ops = { .family = PF_INET, .owner = THIS_MODULE, - .release = inet_release, + .release = mptcp_release, .bind = mptcp_bind, .connect = mptcp_stream_connect, .socketpair = sock_no_socketpair, @@ -3469,10 +3495,35 @@ void __init mptcp_proto_init(void) } #if IS_ENABLED(CONFIG_MPTCP_IPV6) +static int mptcp6_release(struct socket *sock) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk; + struct sock *sk = sock->sk; + + if (!sk) + return 0; + + lock_sock(sk); + + msk = mptcp_sk(sk); + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + ip_mc_drop_socket(ssk); + ipv6_sock_mc_close(ssk); + ipv6_sock_ac_close(ssk); + } + + release_sock(sk); + return inet6_release(sock); +} + static const struct proto_ops mptcp_v6_stream_ops = { .family = PF_INET6, .owner = THIS_MODULE, - .release = inet6_release, + .release = mptcp6_release, .bind = mptcp_bind, .connect = mptcp_stream_connect, .socketpair = sock_no_socketpair, From 52557dbc7538ecceb27ef2206719a47a8039a335 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 19 Feb 2021 18:35:40 +0100 Subject: [PATCH 23/55] mptcp: do not wakeup listener for MPJ subflows MPJ subflows are not exposed as fds to user spaces. As such, incoming MPJ subflows are removed from the accept queue by tcp_check_req()/tcp_get_cookie_sock(). Later tcp_child_process() invokes subflow_data_ready() on the parent socket regardless of the subflow kind, leading to poll wakeups even if the later accept will block. Address the issue by double-checking the queue state before waking the user-space. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/164 Reported-by: Dr. David Alan Gilbert Fixes: f296234c98a8 ("mptcp: Add handling of incoming MP_JOIN requests") Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 06e233410e0e..e1fbcab257e6 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1096,6 +1096,12 @@ static void subflow_data_ready(struct sock *sk) msk = mptcp_sk(parent); if (state & TCPF_LISTEN) { + /* MPJ subflow are removed from accept queue before reaching here, + * avoid stray wakeups + */ + if (reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue)) + return; + set_bit(MPTCP_DATA_READY, &msk->flags); parent->sk_data_ready(parent); return; From 24877687b375f2c476ffb726ea915fc85df09e3d Mon Sep 17 00:00:00 2001 From: "Song, Yoong Siang" Date: Thu, 18 Feb 2021 21:40:53 +0800 Subject: [PATCH 24/55] net: stmmac: fix CBS idleslope and sendslope calculation When link speed is not 100 Mbps, port transmit rate and speed divider are set to 8 and 1000000 respectively. These values are incorrect for CBS idleslope and sendslope HW values calculation if the link speed is not 1 Gbps. This patch adds switch statement to set the values of port transmit rate and speed divider for 10 Gbps, 5 Gbps, 2.5 Gbps, 1 Gbps, and 100 Mbps. Note that CBS is not supported at 10 Mbps. Fixes: bc41a6689b30 ("net: stmmac: tc: Remove the speed dependency") Fixes: 1f705bc61aee ("net: stmmac: Add support for CBS QDISC") Signed-off-by: Song, Yoong Siang Link: https://lore.kernel.org/r/1613655653-11755-1-git-send-email-yoong.siang.song@intel.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/stmicro/stmmac/stmmac_tc.c | 30 ++++++++++++++++--- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c index 56985542e202..44bb133c3000 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c @@ -316,6 +316,32 @@ static int tc_setup_cbs(struct stmmac_priv *priv, if (!priv->dma_cap.av) return -EOPNOTSUPP; + /* Port Transmit Rate and Speed Divider */ + switch (priv->speed) { + case SPEED_10000: + ptr = 32; + speed_div = 10000000; + break; + case SPEED_5000: + ptr = 32; + speed_div = 5000000; + break; + case SPEED_2500: + ptr = 8; + speed_div = 2500000; + break; + case SPEED_1000: + ptr = 8; + speed_div = 1000000; + break; + case SPEED_100: + ptr = 4; + speed_div = 100000; + break; + default: + return -EOPNOTSUPP; + } + mode_to_use = priv->plat->tx_queues_cfg[queue].mode_to_use; if (mode_to_use == MTL_QUEUE_DCB && qopt->enable) { ret = stmmac_dma_qmode(priv, priv->ioaddr, queue, MTL_QUEUE_AVB); @@ -332,10 +358,6 @@ static int tc_setup_cbs(struct stmmac_priv *priv, priv->plat->tx_queues_cfg[queue].mode_to_use = MTL_QUEUE_DCB; } - /* Port Transmit Rate and Speed Divider */ - ptr = (priv->speed == SPEED_100) ? 4 : 8; - speed_div = (priv->speed == SPEED_100) ? 100000 : 1000000; - /* Final adjustments for HW */ value = div_s64(qopt->idleslope * 1024ll * ptr, speed_div); priv->plat->tx_queues_cfg[queue].idle_slope = value & GENMASK(31, 0); From 8eb65fda4a6dbd59cd5de24b106a10b6ee0d2176 Mon Sep 17 00:00:00 2001 From: Chuhong Yuan Date: Sun, 21 Feb 2021 22:35:59 +0800 Subject: [PATCH 25/55] net/mlx4_core: Add missed mlx4_free_cmd_mailbox() mlx4_do_mirror_rule() forgets to call mlx4_free_cmd_mailbox() to free the memory region allocated by mlx4_alloc_cmd_mailbox() before an exit. Add the missed call to fix it. Fixes: 78efed275117 ("net/mlx4_core: Support mirroring VF DMFS rules on both ports") Signed-off-by: Chuhong Yuan Reviewed-by: Tariq Toukan Link: https://lore.kernel.org/r/20210221143559.390277-1-hslester96@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx4/resource_tracker.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c index 394f43add85c..a99e71bc7b3c 100644 --- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c +++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c @@ -4986,6 +4986,7 @@ static int mlx4_do_mirror_rule(struct mlx4_dev *dev, struct res_fs_rule *fs_rule if (!fs_rule->mirr_mbox) { mlx4_err(dev, "rule mirroring mailbox is null\n"); + mlx4_free_cmd_mailbox(dev, mailbox); return -EINVAL; } memcpy(mailbox->buf, fs_rule->mirr_mbox, fs_rule->mirr_mbox_size); From ee576c47db60432c37e54b1e2b43a8ca6d3a8dca Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 23 Feb 2021 14:18:58 +0100 Subject: [PATCH 26/55] net: icmp: pass zeroed opts from icmp{,v6}_ndo_send before sending The icmp{,v6}_send functions make all sorts of use of skb->cb, casting it with IPCB or IP6CB, assuming the skb to have come directly from the inet layer. But when the packet comes from the ndo layer, especially when forwarded, there's no telling what might be in skb->cb at that point. As a result, the icmp sending code risks reading bogus memory contents, which can result in nasty stack overflows such as this one reported by a user: panic+0x108/0x2ea __stack_chk_fail+0x14/0x20 __icmp_send+0x5bd/0x5c0 icmp_ndo_send+0x148/0x160 In icmp_send, skb->cb is cast with IPCB and an ip_options struct is read from it. The optlen parameter there is of particular note, as it can induce writes beyond bounds. There are quite a few ways that can happen in __ip_options_echo. For example: // sptr/skb are attacker-controlled skb bytes sptr = skb_network_header(skb); // dptr/dopt points to stack memory allocated by __icmp_send dptr = dopt->__data; // sopt is the corrupt skb->cb in question if (sopt->rr) { optlen = sptr[sopt->rr+1]; // corrupt skb->cb + skb->data soffset = sptr[sopt->rr+2]; // corrupt skb->cb + skb->data // this now writes potentially attacker-controlled data, over // flowing the stack: memcpy(dptr, sptr+sopt->rr, optlen); } In the icmpv6_send case, the story is similar, but not as dire, as only IP6CB(skb)->iif and IP6CB(skb)->dsthao are used. The dsthao case is worse than the iif case, but it is passed to ipv6_find_tlv, which does a bit of bounds checking on the value. This is easy to simulate by doing a `memset(skb->cb, 0x41, sizeof(skb->cb));` before calling icmp{,v6}_ndo_send, and it's only by good fortune and the rarity of icmp sending from that context that we've avoided reports like this until now. For example, in KASAN: BUG: KASAN: stack-out-of-bounds in __ip_options_echo+0xa0e/0x12b0 Write of size 38 at addr ffff888006f1f80e by task ping/89 CPU: 2 PID: 89 Comm: ping Not tainted 5.10.0-rc7-debug+ #5 Call Trace: dump_stack+0x9a/0xcc print_address_description.constprop.0+0x1a/0x160 __kasan_report.cold+0x20/0x38 kasan_report+0x32/0x40 check_memory_region+0x145/0x1a0 memcpy+0x39/0x60 __ip_options_echo+0xa0e/0x12b0 __icmp_send+0x744/0x1700 Actually, out of the 4 drivers that do this, only gtp zeroed the cb for the v4 case, while the rest did not. So this commit actually removes the gtp-specific zeroing, while putting the code where it belongs in the shared infrastructure of icmp{,v6}_ndo_send. This commit fixes the issue by passing an empty IPCB or IP6CB along to the functions that actually do the work. For the icmp_send, this was already trivial, thanks to __icmp_send providing the plumbing function. For icmpv6_send, this required a tiny bit of refactoring to make it behave like the v4 case, after which it was straight forward. Fixes: a2b78e9b2cac ("sunvnet: generate ICMP PTMUD messages for smaller port MTUs") Reported-by: SinYu Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/netdev/CAF=yD-LOF116aHub6RMe8vB8ZpnrrnoTdqhobEx+bvoA8AsP0w@mail.gmail.com/T/ Signed-off-by: Jason A. Donenfeld Link: https://lore.kernel.org/r/20210223131858.72082-1-Jason@zx2c4.com Signed-off-by: Jakub Kicinski --- drivers/net/gtp.c | 1 - include/linux/icmpv6.h | 26 ++++++++++++++++++++------ include/linux/ipv6.h | 1 - include/net/icmp.h | 6 +++++- net/ipv4/icmp.c | 5 +++-- net/ipv6/icmp.c | 18 +++++++++--------- net/ipv6/ip6_icmp.c | 12 +++++++----- 7 files changed, 44 insertions(+), 25 deletions(-) diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 9a70f05baf6e..39c00f050fbd 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -543,7 +543,6 @@ static int gtp_build_skb_ip4(struct sk_buff *skb, struct net_device *dev, if (!skb_is_gso(skb) && (iph->frag_off & htons(IP_DF)) && mtu < ntohs(iph->tot_len)) { netdev_dbg(dev, "packet too big, fragmentation needed\n"); - memset(IPCB(skb), 0, sizeof(*IPCB(skb))); icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); goto err_rt; diff --git a/include/linux/icmpv6.h b/include/linux/icmpv6.h index 452d8978ffc7..9055cb380ee2 100644 --- a/include/linux/icmpv6.h +++ b/include/linux/icmpv6.h @@ -3,6 +3,7 @@ #define _LINUX_ICMPV6_H #include +#include #include static inline struct icmp6hdr *icmp6_hdr(const struct sk_buff *skb) @@ -15,13 +16,16 @@ static inline struct icmp6hdr *icmp6_hdr(const struct sk_buff *skb) #if IS_ENABLED(CONFIG_IPV6) typedef void ip6_icmp_send_t(struct sk_buff *skb, u8 type, u8 code, __u32 info, - const struct in6_addr *force_saddr); + const struct in6_addr *force_saddr, + const struct inet6_skb_parm *parm); void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, - const struct in6_addr *force_saddr); + const struct in6_addr *force_saddr, + const struct inet6_skb_parm *parm); #if IS_BUILTIN(CONFIG_IPV6) -static inline void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) +static inline void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, + const struct inet6_skb_parm *parm) { - icmp6_send(skb, type, code, info, NULL); + icmp6_send(skb, type, code, info, NULL, parm); } static inline int inet6_register_icmp_sender(ip6_icmp_send_t *fn) { @@ -34,18 +38,28 @@ static inline int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn) return 0; } #else -extern void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info); +extern void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, + const struct inet6_skb_parm *parm); extern int inet6_register_icmp_sender(ip6_icmp_send_t *fn); extern int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn); #endif +static inline void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) +{ + __icmpv6_send(skb, type, code, info, IP6CB(skb)); +} + int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type, unsigned int data_len); #if IS_ENABLED(CONFIG_NF_NAT) void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info); #else -#define icmpv6_ndo_send icmpv6_send +static inline void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info) +{ + struct inet6_skb_parm parm = { 0 }; + __icmpv6_send(skb_in, type, code, info, &parm); +} #endif #else diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 9d1f29f0c512..70b2ad3b9884 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -85,7 +85,6 @@ struct ipv6_params { __s32 autoconf; }; extern struct ipv6_params ipv6_defaults; -#include #include #include diff --git a/include/net/icmp.h b/include/net/icmp.h index 9ac2d2672a93..fd84adc47963 100644 --- a/include/net/icmp.h +++ b/include/net/icmp.h @@ -46,7 +46,11 @@ static inline void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 #if IS_ENABLED(CONFIG_NF_NAT) void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info); #else -#define icmp_ndo_send icmp_send +static inline void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info) +{ + struct ip_options opts = { 0 }; + __icmp_send(skb_in, type, code, info, &opts); +} #endif int icmp_rcv(struct sk_buff *skb); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 396b492c804f..616e2dc1c8fa 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -775,13 +775,14 @@ EXPORT_SYMBOL(__icmp_send); void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info) { struct sk_buff *cloned_skb = NULL; + struct ip_options opts = { 0 }; enum ip_conntrack_info ctinfo; struct nf_conn *ct; __be32 orig_ip; ct = nf_ct_get(skb_in, &ctinfo); if (!ct || !(ct->status & IPS_SRC_NAT)) { - icmp_send(skb_in, type, code, info); + __icmp_send(skb_in, type, code, info, &opts); return; } @@ -796,7 +797,7 @@ void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info) orig_ip = ip_hdr(skb_in)->saddr; ip_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.ip; - icmp_send(skb_in, type, code, info); + __icmp_send(skb_in, type, code, info, &opts); ip_hdr(skb_in)->saddr = orig_ip; out: consume_skb(cloned_skb); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index f3d05866692e..fd1f896115c1 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -331,10 +331,9 @@ static int icmpv6_getfrag(void *from, char *to, int offset, int len, int odd, st } #if IS_ENABLED(CONFIG_IPV6_MIP6) -static void mip6_addr_swap(struct sk_buff *skb) +static void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt) { struct ipv6hdr *iph = ipv6_hdr(skb); - struct inet6_skb_parm *opt = IP6CB(skb); struct ipv6_destopt_hao *hao; struct in6_addr tmp; int off; @@ -351,7 +350,7 @@ static void mip6_addr_swap(struct sk_buff *skb) } } #else -static inline void mip6_addr_swap(struct sk_buff *skb) {} +static inline void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt) {} #endif static struct dst_entry *icmpv6_route_lookup(struct net *net, @@ -446,7 +445,8 @@ static int icmp6_iif(const struct sk_buff *skb) * Send an ICMP message in response to a packet in error */ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, - const struct in6_addr *force_saddr) + const struct in6_addr *force_saddr, + const struct inet6_skb_parm *parm) { struct inet6_dev *idev = NULL; struct ipv6hdr *hdr = ipv6_hdr(skb); @@ -542,7 +542,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, if (!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, type)) goto out_bh_enable; - mip6_addr_swap(skb); + mip6_addr_swap(skb, parm); sk = icmpv6_xmit_lock(net); if (!sk) @@ -559,7 +559,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, /* select a more meaningful saddr from input if */ struct net_device *in_netdev; - in_netdev = dev_get_by_index(net, IP6CB(skb)->iif); + in_netdev = dev_get_by_index(net, parm->iif); if (in_netdev) { ipv6_dev_get_saddr(net, in_netdev, &fl6.daddr, inet6_sk(sk)->srcprefs, @@ -640,7 +640,7 @@ EXPORT_SYMBOL(icmp6_send); */ void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos) { - icmp6_send(skb, ICMPV6_PARAMPROB, code, pos, NULL); + icmp6_send(skb, ICMPV6_PARAMPROB, code, pos, NULL, IP6CB(skb)); kfree_skb(skb); } @@ -697,10 +697,10 @@ int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type, } if (type == ICMP_TIME_EXCEEDED) icmp6_send(skb2, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, - info, &temp_saddr); + info, &temp_saddr, IP6CB(skb2)); else icmp6_send(skb2, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, - info, &temp_saddr); + info, &temp_saddr, IP6CB(skb2)); if (rt) ip6_rt_put(rt); diff --git a/net/ipv6/ip6_icmp.c b/net/ipv6/ip6_icmp.c index 70c8c2f36c98..9e3574880cb0 100644 --- a/net/ipv6/ip6_icmp.c +++ b/net/ipv6/ip6_icmp.c @@ -33,23 +33,25 @@ int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn) } EXPORT_SYMBOL(inet6_unregister_icmp_sender); -void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) +void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, + const struct inet6_skb_parm *parm) { ip6_icmp_send_t *send; rcu_read_lock(); send = rcu_dereference(ip6_icmp_send); if (send) - send(skb, type, code, info, NULL); + send(skb, type, code, info, NULL, parm); rcu_read_unlock(); } -EXPORT_SYMBOL(icmpv6_send); +EXPORT_SYMBOL(__icmpv6_send); #endif #if IS_ENABLED(CONFIG_NF_NAT) #include void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info) { + struct inet6_skb_parm parm = { 0 }; struct sk_buff *cloned_skb = NULL; enum ip_conntrack_info ctinfo; struct in6_addr orig_ip; @@ -57,7 +59,7 @@ void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info) ct = nf_ct_get(skb_in, &ctinfo); if (!ct || !(ct->status & IPS_SRC_NAT)) { - icmpv6_send(skb_in, type, code, info); + __icmpv6_send(skb_in, type, code, info, &parm); return; } @@ -72,7 +74,7 @@ void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info) orig_ip = ipv6_hdr(skb_in)->saddr; ipv6_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.in6; - icmpv6_send(skb_in, type, code, info); + __icmpv6_send(skb_in, type, code, info, &parm); ipv6_hdr(skb_in)->saddr = orig_ip; out: consume_skb(cloned_skb); From 9bc1ef64aeb6f7dae17e98f912213266738ddcfe Mon Sep 17 00:00:00 2001 From: Sieng Piaw Liew Date: Mon, 22 Feb 2021 09:35:30 +0800 Subject: [PATCH 27/55] bcm63xx_enet: fix sporadic kernel panic In ndo_stop functions, netdev_completed_queue() is called during forced tx reclaim, after netdev_reset_queue(). This may trigger kernel panic if there is any tx skb left. This patch moves netdev_reset_queue() to after tx reclaim, so BQL can complete successfully then reset. Signed-off-by: Sieng Piaw Liew Fixes: 4c59b0f5543d ("bcm63xx_enet: add BQL support") Acked-by: Florian Fainelli Link: https://lore.kernel.org/r/20210222013530.1356-1-liew.s.piaw@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bcm63xx_enet.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bcm63xx_enet.c b/drivers/net/ethernet/broadcom/bcm63xx_enet.c index fd8767213165..977f097fc7bf 100644 --- a/drivers/net/ethernet/broadcom/bcm63xx_enet.c +++ b/drivers/net/ethernet/broadcom/bcm63xx_enet.c @@ -1192,7 +1192,6 @@ static int bcm_enet_stop(struct net_device *dev) kdev = &priv->pdev->dev; netif_stop_queue(dev); - netdev_reset_queue(dev); napi_disable(&priv->napi); if (priv->has_phy) phy_stop(dev->phydev); @@ -1231,6 +1230,9 @@ static int bcm_enet_stop(struct net_device *dev) if (priv->has_phy) phy_disconnect(dev->phydev); + /* reset BQL after forced tx reclaim to prevent kernel panic */ + netdev_reset_queue(dev); + return 0; } @@ -2343,7 +2345,6 @@ static int bcm_enetsw_stop(struct net_device *dev) del_timer_sync(&priv->swphy_poll); netif_stop_queue(dev); - netdev_reset_queue(dev); napi_disable(&priv->napi); del_timer_sync(&priv->rx_timeout); @@ -2371,6 +2372,9 @@ static int bcm_enetsw_stop(struct net_device *dev) free_irq(priv->irq_tx, dev); free_irq(priv->irq_rx, dev); + /* reset BQL after forced tx reclaim to prevent kernel panic */ + netdev_reset_queue(dev); + return 0; } From 18755e270666ce869289bceb734d25eae2be9da9 Mon Sep 17 00:00:00 2001 From: Krzysztof Halasa Date: Thu, 18 Feb 2021 13:34:42 +0100 Subject: [PATCH 28/55] Marvell Sky2 Ethernet adapter: fix warning messages. sky2.c driver uses netdev_warn() before the net device is initialized. Fix it by using dev_warn() instead. Signed-off-by: Krzysztof Halasa Link: https://lore.kernel.org/r/m3a6s1r1ul.fsf@t19.piap.pl Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/sky2.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c index ebe1406c6e64..dbec8e187a68 100644 --- a/drivers/net/ethernet/marvell/sky2.c +++ b/drivers/net/ethernet/marvell/sky2.c @@ -4806,12 +4806,11 @@ static struct net_device *sky2_init_netdev(struct sky2_hw *hw, unsigned port, if (!is_valid_ether_addr(dev->dev_addr)) { struct sockaddr sa = { AF_UNSPEC }; - netdev_warn(dev, - "Invalid MAC address, defaulting to random\n"); + dev_warn(&hw->pdev->dev, "Invalid MAC address, defaulting to random\n"); eth_hw_addr_random(dev); memcpy(sa.sa_data, dev->dev_addr, ETH_ALEN); if (sky2_set_mac_address(dev, &sa)) - netdev_warn(dev, "Failed to set MAC address.\n"); + dev_warn(&hw->pdev->dev, "Failed to set MAC address.\n"); } return dev; From e6dd86ed27d1a56bd45c50f6cc238a94c283e8e2 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 22 Feb 2021 14:30:09 -0800 Subject: [PATCH 29/55] net: dsa: bcm_sf2: Wire-up br_flags_pre, br_flags and set_mrouter Because bcm_sf2 implements its own dsa_switch_ops we need to export the b53_br_flags_pre(), b53_br_flags() and b53_set_mrouter so we can wire-up them up like they used to be with the former b53_br_egress_floods(). Fixes: a8b659e7ff75 ("net: dsa: act as passthrough for bridge port flags") Signed-off-by: Florian Fainelli Reviewed-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- drivers/net/dsa/b53/b53_common.c | 19 +++++++++++-------- drivers/net/dsa/b53/b53_priv.h | 8 ++++++++ drivers/net/dsa/bcm_sf2.c | 3 +++ 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index ae86ded1e2a1..fceca3f5b6a5 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1953,19 +1953,20 @@ void b53_br_fast_age(struct dsa_switch *ds, int port) } EXPORT_SYMBOL(b53_br_fast_age); -static int b53_br_flags_pre(struct dsa_switch *ds, int port, - struct switchdev_brport_flags flags, - struct netlink_ext_ack *extack) +int b53_br_flags_pre(struct dsa_switch *ds, int port, + struct switchdev_brport_flags flags, + struct netlink_ext_ack *extack) { if (flags.mask & ~(BR_FLOOD | BR_MCAST_FLOOD)) return -EINVAL; return 0; } +EXPORT_SYMBOL(b53_br_flags_pre); -static int b53_br_flags(struct dsa_switch *ds, int port, - struct switchdev_brport_flags flags, - struct netlink_ext_ack *extack) +int b53_br_flags(struct dsa_switch *ds, int port, + struct switchdev_brport_flags flags, + struct netlink_ext_ack *extack) { if (flags.mask & BR_FLOOD) b53_port_set_ucast_flood(ds->priv, port, @@ -1976,14 +1977,16 @@ static int b53_br_flags(struct dsa_switch *ds, int port, return 0; } +EXPORT_SYMBOL(b53_br_flags); -static int b53_set_mrouter(struct dsa_switch *ds, int port, bool mrouter, - struct netlink_ext_ack *extack) +int b53_set_mrouter(struct dsa_switch *ds, int port, bool mrouter, + struct netlink_ext_ack *extack) { b53_port_set_mcast_flood(ds->priv, port, mrouter); return 0; } +EXPORT_SYMBOL(b53_set_mrouter); static bool b53_possible_cpu_port(struct dsa_switch *ds, int port) { diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h index faf983fbca82..8419bb7f4505 100644 --- a/drivers/net/dsa/b53/b53_priv.h +++ b/drivers/net/dsa/b53/b53_priv.h @@ -326,6 +326,14 @@ int b53_br_join(struct dsa_switch *ds, int port, struct net_device *bridge); void b53_br_leave(struct dsa_switch *ds, int port, struct net_device *bridge); void b53_br_set_stp_state(struct dsa_switch *ds, int port, u8 state); void b53_br_fast_age(struct dsa_switch *ds, int port); +int b53_br_flags_pre(struct dsa_switch *ds, int port, + struct switchdev_brport_flags flags, + struct netlink_ext_ack *extack); +int b53_br_flags(struct dsa_switch *ds, int port, + struct switchdev_brport_flags flags, + struct netlink_ext_ack *extack); +int b53_set_mrouter(struct dsa_switch *ds, int port, bool mrouter, + struct netlink_ext_ack *extack); int b53_setup_devlink_resources(struct dsa_switch *ds); void b53_port_event(struct dsa_switch *ds, int port); void b53_phylink_validate(struct dsa_switch *ds, int port, diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 1857aa9aa84a..3eaedbb12815 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -1117,7 +1117,10 @@ static const struct dsa_switch_ops bcm_sf2_ops = { .set_mac_eee = b53_set_mac_eee, .port_bridge_join = b53_br_join, .port_bridge_leave = b53_br_leave, + .port_pre_bridge_flags = b53_br_flags_pre, + .port_bridge_flags = b53_br_flags, .port_stp_state_set = b53_br_set_stp_state, + .port_set_mrouter = b53_set_mrouter, .port_fast_age = b53_br_fast_age, .port_vlan_filtering = b53_vlan_filtering, .port_vlan_add = b53_vlan_add, From f9b3827ee66cfcf297d0acd6ecf33653a5f297ef Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 22 Feb 2021 14:30:10 -0800 Subject: [PATCH 30/55] net: dsa: b53: Support setting learning on port Add support for being able to set the learning attribute on port, and make sure that the standalone ports start up with learning disabled. We can remove the code in bcm_sf2 that configured the ports learning attribute because we want the standalone ports to have learning disabled by default and port 7 cannot be bridged, so its learning attribute will not change past its initial configuration. Signed-off-by: Florian Fainelli Reviewed-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- drivers/net/dsa/b53/b53_common.c | 20 +++++++++++++++++++- drivers/net/dsa/b53/b53_regs.h | 1 + drivers/net/dsa/bcm_sf2.c | 15 +-------------- 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index fceca3f5b6a5..a162499bcafc 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -543,6 +543,19 @@ static void b53_port_set_mcast_flood(struct b53_device *dev, int port, b53_write16(dev, B53_CTRL_PAGE, B53_IPMC_FLOOD_MASK, mc); } +static void b53_port_set_learning(struct b53_device *dev, int port, + bool learning) +{ + u16 reg; + + b53_read16(dev, B53_CTRL_PAGE, B53_DIS_LEARNING, ®); + if (learning) + reg &= ~BIT(port); + else + reg |= BIT(port); + b53_write16(dev, B53_CTRL_PAGE, B53_DIS_LEARNING, reg); +} + int b53_enable_port(struct dsa_switch *ds, int port, struct phy_device *phy) { struct b53_device *dev = ds->priv; @@ -557,6 +570,7 @@ int b53_enable_port(struct dsa_switch *ds, int port, struct phy_device *phy) b53_port_set_ucast_flood(dev, port, true); b53_port_set_mcast_flood(dev, port, true); + b53_port_set_learning(dev, port, false); if (dev->ops->irq_enable) ret = dev->ops->irq_enable(dev, port); @@ -691,6 +705,7 @@ static void b53_enable_cpu_port(struct b53_device *dev, int port) b53_port_set_ucast_flood(dev, port, true); b53_port_set_mcast_flood(dev, port, true); + b53_port_set_learning(dev, port, false); } static void b53_enable_mib(struct b53_device *dev) @@ -1957,7 +1972,7 @@ int b53_br_flags_pre(struct dsa_switch *ds, int port, struct switchdev_brport_flags flags, struct netlink_ext_ack *extack) { - if (flags.mask & ~(BR_FLOOD | BR_MCAST_FLOOD)) + if (flags.mask & ~(BR_FLOOD | BR_MCAST_FLOOD | BR_LEARNING)) return -EINVAL; return 0; @@ -1974,6 +1989,9 @@ int b53_br_flags(struct dsa_switch *ds, int port, if (flags.mask & BR_MCAST_FLOOD) b53_port_set_mcast_flood(ds->priv, port, !!(flags.val & BR_MCAST_FLOOD)); + if (flags.mask & BR_LEARNING) + b53_port_set_learning(ds->priv, port, + !!(flags.val & BR_LEARNING)); return 0; } diff --git a/drivers/net/dsa/b53/b53_regs.h b/drivers/net/dsa/b53/b53_regs.h index c90985c294a2..b2c539a42154 100644 --- a/drivers/net/dsa/b53/b53_regs.h +++ b/drivers/net/dsa/b53/b53_regs.h @@ -115,6 +115,7 @@ #define B53_UC_FLOOD_MASK 0x32 #define B53_MC_FLOOD_MASK 0x34 #define B53_IPMC_FLOOD_MASK 0x36 +#define B53_DIS_LEARNING 0x3c /* * Override Ports 0-7 State on devices with xMII interfaces (8 bit) diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 3eaedbb12815..5ee8103b8e9c 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -223,23 +223,10 @@ static int bcm_sf2_port_setup(struct dsa_switch *ds, int port, reg &= ~P_TXQ_PSM_VDD(port); core_writel(priv, reg, CORE_MEM_PSM_VDD_CTRL); - /* Enable learning */ - reg = core_readl(priv, CORE_DIS_LEARN); - reg &= ~BIT(port); - core_writel(priv, reg, CORE_DIS_LEARN); - /* Enable Broadcom tags for that port if requested */ - if (priv->brcm_tag_mask & BIT(port)) { + if (priv->brcm_tag_mask & BIT(port)) b53_brcm_hdr_setup(ds, port); - /* Disable learning on ASP port */ - if (port == 7) { - reg = core_readl(priv, CORE_DIS_LEARN); - reg |= BIT(port); - core_writel(priv, reg, CORE_DIS_LEARN); - } - } - /* Configure Traffic Class to QoS mapping, allow each priority to map * to a different queue number */ From 3aed8b63336c3f81a4fd72808dcf6197fabbbdb2 Mon Sep 17 00:00:00 2001 From: wenxu Date: Tue, 23 Feb 2021 15:11:55 +0800 Subject: [PATCH 31/55] net/sched: cls_flower: validate ct_state for invalid and reply flags Add invalid and reply flags validate in the fl_validate_ct_state. This makes the checking complete if compared to ovs' validate_ct_state(). Signed-off-by: wenxu Reviewed-by: Marcelo Ricardo Leitner Link: https://lore.kernel.org/r/1614064315-364-1-git-send-email-wenxu@ucloud.cn Signed-off-by: Jakub Kicinski --- net/sched/cls_flower.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 2409e522c68f..d097b5c15faa 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1417,6 +1417,21 @@ static int fl_validate_ct_state(u16 state, struct nlattr *tb, return -EINVAL; } + if (state & TCA_FLOWER_KEY_CT_FLAGS_INVALID && + state & ~(TCA_FLOWER_KEY_CT_FLAGS_TRACKED | + TCA_FLOWER_KEY_CT_FLAGS_INVALID)) { + NL_SET_ERR_MSG_ATTR(extack, tb, + "when inv is set, only trk may be set"); + return -EINVAL; + } + + if (state & TCA_FLOWER_KEY_CT_FLAGS_NEW && + state & TCA_FLOWER_KEY_CT_FLAGS_REPLY) { + NL_SET_ERR_MSG_ATTR(extack, tb, + "new and rpl are mutually exclusive"); + return -EINVAL; + } + return 0; } From 7a0ae61acde2cebd69665837170405eced86a6c7 Mon Sep 17 00:00:00 2001 From: Hayes Wang Date: Fri, 19 Feb 2021 17:04:40 +0800 Subject: [PATCH 32/55] r8152: enable U1/U2 for USB_SPEED_SUPER U1/U2 shoued be enabled for USB 3.0 or later. The USB 2.0 doesn't support it. Signed-off-by: Hayes Wang Signed-off-by: Jakub Kicinski --- drivers/net/usb/r8152.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 2d7cc63bef89..4bfee289aa6f 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -3360,7 +3360,7 @@ static void rtl8153b_runtime_enable(struct r8152 *tp, bool enable) r8153b_ups_en(tp, false); r8153_queue_wake(tp, false); rtl_runtime_suspend_enable(tp, false); - if (tp->udev->speed != USB_SPEED_HIGH) + if (tp->udev->speed >= USB_SPEED_SUPER) r8153b_u1u2en(tp, true); } } @@ -5056,7 +5056,7 @@ static void rtl8153b_up(struct r8152 *tp) r8153_aldps_en(tp, true); - if (tp->udev->speed != USB_SPEED_HIGH) + if (tp->udev->speed >= USB_SPEED_SUPER) r8153b_u1u2en(tp, true); } @@ -5572,8 +5572,9 @@ static void r8153b_init(struct r8152 *tp) ocp_data |= POLL_LINK_CHG; ocp_write_word(tp, MCU_TYPE_PLA, PLA_EXTRA_STATUS, ocp_data); - if (tp->udev->speed != USB_SPEED_HIGH) + if (tp->udev->speed >= USB_SPEED_SUPER) r8153b_u1u2en(tp, true); + usb_enable_lpm(tp->udev); /* MAC clock speed down */ From c79515e47935c747282c6ed2ee5b2ef039756eeb Mon Sep 17 00:00:00 2001 From: Hayes Wang Date: Fri, 19 Feb 2021 17:04:41 +0800 Subject: [PATCH 33/55] r8152: check if the pointer of the function exists Return error code if autosuspend_en, eee_get, or eee_set don't exist. Signed-off-by: Hayes Wang Signed-off-by: Jakub Kicinski --- drivers/net/usb/r8152.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 4bfee289aa6f..baa63ea2590a 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -5757,6 +5757,9 @@ static int rtl8152_runtime_suspend(struct r8152 *tp) struct net_device *netdev = tp->netdev; int ret = 0; + if (!tp->rtl_ops.autosuspend_en) + return -EBUSY; + set_bit(SELECTIVE_SUSPEND, &tp->flags); smp_mb__after_atomic(); @@ -6156,6 +6159,11 @@ rtl_ethtool_get_eee(struct net_device *net, struct ethtool_eee *edata) struct r8152 *tp = netdev_priv(net); int ret; + if (!tp->rtl_ops.eee_get) { + ret = -EOPNOTSUPP; + goto out; + } + ret = usb_autopm_get_interface(tp->intf); if (ret < 0) goto out; @@ -6178,6 +6186,11 @@ rtl_ethtool_set_eee(struct net_device *net, struct ethtool_eee *edata) struct r8152 *tp = netdev_priv(net); int ret; + if (!tp->rtl_ops.eee_set) { + ret = -EOPNOTSUPP; + goto out; + } + ret = usb_autopm_get_interface(tp->intf); if (ret < 0) goto out; From 156c3207611262266f0eea589ac3f00c5657320e Mon Sep 17 00:00:00 2001 From: Hayes Wang Date: Fri, 19 Feb 2021 17:04:42 +0800 Subject: [PATCH 34/55] r8152: replace netif_err with dev_err Some messages are before calling register_netdev(), so replace netif_err() with dev_err(). Signed-off-by: Hayes Wang Signed-off-by: Jakub Kicinski --- drivers/net/usb/r8152.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index baa63ea2590a..82a129264e31 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -6590,7 +6590,7 @@ static int rtl_ops_init(struct r8152 *tp) default: ret = -ENODEV; - netif_err(tp, probe, tp->netdev, "Unknown Device\n"); + dev_err(&tp->intf->dev, "Unknown Device\n"); break; } @@ -6847,7 +6847,7 @@ static int rtl8152_probe(struct usb_interface *intf, ret = register_netdev(netdev); if (ret != 0) { - netif_err(tp, probe, netdev, "couldn't register the device\n"); + dev_err(&intf->dev, "couldn't register the device\n"); goto out1; } From 40fa7568ac230446d888b7ad402cff9e20fe3ad5 Mon Sep 17 00:00:00 2001 From: Hayes Wang Date: Fri, 19 Feb 2021 17:04:43 +0800 Subject: [PATCH 35/55] r8152: spilt rtl_set_eee_plus and r8153b_green_en Add rtl_eee_plus_en() and rtl_green_en(). Signed-off-by: Hayes Wang Signed-off-by: Jakub Kicinski --- drivers/net/usb/r8152.c | 43 ++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 82a129264e31..b246817f3405 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -2632,21 +2632,24 @@ static inline u8 rtl8152_get_speed(struct r8152 *tp) return ocp_read_byte(tp, MCU_TYPE_PLA, PLA_PHYSTATUS); } -static void rtl_set_eee_plus(struct r8152 *tp) +static void rtl_eee_plus_en(struct r8152 *tp, bool enable) { u32 ocp_data; - u8 speed; - speed = rtl8152_get_speed(tp); - if (speed & _10bps) { - ocp_data = ocp_read_word(tp, MCU_TYPE_PLA, PLA_EEEP_CR); + ocp_data = ocp_read_word(tp, MCU_TYPE_PLA, PLA_EEEP_CR); + if (enable) ocp_data |= EEEP_CR_EEEP_TX; - ocp_write_word(tp, MCU_TYPE_PLA, PLA_EEEP_CR, ocp_data); - } else { - ocp_data = ocp_read_word(tp, MCU_TYPE_PLA, PLA_EEEP_CR); + else ocp_data &= ~EEEP_CR_EEEP_TX; - ocp_write_word(tp, MCU_TYPE_PLA, PLA_EEEP_CR, ocp_data); - } + ocp_write_word(tp, MCU_TYPE_PLA, PLA_EEEP_CR, ocp_data); +} + +static void rtl_set_eee_plus(struct r8152 *tp) +{ + if (rtl8152_get_speed(tp) & _10bps) + rtl_eee_plus_en(tp, true); + else + rtl_eee_plus_en(tp, false); } static void rxdy_gated_en(struct r8152 *tp, bool enable) @@ -3150,10 +3153,22 @@ static void r8153b_ups_flags(struct r8152 *tp) ocp_write_dword(tp, MCU_TYPE_USB, USB_UPS_FLAGS, ups_flags); } -static void r8153b_green_en(struct r8152 *tp, bool enable) +static void rtl_green_en(struct r8152 *tp, bool enable) { u16 data; + data = sram_read(tp, SRAM_GREEN_CFG); + if (enable) + data |= GREEN_ETH_EN; + else + data &= ~GREEN_ETH_EN; + sram_write(tp, SRAM_GREEN_CFG, data); + + tp->ups_info.green = enable; +} + +static void r8153b_green_en(struct r8152 *tp, bool enable) +{ if (enable) { sram_write(tp, 0x8045, 0); /* 10M abiq&ldvbias */ sram_write(tp, 0x804d, 0x1222); /* 100M short abiq&ldvbias */ @@ -3164,11 +3179,7 @@ static void r8153b_green_en(struct r8152 *tp, bool enable) sram_write(tp, 0x805d, 0x2444); /* 1000M short abiq&ldvbias */ } - data = sram_read(tp, SRAM_GREEN_CFG); - data |= GREEN_ETH_EN; - sram_write(tp, SRAM_GREEN_CFG, data); - - tp->ups_info.green = enable; + rtl_green_en(tp, true); } static u16 r8153_phy_status(struct r8152 *tp, u16 desired) From 92584ddf550ae72d492858c19d1f9025e07a9350 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 21 Feb 2021 15:45:52 +0000 Subject: [PATCH 36/55] vxlan: move debug check after netdev unregister The debug check must be done after unregister_netdevice_many() call -- the hlist_del_rcu() for this is done inside .ndo_stop. This is the same with commit 0fda7600c2e1 ("geneve: move debug check after netdev unregister") Test commands: ip netns del A ip netns add A ip netns add B ip netns exec B ip link add vxlan0 type vxlan vni 100 local 10.0.0.1 \ remote 10.0.0.2 dstport 4789 srcport 4789 4789 ip netns exec B ip link set vxlan0 netns A ip netns exec A ip link set vxlan0 up ip netns del B Splat looks like: [ 73.176249][ T7] ------------[ cut here ]------------ [ 73.178662][ T7] WARNING: CPU: 4 PID: 7 at drivers/net/vxlan.c:4743 vxlan_exit_batch_net+0x52e/0x720 [vxlan] [ 73.182597][ T7] Modules linked in: vxlan openvswitch nsh nf_conncount nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 mlx5_core nfp mlxfw ixgbevf tls sch_fq_codel nf_tables nfnetlink ip_tables x_tables unix [ 73.190113][ T7] CPU: 4 PID: 7 Comm: kworker/u16:0 Not tainted 5.11.0-rc7+ #838 [ 73.193037][ T7] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 [ 73.196986][ T7] Workqueue: netns cleanup_net [ 73.198946][ T7] RIP: 0010:vxlan_exit_batch_net+0x52e/0x720 [vxlan] [ 73.201509][ T7] Code: 00 01 00 00 0f 84 39 fd ff ff 48 89 ca 48 c1 ea 03 80 3c 1a 00 0f 85 a6 00 00 00 89 c2 48 83 c2 02 49 8b 14 d4 48 85 d2 74 ce <0f> 0b eb ca e8 b9 51 db dd 84 c0 0f 85 4a fe ff ff 48 c7 c2 80 bc [ 73.208813][ T7] RSP: 0018:ffff888100907c10 EFLAGS: 00010286 [ 73.211027][ T7] RAX: 000000000000003c RBX: dffffc0000000000 RCX: ffff88800ec411f0 [ 73.213702][ T7] RDX: ffff88800a278000 RSI: ffff88800fc78c70 RDI: ffff88800fc78070 [ 73.216169][ T7] RBP: ffff88800b5cbdc0 R08: fffffbfff424de61 R09: fffffbfff424de61 [ 73.218463][ T7] R10: ffffffffa126f307 R11: fffffbfff424de60 R12: ffff88800ec41000 [ 73.220794][ T7] R13: ffff888100907d08 R14: ffff888100907c50 R15: ffff88800fc78c40 [ 73.223337][ T7] FS: 0000000000000000(0000) GS:ffff888114800000(0000) knlGS:0000000000000000 [ 73.225814][ T7] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 73.227616][ T7] CR2: 0000562b5cb4f4d0 CR3: 0000000105fbe001 CR4: 00000000003706e0 [ 73.229700][ T7] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 73.231820][ T7] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 73.233844][ T7] Call Trace: [ 73.234698][ T7] ? vxlan_err_lookup+0x3c0/0x3c0 [vxlan] [ 73.235962][ T7] ? ops_exit_list.isra.11+0x93/0x140 [ 73.237134][ T7] cleanup_net+0x45e/0x8a0 [ ... ] Fixes: 57b61127ab7d ("vxlan: speedup vxlan tunnels dismantle") Signed-off-by: Taehee Yoo Link: https://lore.kernel.org/r/20210221154552.11749-1-ap420073@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 3929e437382b..666dd201c3d5 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -4721,7 +4721,6 @@ static void vxlan_destroy_tunnels(struct net *net, struct list_head *head) struct vxlan_net *vn = net_generic(net, vxlan_net_id); struct vxlan_dev *vxlan, *next; struct net_device *dev, *aux; - unsigned int h; for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == &vxlan_link_ops) @@ -4735,14 +4734,13 @@ static void vxlan_destroy_tunnels(struct net *net, struct list_head *head) unregister_netdevice_queue(vxlan->dev, head); } - for (h = 0; h < PORT_HASH_SIZE; ++h) - WARN_ON_ONCE(!hlist_empty(&vn->sock_list[h])); } static void __net_exit vxlan_exit_batch_net(struct list_head *net_list) { struct net *net; LIST_HEAD(list); + unsigned int h; rtnl_lock(); list_for_each_entry(net, net_list, exit_list) { @@ -4755,6 +4753,13 @@ static void __net_exit vxlan_exit_batch_net(struct list_head *net_list) unregister_netdevice_many(&list); rtnl_unlock(); + + list_for_each_entry(net, net_list, exit_list) { + struct vxlan_net *vn = net_generic(net, vxlan_net_id); + + for (h = 0; h < PORT_HASH_SIZE; ++h) + WARN_ON_ONCE(!hlist_empty(&vn->sock_list[h])); + } } static struct pernet_operations vxlan_net_ops = { From fc0494ead6398609c49afa37bc949b61c5c16b91 Mon Sep 17 00:00:00 2001 From: Takeshi Misawa Date: Mon, 22 Feb 2021 08:44:27 +0900 Subject: [PATCH 37/55] net: qrtr: Fix memory leak in qrtr_tun_open If qrtr_endpoint_register() failed, tun is leaked. Fix this, by freeing tun in error path. syzbot report: BUG: memory leak unreferenced object 0xffff88811848d680 (size 64): comm "syz-executor684", pid 10171, jiffies 4294951561 (age 26.070s) hex dump (first 32 bytes): 80 dd 0a 84 ff ff ff ff 00 00 00 00 00 00 00 00 ................ 90 d6 48 18 81 88 ff ff 90 d6 48 18 81 88 ff ff ..H.......H..... backtrace: [<0000000018992a50>] kmalloc include/linux/slab.h:552 [inline] [<0000000018992a50>] kzalloc include/linux/slab.h:682 [inline] [<0000000018992a50>] qrtr_tun_open+0x22/0x90 net/qrtr/tun.c:35 [<0000000003a453ef>] misc_open+0x19c/0x1e0 drivers/char/misc.c:141 [<00000000dec38ac8>] chrdev_open+0x10d/0x340 fs/char_dev.c:414 [<0000000079094996>] do_dentry_open+0x1e6/0x620 fs/open.c:817 [<000000004096d290>] do_open fs/namei.c:3252 [inline] [<000000004096d290>] path_openat+0x74a/0x1b00 fs/namei.c:3369 [<00000000b8e64241>] do_filp_open+0xa0/0x190 fs/namei.c:3396 [<00000000a3299422>] do_sys_openat2+0xed/0x230 fs/open.c:1172 [<000000002c1bdcef>] do_sys_open fs/open.c:1188 [inline] [<000000002c1bdcef>] __do_sys_openat fs/open.c:1204 [inline] [<000000002c1bdcef>] __se_sys_openat fs/open.c:1199 [inline] [<000000002c1bdcef>] __x64_sys_openat+0x7f/0xe0 fs/open.c:1199 [<00000000f3a5728f>] do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 [<000000004b38b7ec>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: 28fb4e59a47d ("net: qrtr: Expose tunneling endpoint to user space") Reported-by: syzbot+5d6e4af21385f5cfc56a@syzkaller.appspotmail.com Signed-off-by: Takeshi Misawa Link: https://lore.kernel.org/r/20210221234427.GA2140@DESKTOP Signed-off-by: Jakub Kicinski --- net/qrtr/tun.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/net/qrtr/tun.c b/net/qrtr/tun.c index b238c40a9984..304b41fea5ab 100644 --- a/net/qrtr/tun.c +++ b/net/qrtr/tun.c @@ -31,6 +31,7 @@ static int qrtr_tun_send(struct qrtr_endpoint *ep, struct sk_buff *skb) static int qrtr_tun_open(struct inode *inode, struct file *filp) { struct qrtr_tun *tun; + int ret; tun = kzalloc(sizeof(*tun), GFP_KERNEL); if (!tun) @@ -43,7 +44,16 @@ static int qrtr_tun_open(struct inode *inode, struct file *filp) filp->private_data = tun; - return qrtr_endpoint_register(&tun->ep, QRTR_EP_NID_AUTO); + ret = qrtr_endpoint_register(&tun->ep, QRTR_EP_NID_AUTO); + if (ret) + goto out; + + return 0; + +out: + filp->private_data = NULL; + kfree(tun); + return ret; } static ssize_t qrtr_tun_read_iter(struct kiocb *iocb, struct iov_iter *to) From 30ac4e2f54ec067b7b9ca0db27e75681581378d6 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Mon, 22 Feb 2021 17:25:43 +0100 Subject: [PATCH 38/55] wireguard: avoid double unlikely() notation when using IS_ERR() The definition of IS_ERR() already applies the unlikely() notation when checking the error status of the passed pointer. For this reason there is no need to have the same notation outside of IS_ERR() itself. Clean up code by removing redundant notation. Signed-off-by: Antonio Quartulli Signed-off-by: Jason A. Donenfeld Signed-off-by: Jakub Kicinski --- drivers/net/wireguard/device.c | 2 +- drivers/net/wireguard/socket.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireguard/device.c b/drivers/net/wireguard/device.c index a3ed49cd95c3..cd51a2afa28e 100644 --- a/drivers/net/wireguard/device.c +++ b/drivers/net/wireguard/device.c @@ -157,7 +157,7 @@ static netdev_tx_t wg_xmit(struct sk_buff *skb, struct net_device *dev) } else { struct sk_buff *segs = skb_gso_segment(skb, 0); - if (unlikely(IS_ERR(segs))) { + if (IS_ERR(segs)) { ret = PTR_ERR(segs); goto err_peer; } diff --git a/drivers/net/wireguard/socket.c b/drivers/net/wireguard/socket.c index 410b318e57fb..41430c0e465a 100644 --- a/drivers/net/wireguard/socket.c +++ b/drivers/net/wireguard/socket.c @@ -71,7 +71,7 @@ static int send4(struct wg_device *wg, struct sk_buff *skb, ip_rt_put(rt); rt = ip_route_output_flow(sock_net(sock), &fl, sock); } - if (unlikely(IS_ERR(rt))) { + if (IS_ERR(rt)) { ret = PTR_ERR(rt); net_dbg_ratelimited("%s: No route to %pISpfsc, error %d\n", wg->dev->name, &endpoint->addr, ret); @@ -138,7 +138,7 @@ static int send6(struct wg_device *wg, struct sk_buff *skb, } dst = ipv6_stub->ipv6_dst_lookup_flow(sock_net(sock), sock, &fl, NULL); - if (unlikely(IS_ERR(dst))) { + if (IS_ERR(dst)) { ret = PTR_ERR(dst); net_dbg_ratelimited("%s: No route to %pISpfsc, error %d\n", wg->dev->name, &endpoint->addr, ret); From 7f57bd8dc22de35ddd895294aa554003e4f19a72 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 22 Feb 2021 17:25:44 +0100 Subject: [PATCH 39/55] wireguard: socket: remove bogus __be32 annotation The endpoint->src_if4 has nothing to do with fixed-endian numbers; remove the bogus annotation. This was introduced in https://git.zx2c4.com/wireguard-monolithic-historical/commit?id=14e7d0a499a676ec55176c0de2f9fcbd34074a82 in the historical WireGuard repo because the old code used to zero-initialize multiple members as follows: endpoint->src4.s_addr = endpoint->src_if4 = fl.saddr = 0; Because fl.saddr is fixed-endian and an assignment returns a value with the type of its left operand, this meant that sparse detected an assignment between values of different endianness. Since then, this assignment was already split up into separate statements; just the cast survived. Signed-off-by: Jann Horn Signed-off-by: Jason A. Donenfeld Signed-off-by: Jakub Kicinski --- drivers/net/wireguard/socket.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireguard/socket.c b/drivers/net/wireguard/socket.c index 41430c0e465a..d9ad850daa79 100644 --- a/drivers/net/wireguard/socket.c +++ b/drivers/net/wireguard/socket.c @@ -53,7 +53,7 @@ static int send4(struct wg_device *wg, struct sk_buff *skb, if (unlikely(!inet_confirm_addr(sock_net(sock), NULL, 0, fl.saddr, RT_SCOPE_HOST))) { endpoint->src4.s_addr = 0; - *(__force __be32 *)&endpoint->src_if4 = 0; + endpoint->src_if4 = 0; fl.saddr = 0; if (cache) dst_cache_reset(cache); @@ -63,7 +63,7 @@ static int send4(struct wg_device *wg, struct sk_buff *skb, PTR_ERR(rt) == -EINVAL) || (!IS_ERR(rt) && rt->dst.dev->ifindex != endpoint->src_if4)))) { endpoint->src4.s_addr = 0; - *(__force __be32 *)&endpoint->src_if4 = 0; + endpoint->src_if4 = 0; fl.saddr = 0; if (cache) dst_cache_reset(cache); From d5a49aa6c3e264a93a7d08485d66e346be0969dd Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 22 Feb 2021 17:25:45 +0100 Subject: [PATCH 40/55] wireguard: selftests: test multiple parallel streams In order to test ndo_start_xmit being called in parallel, explicitly add separate tests, which should all run on different cores. This should help tease out bugs associated with queueing up packets from different cores in parallel. Currently, it hasn't found those types of bugs, but given future planned work, this is a useful regression to avoid. Fixes: e7096c131e51 ("net: WireGuard secure network tunnel") Signed-off-by: Jason A. Donenfeld Signed-off-by: Jakub Kicinski --- tools/testing/selftests/wireguard/netns.sh | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/wireguard/netns.sh b/tools/testing/selftests/wireguard/netns.sh index 74c69b75f6f5..7ed7cd95e58f 100755 --- a/tools/testing/selftests/wireguard/netns.sh +++ b/tools/testing/selftests/wireguard/netns.sh @@ -39,7 +39,7 @@ ip0() { pretty 0 "ip $*"; ip -n $netns0 "$@"; } ip1() { pretty 1 "ip $*"; ip -n $netns1 "$@"; } ip2() { pretty 2 "ip $*"; ip -n $netns2 "$@"; } sleep() { read -t "$1" -N 1 || true; } -waitiperf() { pretty "${1//*-}" "wait for iperf:5201 pid $2"; while [[ $(ss -N "$1" -tlpH 'sport = 5201') != *\"iperf3\",pid=$2,fd=* ]]; do sleep 0.1; done; } +waitiperf() { pretty "${1//*-}" "wait for iperf:${3:-5201} pid $2"; while [[ $(ss -N "$1" -tlpH "sport = ${3:-5201}") != *\"iperf3\",pid=$2,fd=* ]]; do sleep 0.1; done; } waitncatudp() { pretty "${1//*-}" "wait for udp:1111 pid $2"; while [[ $(ss -N "$1" -ulpH 'sport = 1111') != *\"ncat\",pid=$2,fd=* ]]; do sleep 0.1; done; } waitiface() { pretty "${1//*-}" "wait for $2 to come up"; ip netns exec "$1" bash -c "while [[ \$(< \"/sys/class/net/$2/operstate\") != up ]]; do read -t .1 -N 0 || true; done;"; } @@ -141,6 +141,19 @@ tests() { n2 iperf3 -s -1 -B fd00::2 & waitiperf $netns2 $! n1 iperf3 -Z -t 3 -b 0 -u -c fd00::2 + + # TCP over IPv4, in parallel + for max in 4 5 50; do + local pids=( ) + for ((i=0; i < max; ++i)) do + n2 iperf3 -p $(( 5200 + i )) -s -1 -B 192.168.241.2 & + pids+=( $! ); waitiperf $netns2 $! $(( 5200 + i )) + done + for ((i=0; i < max; ++i)) do + n1 iperf3 -Z -t 3 -p $(( 5200 + i )) -c 192.168.241.2 & + done + wait "${pids[@]}" + done } [[ $(ip1 link show dev wg0) =~ mtu\ ([0-9]+) ]] && orig_mtu="${BASH_REMATCH[1]}" From 5a0598695634a6bb4126818902dd9140cd9df8b6 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 22 Feb 2021 17:25:46 +0100 Subject: [PATCH 41/55] wireguard: peer: put frequently used members above cache lines The is_dead boolean is checked for every single packet, while the internal_id member is used basically only for pr_debug messages. So it makes sense to hoist up is_dead into some space formerly unused by a struct hole, while demoting internal_api to below the lowest struct cache line. Signed-off-by: Jason A. Donenfeld Signed-off-by: Jakub Kicinski --- drivers/net/wireguard/peer.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireguard/peer.h b/drivers/net/wireguard/peer.h index 23af40922997..aaff8de6e34b 100644 --- a/drivers/net/wireguard/peer.h +++ b/drivers/net/wireguard/peer.h @@ -39,6 +39,7 @@ struct wg_peer { struct crypt_queue tx_queue, rx_queue; struct sk_buff_head staged_packet_queue; int serial_work_cpu; + bool is_dead; struct noise_keypairs keypairs; struct endpoint endpoint; struct dst_cache endpoint_cache; @@ -61,9 +62,8 @@ struct wg_peer { struct rcu_head rcu; struct list_head peer_list; struct list_head allowedips_list; - u64 internal_id; struct napi_struct napi; - bool is_dead; + u64 internal_id; }; struct wg_peer *wg_peer_create(struct wg_device *wg, From 99fff5264e7ab06f45b0ad60243475be0a8d0559 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 22 Feb 2021 17:25:47 +0100 Subject: [PATCH 42/55] wireguard: device: do not generate ICMP for non-IP packets If skb->protocol doesn't match the actual skb->data header, it's probably not a good idea to pass it off to icmp{,v6}_ndo_send, which is expecting to reply to a valid IP packet. So this commit has that early mismatch case jump to a later error label. Fixes: e7096c131e51 ("net: WireGuard secure network tunnel") Signed-off-by: Jason A. Donenfeld Signed-off-by: Jakub Kicinski --- drivers/net/wireguard/device.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireguard/device.c b/drivers/net/wireguard/device.c index cd51a2afa28e..8502e1b083ff 100644 --- a/drivers/net/wireguard/device.c +++ b/drivers/net/wireguard/device.c @@ -138,7 +138,7 @@ static netdev_tx_t wg_xmit(struct sk_buff *skb, struct net_device *dev) else if (skb->protocol == htons(ETH_P_IPV6)) net_dbg_ratelimited("%s: No peer has allowed IPs matching %pI6\n", dev->name, &ipv6_hdr(skb)->daddr); - goto err; + goto err_icmp; } family = READ_ONCE(peer->endpoint.addr.sa_family); @@ -201,12 +201,13 @@ static netdev_tx_t wg_xmit(struct sk_buff *skb, struct net_device *dev) err_peer: wg_peer_put(peer); -err: - ++dev->stats.tx_errors; +err_icmp: if (skb->protocol == htons(ETH_P_IP)) icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); else if (skb->protocol == htons(ETH_P_IPV6)) icmpv6_ndo_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); +err: + ++dev->stats.tx_errors; kfree_skb(skb); return ret; } From 8b5553ace83cced775eefd0f3f18b5c6214ccf7a Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 22 Feb 2021 17:25:48 +0100 Subject: [PATCH 43/55] wireguard: queueing: get rid of per-peer ring buffers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Having two ring buffers per-peer means that every peer results in two massive ring allocations. On an 8-core x86_64 machine, this commit reduces the per-peer allocation from 18,688 bytes to 1,856 bytes, which is an 90% reduction. Ninety percent! With some single-machine deployments approaching 500,000 peers, we're talking about a reduction from 7 gigs of memory down to 700 megs of memory. In order to get rid of these per-peer allocations, this commit switches to using a list-based queueing approach. Currently GSO fragments are chained together using the skb->next pointer (the skb_list_* singly linked list approach), so we form the per-peer queue around the unused skb->prev pointer (which sort of makes sense because the links are pointing backwards). Use of skb_queue_* is not possible here, because that is based on doubly linked lists and spinlocks. Multiple cores can write into the queue at any given time, because its writes occur in the start_xmit path or in the udp_recv path. But reads happen in a single workqueue item per-peer, amounting to a multi-producer, single-consumer paradigm. The MPSC queue is implemented locklessly and never blocks. However, it is not linearizable (though it is serializable), with a very tight and unlikely race on writes, which, when hit (some tiny fraction of the 0.15% of partial adds on a fully loaded 16-core x86_64 system), causes the queue reader to terminate early. However, because every packet sent queues up the same workqueue item after it is fully added, the worker resumes again, and stopping early isn't actually a problem, since at that point the packet wouldn't have yet been added to the encryption queue. These properties allow us to avoid disabling interrupts or spinning. The design is based on Dmitry Vyukov's algorithm [1]. Performance-wise, ordinarily list-based queues aren't preferable to ringbuffers, because of cache misses when following pointers around. However, we *already* have to follow the adjacent pointers when working through fragments, so there shouldn't actually be any change there. A potential downside is that dequeueing is a bit more complicated, but the ptr_ring structure used prior had a spinlock when dequeueing, so all and all the difference appears to be a wash. Actually, from profiling, the biggest performance hit, by far, of this commit winds up being atomic_add_unless(count, 1, max) and atomic_ dec(count), which account for the majority of CPU time, according to perf. In that sense, the previous ring buffer was superior in that it could check if it was full by head==tail, which the list-based approach cannot do. But all and all, this enables us to get massive memory savings, allowing WireGuard to scale for real world deployments, without taking much of a performance hit. [1] http://www.1024cores.net/home/lock-free-algorithms/queues/intrusive-mpsc-node-based-queue Reviewed-by: Dmitry Vyukov Reviewed-by: Toke Høiland-Jørgensen Fixes: e7096c131e51 ("net: WireGuard secure network tunnel") Signed-off-by: Jason A. Donenfeld Signed-off-by: Jakub Kicinski --- drivers/net/wireguard/device.c | 12 ++--- drivers/net/wireguard/device.h | 15 +++--- drivers/net/wireguard/peer.c | 28 ++++------- drivers/net/wireguard/peer.h | 4 +- drivers/net/wireguard/queueing.c | 86 +++++++++++++++++++++++++------- drivers/net/wireguard/queueing.h | 45 ++++++++++++----- drivers/net/wireguard/receive.c | 16 +++--- drivers/net/wireguard/send.c | 31 ++++-------- 8 files changed, 144 insertions(+), 93 deletions(-) diff --git a/drivers/net/wireguard/device.c b/drivers/net/wireguard/device.c index 8502e1b083ff..551ddaaaf540 100644 --- a/drivers/net/wireguard/device.c +++ b/drivers/net/wireguard/device.c @@ -235,8 +235,8 @@ static void wg_destruct(struct net_device *dev) destroy_workqueue(wg->handshake_receive_wq); destroy_workqueue(wg->handshake_send_wq); destroy_workqueue(wg->packet_crypt_wq); - wg_packet_queue_free(&wg->decrypt_queue, true); - wg_packet_queue_free(&wg->encrypt_queue, true); + wg_packet_queue_free(&wg->decrypt_queue); + wg_packet_queue_free(&wg->encrypt_queue); rcu_barrier(); /* Wait for all the peers to be actually freed. */ wg_ratelimiter_uninit(); memzero_explicit(&wg->static_identity, sizeof(wg->static_identity)); @@ -338,12 +338,12 @@ static int wg_newlink(struct net *src_net, struct net_device *dev, goto err_destroy_handshake_send; ret = wg_packet_queue_init(&wg->encrypt_queue, wg_packet_encrypt_worker, - true, MAX_QUEUED_PACKETS); + MAX_QUEUED_PACKETS); if (ret < 0) goto err_destroy_packet_crypt; ret = wg_packet_queue_init(&wg->decrypt_queue, wg_packet_decrypt_worker, - true, MAX_QUEUED_PACKETS); + MAX_QUEUED_PACKETS); if (ret < 0) goto err_free_encrypt_queue; @@ -368,9 +368,9 @@ static int wg_newlink(struct net *src_net, struct net_device *dev, err_uninit_ratelimiter: wg_ratelimiter_uninit(); err_free_decrypt_queue: - wg_packet_queue_free(&wg->decrypt_queue, true); + wg_packet_queue_free(&wg->decrypt_queue); err_free_encrypt_queue: - wg_packet_queue_free(&wg->encrypt_queue, true); + wg_packet_queue_free(&wg->encrypt_queue); err_destroy_packet_crypt: destroy_workqueue(wg->packet_crypt_wq); err_destroy_handshake_send: diff --git a/drivers/net/wireguard/device.h b/drivers/net/wireguard/device.h index 4d0144e16947..854bc3d97150 100644 --- a/drivers/net/wireguard/device.h +++ b/drivers/net/wireguard/device.h @@ -27,13 +27,14 @@ struct multicore_worker { struct crypt_queue { struct ptr_ring ring; - union { - struct { - struct multicore_worker __percpu *worker; - int last_cpu; - }; - struct work_struct work; - }; + struct multicore_worker __percpu *worker; + int last_cpu; +}; + +struct prev_queue { + struct sk_buff *head, *tail, *peeked; + struct { struct sk_buff *next, *prev; } empty; // Match first 2 members of struct sk_buff. + atomic_t count; }; struct wg_device { diff --git a/drivers/net/wireguard/peer.c b/drivers/net/wireguard/peer.c index b3b6370e6b95..cd5cb0292cb6 100644 --- a/drivers/net/wireguard/peer.c +++ b/drivers/net/wireguard/peer.c @@ -32,27 +32,22 @@ struct wg_peer *wg_peer_create(struct wg_device *wg, peer = kzalloc(sizeof(*peer), GFP_KERNEL); if (unlikely(!peer)) return ERR_PTR(ret); - peer->device = wg; + if (dst_cache_init(&peer->endpoint_cache, GFP_KERNEL)) + goto err; + peer->device = wg; wg_noise_handshake_init(&peer->handshake, &wg->static_identity, public_key, preshared_key, peer); - if (dst_cache_init(&peer->endpoint_cache, GFP_KERNEL)) - goto err_1; - if (wg_packet_queue_init(&peer->tx_queue, wg_packet_tx_worker, false, - MAX_QUEUED_PACKETS)) - goto err_2; - if (wg_packet_queue_init(&peer->rx_queue, NULL, false, - MAX_QUEUED_PACKETS)) - goto err_3; - peer->internal_id = atomic64_inc_return(&peer_counter); peer->serial_work_cpu = nr_cpumask_bits; wg_cookie_init(&peer->latest_cookie); wg_timers_init(peer); wg_cookie_checker_precompute_peer_keys(peer); spin_lock_init(&peer->keypairs.keypair_update_lock); - INIT_WORK(&peer->transmit_handshake_work, - wg_packet_handshake_send_worker); + INIT_WORK(&peer->transmit_handshake_work, wg_packet_handshake_send_worker); + INIT_WORK(&peer->transmit_packet_work, wg_packet_tx_worker); + wg_prev_queue_init(&peer->tx_queue); + wg_prev_queue_init(&peer->rx_queue); rwlock_init(&peer->endpoint_lock); kref_init(&peer->refcount); skb_queue_head_init(&peer->staged_packet_queue); @@ -68,11 +63,7 @@ struct wg_peer *wg_peer_create(struct wg_device *wg, pr_debug("%s: Peer %llu created\n", wg->dev->name, peer->internal_id); return peer; -err_3: - wg_packet_queue_free(&peer->tx_queue, false); -err_2: - dst_cache_destroy(&peer->endpoint_cache); -err_1: +err: kfree(peer); return ERR_PTR(ret); } @@ -197,8 +188,7 @@ static void rcu_release(struct rcu_head *rcu) struct wg_peer *peer = container_of(rcu, struct wg_peer, rcu); dst_cache_destroy(&peer->endpoint_cache); - wg_packet_queue_free(&peer->rx_queue, false); - wg_packet_queue_free(&peer->tx_queue, false); + WARN_ON(wg_prev_queue_peek(&peer->tx_queue) || wg_prev_queue_peek(&peer->rx_queue)); /* The final zeroing takes care of clearing any remaining handshake key * material and other potentially sensitive information. diff --git a/drivers/net/wireguard/peer.h b/drivers/net/wireguard/peer.h index aaff8de6e34b..8d53b687a1d1 100644 --- a/drivers/net/wireguard/peer.h +++ b/drivers/net/wireguard/peer.h @@ -36,7 +36,7 @@ struct endpoint { struct wg_peer { struct wg_device *device; - struct crypt_queue tx_queue, rx_queue; + struct prev_queue tx_queue, rx_queue; struct sk_buff_head staged_packet_queue; int serial_work_cpu; bool is_dead; @@ -46,7 +46,7 @@ struct wg_peer { rwlock_t endpoint_lock; struct noise_handshake handshake; atomic64_t last_sent_handshake; - struct work_struct transmit_handshake_work, clear_peer_work; + struct work_struct transmit_handshake_work, clear_peer_work, transmit_packet_work; struct cookie latest_cookie; struct hlist_node pubkey_hash; u64 rx_bytes, tx_bytes; diff --git a/drivers/net/wireguard/queueing.c b/drivers/net/wireguard/queueing.c index 71b8e80b58e1..48e7b982a307 100644 --- a/drivers/net/wireguard/queueing.c +++ b/drivers/net/wireguard/queueing.c @@ -9,8 +9,7 @@ struct multicore_worker __percpu * wg_packet_percpu_multicore_worker_alloc(work_func_t function, void *ptr) { int cpu; - struct multicore_worker __percpu *worker = - alloc_percpu(struct multicore_worker); + struct multicore_worker __percpu *worker = alloc_percpu(struct multicore_worker); if (!worker) return NULL; @@ -23,7 +22,7 @@ wg_packet_percpu_multicore_worker_alloc(work_func_t function, void *ptr) } int wg_packet_queue_init(struct crypt_queue *queue, work_func_t function, - bool multicore, unsigned int len) + unsigned int len) { int ret; @@ -31,25 +30,78 @@ int wg_packet_queue_init(struct crypt_queue *queue, work_func_t function, ret = ptr_ring_init(&queue->ring, len, GFP_KERNEL); if (ret) return ret; - if (function) { - if (multicore) { - queue->worker = wg_packet_percpu_multicore_worker_alloc( - function, queue); - if (!queue->worker) { - ptr_ring_cleanup(&queue->ring, NULL); - return -ENOMEM; - } - } else { - INIT_WORK(&queue->work, function); - } + queue->worker = wg_packet_percpu_multicore_worker_alloc(function, queue); + if (!queue->worker) { + ptr_ring_cleanup(&queue->ring, NULL); + return -ENOMEM; } return 0; } -void wg_packet_queue_free(struct crypt_queue *queue, bool multicore) +void wg_packet_queue_free(struct crypt_queue *queue) { - if (multicore) - free_percpu(queue->worker); + free_percpu(queue->worker); WARN_ON(!__ptr_ring_empty(&queue->ring)); ptr_ring_cleanup(&queue->ring, NULL); } + +#define NEXT(skb) ((skb)->prev) +#define STUB(queue) ((struct sk_buff *)&queue->empty) + +void wg_prev_queue_init(struct prev_queue *queue) +{ + NEXT(STUB(queue)) = NULL; + queue->head = queue->tail = STUB(queue); + queue->peeked = NULL; + atomic_set(&queue->count, 0); + BUILD_BUG_ON( + offsetof(struct sk_buff, next) != offsetof(struct prev_queue, empty.next) - + offsetof(struct prev_queue, empty) || + offsetof(struct sk_buff, prev) != offsetof(struct prev_queue, empty.prev) - + offsetof(struct prev_queue, empty)); +} + +static void __wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb) +{ + WRITE_ONCE(NEXT(skb), NULL); + WRITE_ONCE(NEXT(xchg_release(&queue->head, skb)), skb); +} + +bool wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb) +{ + if (!atomic_add_unless(&queue->count, 1, MAX_QUEUED_PACKETS)) + return false; + __wg_prev_queue_enqueue(queue, skb); + return true; +} + +struct sk_buff *wg_prev_queue_dequeue(struct prev_queue *queue) +{ + struct sk_buff *tail = queue->tail, *next = smp_load_acquire(&NEXT(tail)); + + if (tail == STUB(queue)) { + if (!next) + return NULL; + queue->tail = next; + tail = next; + next = smp_load_acquire(&NEXT(next)); + } + if (next) { + queue->tail = next; + atomic_dec(&queue->count); + return tail; + } + if (tail != READ_ONCE(queue->head)) + return NULL; + __wg_prev_queue_enqueue(queue, STUB(queue)); + next = smp_load_acquire(&NEXT(tail)); + if (next) { + queue->tail = next; + atomic_dec(&queue->count); + return tail; + } + return NULL; +} + +#undef NEXT +#undef STUB diff --git a/drivers/net/wireguard/queueing.h b/drivers/net/wireguard/queueing.h index dfb674e03076..4ef2944a68bc 100644 --- a/drivers/net/wireguard/queueing.h +++ b/drivers/net/wireguard/queueing.h @@ -17,12 +17,13 @@ struct wg_device; struct wg_peer; struct multicore_worker; struct crypt_queue; +struct prev_queue; struct sk_buff; /* queueing.c APIs: */ int wg_packet_queue_init(struct crypt_queue *queue, work_func_t function, - bool multicore, unsigned int len); -void wg_packet_queue_free(struct crypt_queue *queue, bool multicore); + unsigned int len); +void wg_packet_queue_free(struct crypt_queue *queue); struct multicore_worker __percpu * wg_packet_percpu_multicore_worker_alloc(work_func_t function, void *ptr); @@ -135,8 +136,31 @@ static inline int wg_cpumask_next_online(int *next) return cpu; } +void wg_prev_queue_init(struct prev_queue *queue); + +/* Multi producer */ +bool wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb); + +/* Single consumer */ +struct sk_buff *wg_prev_queue_dequeue(struct prev_queue *queue); + +/* Single consumer */ +static inline struct sk_buff *wg_prev_queue_peek(struct prev_queue *queue) +{ + if (queue->peeked) + return queue->peeked; + queue->peeked = wg_prev_queue_dequeue(queue); + return queue->peeked; +} + +/* Single consumer */ +static inline void wg_prev_queue_drop_peeked(struct prev_queue *queue) +{ + queue->peeked = NULL; +} + static inline int wg_queue_enqueue_per_device_and_peer( - struct crypt_queue *device_queue, struct crypt_queue *peer_queue, + struct crypt_queue *device_queue, struct prev_queue *peer_queue, struct sk_buff *skb, struct workqueue_struct *wq, int *next_cpu) { int cpu; @@ -145,8 +169,9 @@ static inline int wg_queue_enqueue_per_device_and_peer( /* We first queue this up for the peer ingestion, but the consumer * will wait for the state to change to CRYPTED or DEAD before. */ - if (unlikely(ptr_ring_produce_bh(&peer_queue->ring, skb))) + if (unlikely(!wg_prev_queue_enqueue(peer_queue, skb))) return -ENOSPC; + /* Then we queue it up in the device queue, which consumes the * packet as soon as it can. */ @@ -157,9 +182,7 @@ static inline int wg_queue_enqueue_per_device_and_peer( return 0; } -static inline void wg_queue_enqueue_per_peer(struct crypt_queue *queue, - struct sk_buff *skb, - enum packet_state state) +static inline void wg_queue_enqueue_per_peer_tx(struct sk_buff *skb, enum packet_state state) { /* We take a reference, because as soon as we call atomic_set, the * peer can be freed from below us. @@ -167,14 +190,12 @@ static inline void wg_queue_enqueue_per_peer(struct crypt_queue *queue, struct wg_peer *peer = wg_peer_get(PACKET_PEER(skb)); atomic_set_release(&PACKET_CB(skb)->state, state); - queue_work_on(wg_cpumask_choose_online(&peer->serial_work_cpu, - peer->internal_id), - peer->device->packet_crypt_wq, &queue->work); + queue_work_on(wg_cpumask_choose_online(&peer->serial_work_cpu, peer->internal_id), + peer->device->packet_crypt_wq, &peer->transmit_packet_work); wg_peer_put(peer); } -static inline void wg_queue_enqueue_per_peer_napi(struct sk_buff *skb, - enum packet_state state) +static inline void wg_queue_enqueue_per_peer_rx(struct sk_buff *skb, enum packet_state state) { /* We take a reference, because as soon as we call atomic_set, the * peer can be freed from below us. diff --git a/drivers/net/wireguard/receive.c b/drivers/net/wireguard/receive.c index 2c9551ea6dc7..7dc84bcca261 100644 --- a/drivers/net/wireguard/receive.c +++ b/drivers/net/wireguard/receive.c @@ -444,7 +444,6 @@ packet_processed: int wg_packet_rx_poll(struct napi_struct *napi, int budget) { struct wg_peer *peer = container_of(napi, struct wg_peer, napi); - struct crypt_queue *queue = &peer->rx_queue; struct noise_keypair *keypair; struct endpoint endpoint; enum packet_state state; @@ -455,11 +454,10 @@ int wg_packet_rx_poll(struct napi_struct *napi, int budget) if (unlikely(budget <= 0)) return 0; - while ((skb = __ptr_ring_peek(&queue->ring)) != NULL && + while ((skb = wg_prev_queue_peek(&peer->rx_queue)) != NULL && (state = atomic_read_acquire(&PACKET_CB(skb)->state)) != PACKET_STATE_UNCRYPTED) { - __ptr_ring_discard_one(&queue->ring); - peer = PACKET_PEER(skb); + wg_prev_queue_drop_peeked(&peer->rx_queue); keypair = PACKET_CB(skb)->keypair; free = true; @@ -508,7 +506,7 @@ void wg_packet_decrypt_worker(struct work_struct *work) enum packet_state state = likely(decrypt_packet(skb, PACKET_CB(skb)->keypair)) ? PACKET_STATE_CRYPTED : PACKET_STATE_DEAD; - wg_queue_enqueue_per_peer_napi(skb, state); + wg_queue_enqueue_per_peer_rx(skb, state); if (need_resched()) cond_resched(); } @@ -531,12 +529,10 @@ static void wg_packet_consume_data(struct wg_device *wg, struct sk_buff *skb) if (unlikely(READ_ONCE(peer->is_dead))) goto err; - ret = wg_queue_enqueue_per_device_and_peer(&wg->decrypt_queue, - &peer->rx_queue, skb, - wg->packet_crypt_wq, - &wg->decrypt_queue.last_cpu); + ret = wg_queue_enqueue_per_device_and_peer(&wg->decrypt_queue, &peer->rx_queue, skb, + wg->packet_crypt_wq, &wg->decrypt_queue.last_cpu); if (unlikely(ret == -EPIPE)) - wg_queue_enqueue_per_peer_napi(skb, PACKET_STATE_DEAD); + wg_queue_enqueue_per_peer_rx(skb, PACKET_STATE_DEAD); if (likely(!ret || ret == -EPIPE)) { rcu_read_unlock_bh(); return; diff --git a/drivers/net/wireguard/send.c b/drivers/net/wireguard/send.c index f74b9341ab0f..5368f7c35b4b 100644 --- a/drivers/net/wireguard/send.c +++ b/drivers/net/wireguard/send.c @@ -239,8 +239,7 @@ void wg_packet_send_keepalive(struct wg_peer *peer) wg_packet_send_staged_packets(peer); } -static void wg_packet_create_data_done(struct sk_buff *first, - struct wg_peer *peer) +static void wg_packet_create_data_done(struct wg_peer *peer, struct sk_buff *first) { struct sk_buff *skb, *next; bool is_keepalive, data_sent = false; @@ -262,22 +261,19 @@ static void wg_packet_create_data_done(struct sk_buff *first, void wg_packet_tx_worker(struct work_struct *work) { - struct crypt_queue *queue = container_of(work, struct crypt_queue, - work); + struct wg_peer *peer = container_of(work, struct wg_peer, transmit_packet_work); struct noise_keypair *keypair; enum packet_state state; struct sk_buff *first; - struct wg_peer *peer; - while ((first = __ptr_ring_peek(&queue->ring)) != NULL && + while ((first = wg_prev_queue_peek(&peer->tx_queue)) != NULL && (state = atomic_read_acquire(&PACKET_CB(first)->state)) != PACKET_STATE_UNCRYPTED) { - __ptr_ring_discard_one(&queue->ring); - peer = PACKET_PEER(first); + wg_prev_queue_drop_peeked(&peer->tx_queue); keypair = PACKET_CB(first)->keypair; if (likely(state == PACKET_STATE_CRYPTED)) - wg_packet_create_data_done(first, peer); + wg_packet_create_data_done(peer, first); else kfree_skb_list(first); @@ -306,16 +302,14 @@ void wg_packet_encrypt_worker(struct work_struct *work) break; } } - wg_queue_enqueue_per_peer(&PACKET_PEER(first)->tx_queue, first, - state); + wg_queue_enqueue_per_peer_tx(first, state); if (need_resched()) cond_resched(); } } -static void wg_packet_create_data(struct sk_buff *first) +static void wg_packet_create_data(struct wg_peer *peer, struct sk_buff *first) { - struct wg_peer *peer = PACKET_PEER(first); struct wg_device *wg = peer->device; int ret = -EINVAL; @@ -323,13 +317,10 @@ static void wg_packet_create_data(struct sk_buff *first) if (unlikely(READ_ONCE(peer->is_dead))) goto err; - ret = wg_queue_enqueue_per_device_and_peer(&wg->encrypt_queue, - &peer->tx_queue, first, - wg->packet_crypt_wq, - &wg->encrypt_queue.last_cpu); + ret = wg_queue_enqueue_per_device_and_peer(&wg->encrypt_queue, &peer->tx_queue, first, + wg->packet_crypt_wq, &wg->encrypt_queue.last_cpu); if (unlikely(ret == -EPIPE)) - wg_queue_enqueue_per_peer(&peer->tx_queue, first, - PACKET_STATE_DEAD); + wg_queue_enqueue_per_peer_tx(first, PACKET_STATE_DEAD); err: rcu_read_unlock_bh(); if (likely(!ret || ret == -EPIPE)) @@ -393,7 +384,7 @@ void wg_packet_send_staged_packets(struct wg_peer *peer) packets.prev->next = NULL; wg_peer_get(keypair->entry.peer); PACKET_CB(packets.next)->keypair = keypair; - wg_packet_create_data(packets.next); + wg_packet_create_data(peer, packets.next); return; out_invalid: From bce2473927af8de12ad131a743f55d69d358c0b9 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 22 Feb 2021 17:25:49 +0100 Subject: [PATCH 44/55] wireguard: kconfig: use arm chacha even with no neon The condition here was incorrect: a non-neon fallback implementation is available on arm32 when NEON is not supported. Reported-by: Ilya Lipnitskiy Fixes: e7096c131e51 ("net: WireGuard secure network tunnel") Signed-off-by: Jason A. Donenfeld Signed-off-by: Jakub Kicinski --- drivers/net/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 1ebb4b943876..4a01ffa4350a 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -88,7 +88,7 @@ config WIREGUARD select CRYPTO_CURVE25519_X86 if X86 && 64BIT select ARM_CRYPTO if ARM select ARM64_CRYPTO if ARM64 - select CRYPTO_CHACHA20_NEON if (ARM || ARM64) && KERNEL_MODE_NEON + select CRYPTO_CHACHA20_NEON if ARM || (ARM64 && KERNEL_MODE_NEON) select CRYPTO_POLY1305_NEON if ARM64 && KERNEL_MODE_NEON select CRYPTO_POLY1305_ARM if ARM select CRYPTO_CURVE25519_NEON if ARM && KERNEL_MODE_NEON From 88eee9b7b42e69fb622ddb3ff6f37e8e4347f5b2 Mon Sep 17 00:00:00 2001 From: Lech Perczak Date: Tue, 23 Feb 2021 19:34:56 +0100 Subject: [PATCH 45/55] net: usb: qmi_wwan: support ZTE P685M modem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that interface 3 in "option" driver is no longer mapped, add device ID matching it to qmi_wwan. The modem is used inside ZTE MF283+ router and carriers identify it as such. Interface mapping is: 0: QCDM, 1: AT (PCUI), 2: AT (Modem), 3: QMI, 4: ADB T: Bus=02 Lev=02 Prnt=02 Port=05 Cnt=01 Dev#= 3 Spd=480 MxCh= 0 D: Ver= 2.01 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=19d2 ProdID=1275 Rev=f0.00 S: Manufacturer=ZTE,Incorporated S: Product=ZTE Technologies MSM S: SerialNumber=P685M510ZTED0000CP&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&0 C:* #Ifs= 5 Cfg#= 1 Atr=a0 MxPwr=500mA I:* If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=83(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan E: Ad=87(I) Atr=03(Int.) MxPS= 8 Ivl=32ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=(none) E: Ad=88(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms Acked-by: Bjørn Mork Signed-off-by: Lech Perczak Link: https://lore.kernel.org/r/20210223183456.6377-1-lech.perczak@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/qmi_wwan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 6c3d8c2abd38..17a050521b86 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1318,6 +1318,7 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x19d2, 0x1255, 4)}, {QMI_FIXED_INTF(0x19d2, 0x1256, 4)}, {QMI_FIXED_INTF(0x19d2, 0x1270, 5)}, /* ZTE MF667 */ + {QMI_FIXED_INTF(0x19d2, 0x1275, 3)}, /* ZTE P685M */ {QMI_FIXED_INTF(0x19d2, 0x1401, 2)}, {QMI_FIXED_INTF(0x19d2, 0x1402, 2)}, /* ZTE MF60 */ {QMI_FIXED_INTF(0x19d2, 0x1424, 2)}, From 4e096a18867a5a989b510f6999d9c6b6622e8f7b Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Tue, 23 Feb 2021 08:01:26 +0100 Subject: [PATCH 46/55] net: introduce CAN specific pointer in the struct net_device Since 20dd3850bcf8 ("can: Speed up CAN frame receiption by using ml_priv") the CAN framework uses per device specific data in the AF_CAN protocol. For this purpose the struct net_device->ml_priv is used. Later the ml_priv usage in CAN was extended for other users, one of them being CAN_J1939. Later in the kernel ml_priv was converted to an union, used by other drivers. E.g. the tun driver started storing it's stats pointer. Since tun devices can claim to be a CAN device, CAN specific protocols will wrongly interpret this pointer, which will cause system crashes. Mostly this issue is visible in the CAN_J1939 stack. To fix this issue, we request a dedicated CAN pointer within the net_device struct. Reported-by: syzbot+5138c4dd15a0401bec7b@syzkaller.appspotmail.com Fixes: 20dd3850bcf8 ("can: Speed up CAN frame receiption by using ml_priv") Fixes: ffd956eef69b ("can: introduce CAN midlayer private and allocate it automatically") Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Fixes: 497a5757ce4e ("tun: switch to net core provided statistics counters") Signed-off-by: Oleksij Rempel Link: https://lore.kernel.org/r/20210223070127.4538-1-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/can/dev/dev.c | 4 +++- drivers/net/can/slcan.c | 4 +++- drivers/net/can/vcan.c | 2 +- drivers/net/can/vxcan.c | 6 +++++- include/linux/can/can-ml.h | 12 ++++++++++++ include/linux/netdevice.h | 34 +++++++++++++++++++++++++++++++++- net/can/af_can.c | 34 ++-------------------------------- net/can/j1939/main.c | 22 ++++++++-------------- net/can/j1939/socket.c | 13 ++++--------- net/can/proc.c | 19 +++++++++++++------ 10 files changed, 84 insertions(+), 66 deletions(-) diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c index d9281ae853f8..311d8564d611 100644 --- a/drivers/net/can/dev/dev.c +++ b/drivers/net/can/dev/dev.c @@ -239,6 +239,7 @@ void can_setup(struct net_device *dev) struct net_device *alloc_candev_mqs(int sizeof_priv, unsigned int echo_skb_max, unsigned int txqs, unsigned int rxqs) { + struct can_ml_priv *can_ml; struct net_device *dev; struct can_priv *priv; int size; @@ -270,7 +271,8 @@ struct net_device *alloc_candev_mqs(int sizeof_priv, unsigned int echo_skb_max, priv = netdev_priv(dev); priv->dev = dev; - dev->ml_priv = (void *)priv + ALIGN(sizeof_priv, NETDEV_ALIGN); + can_ml = (void *)priv + ALIGN(sizeof_priv, NETDEV_ALIGN); + can_set_ml_priv(dev, can_ml); if (echo_skb_max) { priv->echo_skb_max = echo_skb_max; diff --git a/drivers/net/can/slcan.c b/drivers/net/can/slcan.c index a1bd1be09548..30c8d53c9745 100644 --- a/drivers/net/can/slcan.c +++ b/drivers/net/can/slcan.c @@ -516,6 +516,7 @@ static struct slcan *slc_alloc(void) int i; char name[IFNAMSIZ]; struct net_device *dev = NULL; + struct can_ml_priv *can_ml; struct slcan *sl; int size; @@ -538,7 +539,8 @@ static struct slcan *slc_alloc(void) dev->base_addr = i; sl = netdev_priv(dev); - dev->ml_priv = (void *)sl + ALIGN(sizeof(*sl), NETDEV_ALIGN); + can_ml = (void *)sl + ALIGN(sizeof(*sl), NETDEV_ALIGN); + can_set_ml_priv(dev, can_ml); /* Initialize channel control data */ sl->magic = SLCAN_MAGIC; diff --git a/drivers/net/can/vcan.c b/drivers/net/can/vcan.c index 39ca14b0585d..067705e2850b 100644 --- a/drivers/net/can/vcan.c +++ b/drivers/net/can/vcan.c @@ -153,7 +153,7 @@ static void vcan_setup(struct net_device *dev) dev->addr_len = 0; dev->tx_queue_len = 0; dev->flags = IFF_NOARP; - dev->ml_priv = netdev_priv(dev); + can_set_ml_priv(dev, netdev_priv(dev)); /* set flags according to driver capabilities */ if (echo) diff --git a/drivers/net/can/vxcan.c b/drivers/net/can/vxcan.c index f9a524c5f6d6..8861a7d875e7 100644 --- a/drivers/net/can/vxcan.c +++ b/drivers/net/can/vxcan.c @@ -141,6 +141,8 @@ static const struct net_device_ops vxcan_netdev_ops = { static void vxcan_setup(struct net_device *dev) { + struct can_ml_priv *can_ml; + dev->type = ARPHRD_CAN; dev->mtu = CANFD_MTU; dev->hard_header_len = 0; @@ -149,7 +151,9 @@ static void vxcan_setup(struct net_device *dev) dev->flags = (IFF_NOARP|IFF_ECHO); dev->netdev_ops = &vxcan_netdev_ops; dev->needs_free_netdev = true; - dev->ml_priv = netdev_priv(dev) + ALIGN(sizeof(struct vxcan_priv), NETDEV_ALIGN); + + can_ml = netdev_priv(dev) + ALIGN(sizeof(struct vxcan_priv), NETDEV_ALIGN); + can_set_ml_priv(dev, can_ml); } /* forward declaration for rtnl_create_link() */ diff --git a/include/linux/can/can-ml.h b/include/linux/can/can-ml.h index 2f5d731ae251..8afa92d15a66 100644 --- a/include/linux/can/can-ml.h +++ b/include/linux/can/can-ml.h @@ -44,6 +44,7 @@ #include #include +#include #define CAN_SFF_RCV_ARRAY_SZ (1 << CAN_SFF_ID_BITS) #define CAN_EFF_RCV_HASH_BITS 10 @@ -65,4 +66,15 @@ struct can_ml_priv { #endif }; +static inline struct can_ml_priv *can_get_ml_priv(struct net_device *dev) +{ + return netdev_get_ml_priv(dev, ML_PRIV_CAN); +} + +static inline void can_set_ml_priv(struct net_device *dev, + struct can_ml_priv *ml_priv) +{ + netdev_set_ml_priv(dev, ml_priv, ML_PRIV_CAN); +} + #endif /* CAN_ML_H */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ddf4cfc12615..f06fbee8638e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1584,6 +1584,12 @@ enum netdev_priv_flags { #define IFF_L3MDEV_RX_HANDLER IFF_L3MDEV_RX_HANDLER #define IFF_LIVE_RENAME_OK IFF_LIVE_RENAME_OK +/* Specifies the type of the struct net_device::ml_priv pointer */ +enum netdev_ml_priv_type { + ML_PRIV_NONE, + ML_PRIV_CAN, +}; + /** * struct net_device - The DEVICE structure. * @@ -1779,6 +1785,7 @@ enum netdev_priv_flags { * @nd_net: Network namespace this network device is inside * * @ml_priv: Mid-layer private + * @ml_priv_type: Mid-layer private type * @lstats: Loopback statistics * @tstats: Tunnel statistics * @dstats: Dummy statistics @@ -2094,8 +2101,10 @@ struct net_device { possible_net_t nd_net; /* mid-layer private */ + void *ml_priv; + enum netdev_ml_priv_type ml_priv_type; + union { - void *ml_priv; struct pcpu_lstats __percpu *lstats; struct pcpu_sw_netstats __percpu *tstats; struct pcpu_dstats __percpu *dstats; @@ -2286,6 +2295,29 @@ static inline void netdev_reset_rx_headroom(struct net_device *dev) netdev_set_rx_headroom(dev, -1); } +static inline void *netdev_get_ml_priv(struct net_device *dev, + enum netdev_ml_priv_type type) +{ + if (dev->ml_priv_type != type) + return NULL; + + return dev->ml_priv; +} + +static inline void netdev_set_ml_priv(struct net_device *dev, + void *ml_priv, + enum netdev_ml_priv_type type) +{ + WARN(dev->ml_priv_type && dev->ml_priv_type != type, + "Overwriting already set ml_priv_type (%u) with different ml_priv_type (%u)!\n", + dev->ml_priv_type, type); + WARN(!dev->ml_priv_type && dev->ml_priv, + "Overwriting already set ml_priv and ml_priv_type is ML_PRIV_NONE!\n"); + + dev->ml_priv = ml_priv; + dev->ml_priv_type = type; +} + /* * Net namespace inlines */ diff --git a/net/can/af_can.c b/net/can/af_can.c index 837bb8af0ec3..cce2af10eb3e 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -304,8 +304,8 @@ static struct can_dev_rcv_lists *can_dev_rcv_lists_find(struct net *net, struct net_device *dev) { if (dev) { - struct can_ml_priv *ml_priv = dev->ml_priv; - return &ml_priv->dev_rcv_lists; + struct can_ml_priv *can_ml = can_get_ml_priv(dev); + return &can_ml->dev_rcv_lists; } else { return net->can.rx_alldev_list; } @@ -790,25 +790,6 @@ void can_proto_unregister(const struct can_proto *cp) } EXPORT_SYMBOL(can_proto_unregister); -/* af_can notifier to create/remove CAN netdevice specific structs */ -static int can_notifier(struct notifier_block *nb, unsigned long msg, - void *ptr) -{ - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - - if (dev->type != ARPHRD_CAN) - return NOTIFY_DONE; - - switch (msg) { - case NETDEV_REGISTER: - WARN(!dev->ml_priv, - "No CAN mid layer private allocated, please fix your driver and use alloc_candev()!\n"); - break; - } - - return NOTIFY_DONE; -} - static int can_pernet_init(struct net *net) { spin_lock_init(&net->can.rcvlists_lock); @@ -876,11 +857,6 @@ static const struct net_proto_family can_family_ops = { .owner = THIS_MODULE, }; -/* notifier block for netdevice event */ -static struct notifier_block can_netdev_notifier __read_mostly = { - .notifier_call = can_notifier, -}; - static struct pernet_operations can_pernet_ops __read_mostly = { .init = can_pernet_init, .exit = can_pernet_exit, @@ -911,17 +887,12 @@ static __init int can_init(void) err = sock_register(&can_family_ops); if (err) goto out_sock; - err = register_netdevice_notifier(&can_netdev_notifier); - if (err) - goto out_notifier; dev_add_pack(&can_packet); dev_add_pack(&canfd_packet); return 0; -out_notifier: - sock_unregister(PF_CAN); out_sock: unregister_pernet_subsys(&can_pernet_ops); out_pernet: @@ -935,7 +906,6 @@ static __exit void can_exit(void) /* protocol unregister */ dev_remove_pack(&canfd_packet); dev_remove_pack(&can_packet); - unregister_netdevice_notifier(&can_netdev_notifier); sock_unregister(PF_CAN); unregister_pernet_subsys(&can_pernet_ops); diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c index bb914d8b4216..da3a7a7bcff2 100644 --- a/net/can/j1939/main.c +++ b/net/can/j1939/main.c @@ -140,9 +140,9 @@ static struct j1939_priv *j1939_priv_create(struct net_device *ndev) static inline void j1939_priv_set(struct net_device *ndev, struct j1939_priv *priv) { - struct can_ml_priv *can_ml_priv = ndev->ml_priv; + struct can_ml_priv *can_ml = can_get_ml_priv(ndev); - can_ml_priv->j1939_priv = priv; + can_ml->j1939_priv = priv; } static void __j1939_priv_release(struct kref *kref) @@ -211,12 +211,9 @@ static void __j1939_rx_release(struct kref *kref) /* get pointer to priv without increasing ref counter */ static inline struct j1939_priv *j1939_ndev_to_priv(struct net_device *ndev) { - struct can_ml_priv *can_ml_priv = ndev->ml_priv; + struct can_ml_priv *can_ml = can_get_ml_priv(ndev); - if (!can_ml_priv) - return NULL; - - return can_ml_priv->j1939_priv; + return can_ml->j1939_priv; } static struct j1939_priv *j1939_priv_get_by_ndev_locked(struct net_device *ndev) @@ -225,9 +222,6 @@ static struct j1939_priv *j1939_priv_get_by_ndev_locked(struct net_device *ndev) lockdep_assert_held(&j1939_netdev_lock); - if (ndev->type != ARPHRD_CAN) - return NULL; - priv = j1939_ndev_to_priv(ndev); if (priv) j1939_priv_get(priv); @@ -348,15 +342,16 @@ static int j1939_netdev_notify(struct notifier_block *nb, unsigned long msg, void *data) { struct net_device *ndev = netdev_notifier_info_to_dev(data); + struct can_ml_priv *can_ml = can_get_ml_priv(ndev); struct j1939_priv *priv; + if (!can_ml) + goto notify_done; + priv = j1939_priv_get_by_ndev(ndev); if (!priv) goto notify_done; - if (ndev->type != ARPHRD_CAN) - goto notify_put; - switch (msg) { case NETDEV_DOWN: j1939_cancel_active_session(priv, NULL); @@ -365,7 +360,6 @@ static int j1939_netdev_notify(struct notifier_block *nb, break; } -notify_put: j1939_priv_put(priv); notify_done: diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index f23966526a88..56aa66147d5a 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -12,6 +12,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include @@ -453,6 +454,7 @@ static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len) j1939_jsk_del(priv, jsk); j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa); } else { + struct can_ml_priv *can_ml; struct net_device *ndev; ndev = dev_get_by_index(net, addr->can_ifindex); @@ -461,15 +463,8 @@ static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len) goto out_release_sock; } - if (ndev->type != ARPHRD_CAN) { - dev_put(ndev); - ret = -ENODEV; - goto out_release_sock; - } - - if (!ndev->ml_priv) { - netdev_warn_once(ndev, - "No CAN mid layer private allocated, please fix your driver and use alloc_candev()!\n"); + can_ml = can_get_ml_priv(ndev); + if (!can_ml) { dev_put(ndev); ret = -ENODEV; goto out_release_sock; diff --git a/net/can/proc.c b/net/can/proc.c index 5ea8695f507e..b15760b5c1cc 100644 --- a/net/can/proc.c +++ b/net/can/proc.c @@ -322,8 +322,11 @@ static int can_rcvlist_proc_show(struct seq_file *m, void *v) /* receive list for registered CAN devices */ for_each_netdev_rcu(net, dev) { - if (dev->type == ARPHRD_CAN && dev->ml_priv) - can_rcvlist_proc_show_one(m, idx, dev, dev->ml_priv); + struct can_ml_priv *can_ml = can_get_ml_priv(dev); + + if (can_ml) + can_rcvlist_proc_show_one(m, idx, dev, + &can_ml->dev_rcv_lists); } rcu_read_unlock(); @@ -375,8 +378,10 @@ static int can_rcvlist_sff_proc_show(struct seq_file *m, void *v) /* sff receive list for registered CAN devices */ for_each_netdev_rcu(net, dev) { - if (dev->type == ARPHRD_CAN && dev->ml_priv) { - dev_rcv_lists = dev->ml_priv; + struct can_ml_priv *can_ml = can_get_ml_priv(dev); + + if (can_ml) { + dev_rcv_lists = &can_ml->dev_rcv_lists; can_rcvlist_proc_show_array(m, dev, dev_rcv_lists->rx_sff, ARRAY_SIZE(dev_rcv_lists->rx_sff)); } @@ -406,8 +411,10 @@ static int can_rcvlist_eff_proc_show(struct seq_file *m, void *v) /* eff receive list for registered CAN devices */ for_each_netdev_rcu(net, dev) { - if (dev->type == ARPHRD_CAN && dev->ml_priv) { - dev_rcv_lists = dev->ml_priv; + struct can_ml_priv *can_ml = can_get_ml_priv(dev); + + if (can_ml) { + dev_rcv_lists = &can_ml->dev_rcv_lists; can_rcvlist_proc_show_array(m, dev, dev_rcv_lists->rx_eff, ARRAY_SIZE(dev_rcv_lists->rx_eff)); } From 17d7fd47aa9063c2ff36988e36757ac345733e28 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 23 Feb 2021 10:48:03 +0000 Subject: [PATCH 47/55] net: stmmac: Fix missing spin_lock_init in visconti_eth_dwmac_probe() The driver allocates the spinlock but not initialize it. Use spin_lock_init() on it to initialize it correctly. Fixes: b38dd98ff8d0 ("net: stmmac: Add Toshiba Visconti SoCs glue driver") Reported-by: Hulk Robot Signed-off-by: Wei Yongjun Acked-by: Nobuhiro Iwamatsu Link: https://lore.kernel.org/r/20210223104803.4047281-1-weiyongjun1@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c index b7a0c57dfbfb..d23be45a64e5 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c @@ -218,6 +218,7 @@ static int visconti_eth_dwmac_probe(struct platform_device *pdev) goto remove_config; } + spin_lock_init(&dwmac->lock); dwmac->reg = stmmac_res.addr; plat_dat->bsp_priv = dwmac; plat_dat->fix_mac_speed = visconti_eth_fix_mac_speed; From 8f1c0fd2c84c8bf738b7139d09d4ea53027f47c3 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 23 Feb 2021 21:02:29 -0800 Subject: [PATCH 48/55] ibmvnic: fix a race between open and reset __ibmvnic_reset() currently reads the adapter->state before getting the rtnl and saves that state as the "target state" for the reset. If this read occurs when adapter is in PROBED state, the target state would be PROBED. Just after the target state is saved, and before the actual reset process is started (i.e before rtnl is acquired) if we get an ibmvnic_open() call we would move the adapter to OPEN state. But when the reset is processed (after ibmvnic_open()) drops the rtnl), it will leave the adapter in PROBED state even though we already moved it to OPEN. To fix this, use the RTNL to improve serialization when reading/updating the adapter state. i.e determine the target state of a reset only after getting the RTNL. And if a reset is in progress during an open, simply set the target state of the adapter and let the reset code finish the open (like we currently do if failover is pending). One twist to this serialization is if the adapter state changes when we drop the RTNL to update the link state. Account for this by checking if there was an intervening open and update the target state for the reset accordingly (see new comments in the code). Note that only the reset functions and ibmvnic_open() can set the adapter to OPEN state and this must happen under rtnl. Fixes: 7d7195a026ba ("ibmvnic: Do not process device remove during device reset") Signed-off-by: Sukadev Bhattiprolu Reviewed-by: Dany Madden Link: https://lore.kernel.org/r/20210224050229.1155468-1-sukadev@linux.ibm.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ibm/ibmvnic.c | 63 ++++++++++++++++++++++++++---- 1 file changed, 55 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 1c0e4beb56e7..118a4bd3f877 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -1172,12 +1172,25 @@ static int ibmvnic_open(struct net_device *netdev) struct ibmvnic_adapter *adapter = netdev_priv(netdev); int rc; - /* If device failover is pending, just set device state and return. - * Device operation will be handled by reset routine. + ASSERT_RTNL(); + + /* If device failover is pending or we are about to reset, just set + * device state and return. Device operation will be handled by reset + * routine. + * + * It should be safe to overwrite the adapter->state here. Since + * we hold the rtnl, either the reset has not actually started or + * the rtnl got dropped during the set_link_state() in do_reset(). + * In the former case, no one else is changing the state (again we + * have the rtnl) and in the latter case, do_reset() will detect and + * honor our setting below. */ - if (adapter->failover_pending) { + if (adapter->failover_pending || (test_bit(0, &adapter->resetting))) { + netdev_dbg(netdev, "[S:%d FOP:%d] Resetting, deferring open\n", + adapter->state, adapter->failover_pending); adapter->state = VNIC_OPEN; - return 0; + rc = 0; + goto out; } if (adapter->state != VNIC_CLOSED) { @@ -1196,10 +1209,12 @@ static int ibmvnic_open(struct net_device *netdev) rc = __ibmvnic_open(netdev); out: - /* If open fails due to a pending failover, set device state and - * return. Device operation will be handled by reset routine. + /* If open failed and there is a pending failover or in-progress reset, + * set device state and return. Device operation will be handled by + * reset routine. See also comments above regarding rtnl. */ - if (rc && adapter->failover_pending) { + if (rc && + (adapter->failover_pending || (test_bit(0, &adapter->resetting)))) { adapter->state = VNIC_OPEN; rc = 0; } @@ -1928,6 +1943,14 @@ static int do_reset(struct ibmvnic_adapter *adapter, if (rwi->reset_reason == VNIC_RESET_FAILOVER) adapter->failover_pending = false; + /* read the state and check (again) after getting rtnl */ + reset_state = adapter->state; + + if (reset_state == VNIC_REMOVING || reset_state == VNIC_REMOVED) { + rc = -EBUSY; + goto out; + } + netif_carrier_off(netdev); old_num_rx_queues = adapter->req_rx_queues; @@ -1958,11 +1981,27 @@ static int do_reset(struct ibmvnic_adapter *adapter, if (rc) goto out; + if (adapter->state == VNIC_OPEN) { + /* When we dropped rtnl, ibmvnic_open() got + * it and noticed that we are resetting and + * set the adapter state to OPEN. Update our + * new "target" state, and resume the reset + * from VNIC_CLOSING state. + */ + netdev_dbg(netdev, + "Open changed state from %d, updating.\n", + reset_state); + reset_state = VNIC_OPEN; + adapter->state = VNIC_CLOSING; + } + if (adapter->state != VNIC_CLOSING) { + /* If someone else changed the adapter state + * when we dropped the rtnl, fail the reset + */ rc = -1; goto out; } - adapter->state = VNIC_CLOSED; } } @@ -2106,6 +2145,14 @@ static int do_hard_reset(struct ibmvnic_adapter *adapter, netdev_dbg(adapter->netdev, "Hard resetting driver (%d)\n", rwi->reset_reason); + /* read the state and check (again) after getting rtnl */ + reset_state = adapter->state; + + if (reset_state == VNIC_REMOVING || reset_state == VNIC_REMOVED) { + rc = -EBUSY; + goto out; + } + netif_carrier_off(netdev); adapter->reset_reason = rwi->reset_reason; From fcd4ba3bcba78a97a0f8bdb5df37bc74820f9a62 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 23 Feb 2021 12:20:03 +0100 Subject: [PATCH 49/55] net: dsa: sja1105: Remove unneeded cast in sja1105_crc32() sja1105_unpack() takes a "const void *buf" as its first parameter, so there is no need to cast away the "const" of the "buf" variable before calling it. Drop the cast, as it prevents the compiler performing some checks. Signed-off-by: Geert Uytterhoeven Reviewed-by: Vladimir Oltean Link: https://lore.kernel.org/r/20210223112003.2223332-1-geert+renesas@glider.be Signed-off-by: Jakub Kicinski --- drivers/net/dsa/sja1105/sja1105_static_config.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/sja1105/sja1105_static_config.c b/drivers/net/dsa/sja1105/sja1105_static_config.c index 139b7b4fbd0d..a8efb7fac395 100644 --- a/drivers/net/dsa/sja1105/sja1105_static_config.c +++ b/drivers/net/dsa/sja1105/sja1105_static_config.c @@ -85,7 +85,7 @@ u32 sja1105_crc32(const void *buf, size_t len) /* seed */ crc = ~0; for (i = 0; i < len; i += 4) { - sja1105_unpack((void *)buf + i, &word, 31, 0, 4); + sja1105_unpack(buf + i, &word, 31, 0, 4); crc = crc32_le(crc, (u8 *)&word, 4); } return ~crc; From f176411401127a07a9360dec14eca448eb2e9d45 Mon Sep 17 00:00:00 2001 From: Marco Wenzel Date: Wed, 24 Feb 2021 10:46:49 +0100 Subject: [PATCH 50/55] net: hsr: add support for EntryForgetTime In IEC 62439-3 EntryForgetTime is defined with a value of 400 ms. When a node does not send any frame within this time, the sequence number check for can be ignored. This solves communication issues with Cisco IE 2000 in Redbox mode. Fixes: f421436a591d ("net/hsr: Add support for the High-availability Seamless Redundancy protocol (HSRv0)") Signed-off-by: Marco Wenzel Reviewed-by: George McCollister Tested-by: George McCollister Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20210224094653.1440-1-marco.wenzel@a-eberle.de Signed-off-by: Jakub Kicinski --- net/hsr/hsr_framereg.c | 9 +++++++-- net/hsr/hsr_framereg.h | 1 + net/hsr/hsr_main.h | 1 + 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index f9a8cc82ae2e..bb1351c38397 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -164,8 +164,10 @@ static struct hsr_node *hsr_add_node(struct hsr_priv *hsr, * as initialization. (0 could trigger an spurious ring error warning). */ now = jiffies; - for (i = 0; i < HSR_PT_PORTS; i++) + for (i = 0; i < HSR_PT_PORTS; i++) { new_node->time_in[i] = now; + new_node->time_out[i] = now; + } for (i = 0; i < HSR_PT_PORTS; i++) new_node->seq_out[i] = seq_out; @@ -413,9 +415,12 @@ void hsr_register_frame_in(struct hsr_node *node, struct hsr_port *port, int hsr_register_frame_out(struct hsr_port *port, struct hsr_node *node, u16 sequence_nr) { - if (seq_nr_before_or_eq(sequence_nr, node->seq_out[port->type])) + if (seq_nr_before_or_eq(sequence_nr, node->seq_out[port->type]) && + time_is_after_jiffies(node->time_out[port->type] + + msecs_to_jiffies(HSR_ENTRY_FORGET_TIME))) return 1; + node->time_out[port->type] = jiffies; node->seq_out[port->type] = sequence_nr; return 0; } diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h index 86b43f539f2c..d9628e7a5f05 100644 --- a/net/hsr/hsr_framereg.h +++ b/net/hsr/hsr_framereg.h @@ -75,6 +75,7 @@ struct hsr_node { enum hsr_port_type addr_B_port; unsigned long time_in[HSR_PT_PORTS]; bool time_in_stale[HSR_PT_PORTS]; + unsigned long time_out[HSR_PT_PORTS]; /* if the node is a SAN */ bool san_a; bool san_b; diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index a169808ee78a..8f264672b70b 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -22,6 +22,7 @@ #define HSR_LIFE_CHECK_INTERVAL 2000 /* ms */ #define HSR_NODE_FORGET_TIME 60000 /* ms */ #define HSR_ANNOUNCE_INTERVAL 100 /* ms */ +#define HSR_ENTRY_FORGET_TIME 400 /* ms */ /* By how much may slave1 and slave2 timestamps of latest received frame from * each node differ before we notify of communication problem? From 4dc7f09b8becfa35a55430a49d95acf19f996e6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Wed, 24 Feb 2021 16:18:41 +0100 Subject: [PATCH 51/55] net: broadcom: bcm4908_enet: fix RX path possible mem leak MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After filling RX ring slot with new skb it's required to free old skb. Immediately on error or later in the net subsystem. Fixes: 4feffeadbcb2 ("net: broadcom: bcm4908enet: add BCM4908 controller driver") Signed-off-by: Rafał Miłecki Acked-by: Florian Fainelli Link: https://lore.kernel.org/r/20210224151842.2419-1-zajec5@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bcm4908_enet.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/broadcom/bcm4908_enet.c b/drivers/net/ethernet/broadcom/bcm4908_enet.c index 9be33dc98072..7983c7a9fca9 100644 --- a/drivers/net/ethernet/broadcom/bcm4908_enet.c +++ b/drivers/net/ethernet/broadcom/bcm4908_enet.c @@ -570,6 +570,7 @@ static int bcm4908_enet_poll(struct napi_struct *napi, int weight) if (len < ETH_ZLEN || (ctl & (DMA_CTL_STATUS_SOP | DMA_CTL_STATUS_EOP)) != (DMA_CTL_STATUS_SOP | DMA_CTL_STATUS_EOP)) { + kfree_skb(slot.skb); enet->netdev->stats.rx_dropped++; break; } From 4d9274cee40b6a20dd6148c6c81c6733c2678cbc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Wed, 24 Feb 2021 16:18:42 +0100 Subject: [PATCH 52/55] net: broadcom: bcm4908_enet: fix NAPI poll returned value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Missing increment was resulting in poll function always returning 0 instead of amount of processed packets. Fixes: 4feffeadbcb2 ("net: broadcom: bcm4908enet: add BCM4908 controller driver") Signed-off-by: Rafał Miłecki Acked-by: Florian Fainelli Link: https://lore.kernel.org/r/20210224151842.2419-2-zajec5@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bcm4908_enet.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bcm4908_enet.c b/drivers/net/ethernet/broadcom/bcm4908_enet.c index 7983c7a9fca9..0b70e9e0ddad 100644 --- a/drivers/net/ethernet/broadcom/bcm4908_enet.c +++ b/drivers/net/ethernet/broadcom/bcm4908_enet.c @@ -583,6 +583,8 @@ static int bcm4908_enet_poll(struct napi_struct *napi, int weight) enet->netdev->stats.rx_packets++; enet->netdev->stats.rx_bytes += len; + + handled++; } if (handled < weight) { From a93dcaada2ddb58dbc72652b42548adedd646d7a Mon Sep 17 00:00:00 2001 From: Chris Mi Date: Thu, 25 Feb 2021 15:51:45 +0800 Subject: [PATCH 53/55] net: psample: Fix netlink skb length with tunnel info Currently, the psample netlink skb is allocated with a size that does not account for the nested 'PSAMPLE_ATTR_TUNNEL' attribute and the padding required for the 64-bit attribute 'PSAMPLE_TUNNEL_KEY_ATTR_ID'. This can result in failure to add attributes to the netlink skb due to insufficient tail room. The following error message is printed to the kernel log: "Could not create psample log message". Fix this by adjusting the allocation size to take into account the nested attribute and the padding. Fixes: d8bed686ab96 ("net: psample: Add tunnel support") CC: Yotam Gigi Reviewed-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: Chris Mi Link: https://lore.kernel.org/r/20210225075145.184314-1-cmi@nvidia.com Signed-off-by: Jakub Kicinski --- net/psample/psample.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/psample/psample.c b/net/psample/psample.c index 33e238c965bd..482c07f2766b 100644 --- a/net/psample/psample.c +++ b/net/psample/psample.c @@ -309,10 +309,10 @@ static int psample_tunnel_meta_len(struct ip_tunnel_info *tun_info) unsigned short tun_proto = ip_tunnel_info_af(tun_info); const struct ip_tunnel_key *tun_key = &tun_info->key; int tun_opts_len = tun_info->options_len; - int sum = 0; + int sum = nla_total_size(0); /* PSAMPLE_ATTR_TUNNEL */ if (tun_key->tun_flags & TUNNEL_KEY) - sum += nla_total_size(sizeof(u64)); + sum += nla_total_size_64bit(sizeof(u64)); if (tun_info->mode & IP_TUNNEL_INFO_BRIDGE) sum += nla_total_size(0); From 764d31cacfe48440745c4bbb55a62ac9471c9f19 Mon Sep 17 00:00:00 2001 From: Christian Melki Date: Wed, 24 Feb 2021 21:55:36 +0100 Subject: [PATCH 54/55] net: phy: micrel: set soft_reset callback to genphy_soft_reset for KSZ8081 Following a similar reinstate for the KSZ9031. Older kernels would use the genphy_soft_reset if the PHY did not implement a .soft_reset. Bluntly removing that default may expose a lot of situations where various PHYs/board implementations won't recover on various changes. Like with this implementation during a 4.9.x to 5.4.x LTS transition. I think it's a good thing to remove unwanted soft resets but wonder if it did open a can of worms? Atleast this fixes one iMX6 FEC/RMII/8081 combo. Fixes: 6e2d85ec0559 ("net: phy: Stop with excessive soft reset") Signed-off-by: Christian Melki Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20210224205536.9349-1-christian.melki@t2data.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/micrel.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 7ec6f70d6a82..a14a00328fa3 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -1303,6 +1303,7 @@ static struct phy_driver ksphy_driver[] = { .driver_data = &ksz8081_type, .probe = kszphy_probe, .config_init = ksz8081_config_init, + .soft_reset = genphy_soft_reset, .config_intr = kszphy_config_intr, .handle_interrupt = kszphy_handle_interrupt, .get_sset_count = kszphy_get_sset_count, From 6cf739131a15e4177e58a1b4f2bede9d5da78552 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 25 Feb 2021 16:05:19 +0100 Subject: [PATCH 55/55] r8169: fix jumbo packet handling on RTL8168e MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Josef reported [0] that using jumbo packets fails on RTL8168e. Aligning the values for register MaxTxPacketSize with the vendor driver fixes the problem. [0] https://bugzilla.kernel.org/show_bug.cgi?id=211827 Fixes: d58d46b5d851 ("r8169: jumbo fixes.") Reported-by: Josef Oškera Tested-by: Josef Oškera Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/b15ddef7-0d50-4320-18f4-6a3f86fbfd3e@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 0a20dae32184..f704da3f214c 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -2285,14 +2285,14 @@ static void r8168dp_hw_jumbo_disable(struct rtl8169_private *tp) static void r8168e_hw_jumbo_enable(struct rtl8169_private *tp) { - RTL_W8(tp, MaxTxPacketSize, 0x3f); + RTL_W8(tp, MaxTxPacketSize, 0x24); RTL_W8(tp, Config3, RTL_R8(tp, Config3) | Jumbo_En0); RTL_W8(tp, Config4, RTL_R8(tp, Config4) | 0x01); } static void r8168e_hw_jumbo_disable(struct rtl8169_private *tp) { - RTL_W8(tp, MaxTxPacketSize, 0x0c); + RTL_W8(tp, MaxTxPacketSize, 0x3f); RTL_W8(tp, Config3, RTL_R8(tp, Config3) & ~Jumbo_En0); RTL_W8(tp, Config4, RTL_R8(tp, Config4) & ~0x01); }