Merge tag 'net-6.10-rc8' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net

Pull networking fixes from Paolo Abeni:
 "Including fixes from bpf and netfilter.

  Current release - regressions:

   - core: fix rc7's __skb_datagram_iter() regression

  Current release - new code bugs:

   - eth: bnxt: fix crashes when reducing ring count with active RSS
     contexts

  Previous releases - regressions:

   - sched: fix UAF when resolving a clash

   - skmsg: skip zero length skb in sk_msg_recvmsg2

   - sunrpc: fix kernel free on connection failure in
     xs_tcp_setup_socket

   - tcp: avoid too many retransmit packets

   - tcp: fix incorrect undo caused by DSACK of TLP retransmit

   - udp: Set SOCK_RCU_FREE earlier in udp_lib_get_port().

   - eth: ks8851: fix deadlock with the SPI chip variant

   - eth: i40e: fix XDP program unloading while removing the driver

  Previous releases - always broken:

   - bpf:
       - fix too early release of tcx_entry
       - fail bpf_timer_cancel when callback is being cancelled
       - bpf: fix order of args in call to bpf_map_kvcalloc

   - netfilter: nf_tables: prefer nft_chain_validate

   - ppp: reject claimed-as-LCP but actually malformed packets

   - wireguard: avoid unaligned 64-bit memory accesses"

* tag 'net-6.10-rc8' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net: (33 commits)
  net, sunrpc: Remap EPERM in case of connection failure in xs_tcp_setup_socket
  net/sched: Fix UAF when resolving a clash
  net: ks8851: Fix potential TX stall after interface reopen
  udp: Set SOCK_RCU_FREE earlier in udp_lib_get_port().
  netfilter: nf_tables: prefer nft_chain_validate
  netfilter: nfnetlink_queue: drop bogus WARN_ON
  ethtool: netlink: do not return SQI value if link is down
  ppp: reject claimed-as-LCP but actually malformed packets
  selftests/bpf: Add timer lockup selftest
  net: ethernet: mtk-star-emac: set mac_managed_pm when probing
  e1000e: fix force smbus during suspend flow
  tcp: avoid too many retransmit packets
  bpf: Defer work in bpf_timer_cancel_and_free
  bpf: Fail bpf_timer_cancel when callback is being cancelled
  bpf: fix order of args in call to bpf_map_kvcalloc
  net: ethernet: lantiq_etop: fix double free in detach
  i40e: Fix XDP program unloading while removing the driver
  net: fix rc7's __skb_datagram_iter()
  net: ks8851: Fix deadlock with the SPI chip variant
  octeontx2-af: Fix incorrect value output on error path in rvu_check_rsrc_availability()
  ...
Commit 51df8e0cba by Linus Torvalds, 2024-07-11 09:29:49 -07:00
37 changed files with 561 additions and 255 deletions


@ -49,7 +49,7 @@ example usage
$ devlink region show [ DEV/REGION ]
$ devlink region del DEV/REGION snapshot SNAPSHOT_ID
$ devlink region dump DEV/REGION [ snapshot SNAPSHOT_ID ]
$ devlink region read DEV/REGION [ snapshot SNAPSHOT_ID ] address ADDRESS length length
$ devlink region read DEV/REGION [ snapshot SNAPSHOT_ID ] address ADDRESS length LENGTH
# Show all of the exposed regions with region sizes:
$ devlink region show


@ -1047,31 +1047,31 @@ static int lan9303_get_sset_count(struct dsa_switch *ds, int port, int sset)
return ARRAY_SIZE(lan9303_mib);
}
static int lan9303_phy_read(struct dsa_switch *ds, int phy, int regnum)
static int lan9303_phy_read(struct dsa_switch *ds, int port, int regnum)
{
struct lan9303 *chip = ds->priv;
int phy_base = chip->phy_addr_base;
if (phy == phy_base)
if (port == 0)
return lan9303_virt_phy_reg_read(chip, regnum);
if (phy > phy_base + 2)
if (port > 2)
return -ENODEV;
return chip->ops->phy_read(chip, phy, regnum);
return chip->ops->phy_read(chip, phy_base + port, regnum);
}
static int lan9303_phy_write(struct dsa_switch *ds, int phy, int regnum,
static int lan9303_phy_write(struct dsa_switch *ds, int port, int regnum,
u16 val)
{
struct lan9303 *chip = ds->priv;
int phy_base = chip->phy_addr_base;
if (phy == phy_base)
if (port == 0)
return lan9303_virt_phy_reg_write(chip, regnum, val);
if (phy > phy_base + 2)
if (port > 2)
return -ENODEV;
return chip->ops->phy_write(chip, phy, regnum, val);
return chip->ops->phy_write(chip, phy_base + port, regnum, val);
}
static int lan9303_port_enable(struct dsa_switch *ds, int port,
@ -1099,7 +1099,7 @@ static void lan9303_port_disable(struct dsa_switch *ds, int port)
vlan_vid_del(dsa_port_to_conduit(dp), htons(ETH_P_8021Q), port);
lan9303_disable_processing_port(chip, port);
lan9303_phy_write(ds, chip->phy_addr_base + port, MII_BMCR, BMCR_PDOWN);
lan9303_phy_write(ds, port, MII_BMCR, BMCR_PDOWN);
}
static int lan9303_port_bridge_join(struct dsa_switch *ds, int port,
@ -1374,8 +1374,6 @@ static const struct dsa_switch_ops lan9303_switch_ops = {
static int lan9303_register_switch(struct lan9303 *chip)
{
int base;
chip->ds = devm_kzalloc(chip->dev, sizeof(*chip->ds), GFP_KERNEL);
if (!chip->ds)
return -ENOMEM;
@ -1385,8 +1383,7 @@ static int lan9303_register_switch(struct lan9303 *chip)
chip->ds->priv = chip;
chip->ds->ops = &lan9303_switch_ops;
chip->ds->phylink_mac_ops = &lan9303_phylink_mac_ops;
base = chip->phy_addr_base;
chip->ds->phys_mii_mask = GENMASK(LAN9303_NUM_PORTS - 1 + base, base);
chip->ds->phys_mii_mask = GENMASK(LAN9303_NUM_PORTS - 1, 0);
return dsa_register_switch(chip->ds);
}


@ -1380,6 +1380,7 @@ static int bcmasp_probe(struct platform_device *pdev)
dev_err(dev, "Cannot create eth interface %d\n", i);
bcmasp_remove_intfs(priv);
of_node_put(intf_node);
ret = -ENOMEM;
goto of_put_exit;
}
list_add_tail(&intf->list, &priv->intfs);


@ -6146,6 +6146,21 @@ static u16 bnxt_get_max_rss_ring(struct bnxt *bp)
return max_ring;
}
u16 bnxt_get_max_rss_ctx_ring(struct bnxt *bp)
{
u16 i, tbl_size, max_ring = 0;
struct bnxt_rss_ctx *rss_ctx;
tbl_size = bnxt_get_rxfh_indir_size(bp->dev);
list_for_each_entry(rss_ctx, &bp->rss_ctx_list, list) {
for (i = 0; i < tbl_size; i++)
max_ring = max(max_ring, rss_ctx->rss_indir_tbl[i]);
}
return max_ring;
}
int bnxt_get_nr_rss_ctxs(struct bnxt *bp, int rx_rings)
{
if (bp->flags & BNXT_FLAG_CHIP_P5_PLUS) {


@ -2776,6 +2776,7 @@ int bnxt_hwrm_vnic_set_tpa(struct bnxt *bp, struct bnxt_vnic_info *vnic,
void bnxt_fill_ipv6_mask(__be32 mask[4]);
int bnxt_alloc_rss_indir_tbl(struct bnxt *bp, struct bnxt_rss_ctx *rss_ctx);
void bnxt_set_dflt_rss_indir_tbl(struct bnxt *bp, struct bnxt_rss_ctx *rss_ctx);
u16 bnxt_get_max_rss_ctx_ring(struct bnxt *bp);
int bnxt_get_nr_rss_ctxs(struct bnxt *bp, int rx_rings);
int bnxt_hwrm_vnic_cfg(struct bnxt *bp, struct bnxt_vnic_info *vnic);
int bnxt_hwrm_vnic_alloc(struct bnxt *bp, struct bnxt_vnic_info *vnic,


@ -961,6 +961,12 @@ static int bnxt_set_channels(struct net_device *dev,
return rc;
}
if (req_rx_rings < bp->rx_nr_rings &&
req_rx_rings <= bnxt_get_max_rss_ctx_ring(bp)) {
netdev_warn(dev, "Can't deactivate rings used by RSS contexts\n");
return -EINVAL;
}
if (bnxt_get_nr_rss_ctxs(bp, req_rx_rings) !=
bnxt_get_nr_rss_ctxs(bp, bp->rx_nr_rings) &&
netif_is_rxfh_configured(dev)) {


@ -1108,6 +1108,46 @@ static s32 e1000_platform_pm_pch_lpt(struct e1000_hw *hw, bool link)
return 0;
}
/**
* e1000e_force_smbus - Force interfaces to transition to SMBUS mode.
* @hw: pointer to the HW structure
*
* Force the MAC and the PHY to SMBUS mode. Assumes semaphore already
* acquired.
*
* Return: 0 on success, negative errno on failure.
**/
static s32 e1000e_force_smbus(struct e1000_hw *hw)
{
u16 smb_ctrl = 0;
u32 ctrl_ext;
s32 ret_val;
/* Switching PHY interface always returns MDI error
* so disable retry mechanism to avoid wasting time
*/
e1000e_disable_phy_retry(hw);
/* Force SMBus mode in the PHY */
ret_val = e1000_read_phy_reg_hv_locked(hw, CV_SMB_CTRL, &smb_ctrl);
if (ret_val) {
e1000e_enable_phy_retry(hw);
return ret_val;
}
smb_ctrl |= CV_SMB_CTRL_FORCE_SMBUS;
e1000_write_phy_reg_hv_locked(hw, CV_SMB_CTRL, smb_ctrl);
e1000e_enable_phy_retry(hw);
/* Force SMBus mode in the MAC */
ctrl_ext = er32(CTRL_EXT);
ctrl_ext |= E1000_CTRL_EXT_FORCE_SMBUS;
ew32(CTRL_EXT, ctrl_ext);
return 0;
}
/**
* e1000_enable_ulp_lpt_lp - configure Ultra Low Power mode for LynxPoint-LP
* @hw: pointer to the HW structure
@ -1165,6 +1205,14 @@ s32 e1000_enable_ulp_lpt_lp(struct e1000_hw *hw, bool to_sx)
if (ret_val)
goto out;
if (hw->mac.type != e1000_pch_mtp) {
ret_val = e1000e_force_smbus(hw);
if (ret_val) {
e_dbg("Failed to force SMBUS: %d\n", ret_val);
goto release;
}
}
/* Si workaround for ULP entry flow on i127/rev6 h/w. Enable
* LPLU and disable Gig speed when entering ULP
*/
@ -1225,27 +1273,12 @@ s32 e1000_enable_ulp_lpt_lp(struct e1000_hw *hw, bool to_sx)
}
release:
/* Switching PHY interface always returns MDI error
* so disable retry mechanism to avoid wasting time
*/
e1000e_disable_phy_retry(hw);
/* Force SMBus mode in PHY */
ret_val = e1000_read_phy_reg_hv_locked(hw, CV_SMB_CTRL, &phy_reg);
if (ret_val) {
e1000e_enable_phy_retry(hw);
hw->phy.ops.release(hw);
goto out;
if (hw->mac.type == e1000_pch_mtp) {
ret_val = e1000e_force_smbus(hw);
if (ret_val)
e_dbg("Failed to force SMBUS over MTL system: %d\n",
ret_val);
}
phy_reg |= CV_SMB_CTRL_FORCE_SMBUS;
e1000_write_phy_reg_hv_locked(hw, CV_SMB_CTRL, phy_reg);
e1000e_enable_phy_retry(hw);
/* Force SMBus mode in MAC */
mac_reg = er32(CTRL_EXT);
mac_reg |= E1000_CTRL_EXT_FORCE_SMBUS;
ew32(CTRL_EXT, mac_reg);
hw->phy.ops.release(hw);
out:


@ -13293,6 +13293,10 @@ static int i40e_xdp_setup(struct i40e_vsi *vsi, struct bpf_prog *prog,
bool need_reset;
int i;
/* VSI shall be deleted in a moment, block loading new programs */
if (prog && test_bit(__I40E_IN_REMOVE, pf->state))
return -EINVAL;
/* Don't allow frames that span over multiple buffers */
if (vsi->netdev->mtu > frame_size - I40E_PACKET_HDR_PAD) {
NL_SET_ERR_MSG_MOD(extack, "MTU too large for linear frames and XDP prog does not support frags");
@ -13301,14 +13305,9 @@ static int i40e_xdp_setup(struct i40e_vsi *vsi, struct bpf_prog *prog,
/* When turning XDP on->off/off->on we reset and rebuild the rings. */
need_reset = (i40e_enabled_xdp_vsi(vsi) != !!prog);
if (need_reset)
i40e_prep_for_reset(pf);
/* VSI shall be deleted in a moment, just return EINVAL */
if (test_bit(__I40E_IN_REMOVE, pf->state))
return -EINVAL;
old_prog = xchg(&vsi->xdp_prog, prog);
if (need_reset) {


@ -217,9 +217,9 @@ ltq_etop_free_channel(struct net_device *dev, struct ltq_etop_chan *ch)
if (ch->dma.irq)
free_irq(ch->dma.irq, priv);
if (IS_RX(ch->idx)) {
int desc;
struct ltq_dma_channel *dma = &ch->dma;
for (desc = 0; desc < LTQ_DESC_NUM; desc++)
for (dma->desc = 0; dma->desc < LTQ_DESC_NUM; dma->desc++)
dev_kfree_skb_any(ch->skb[ch->dma.desc]);
}
}


@ -1643,7 +1643,7 @@ static int rvu_check_rsrc_availability(struct rvu *rvu,
if (req->ssow > block->lf.max) {
dev_err(&rvu->pdev->dev,
"Func 0x%x: Invalid SSOW req, %d > max %d\n",
pcifunc, req->sso, block->lf.max);
pcifunc, req->ssow, block->lf.max);
return -EINVAL;
}
mappedlfs = rvu_get_rsrc_mapcount(pfvf, block->addr);


@ -1524,6 +1524,7 @@ static int mtk_star_probe(struct platform_device *pdev)
{
struct device_node *of_node;
struct mtk_star_priv *priv;
struct phy_device *phydev;
struct net_device *ndev;
struct device *dev;
void __iomem *base;
@ -1649,6 +1650,12 @@ static int mtk_star_probe(struct platform_device *pdev)
netif_napi_add(ndev, &priv->rx_napi, mtk_star_rx_poll);
netif_napi_add_tx(ndev, &priv->tx_napi, mtk_star_tx_poll);
phydev = of_phy_find_device(priv->phy_node);
if (phydev) {
phydev->mac_managed_pm = true;
put_device(&phydev->mdio.dev);
}
return devm_register_netdev(dev, ndev);
}


@ -352,11 +352,11 @@ static irqreturn_t ks8851_irq(int irq, void *_ks)
netif_dbg(ks, intr, ks->netdev,
"%s: txspace %d\n", __func__, tx_space);
spin_lock(&ks->statelock);
spin_lock_bh(&ks->statelock);
ks->tx_space = tx_space;
if (netif_queue_stopped(ks->netdev))
netif_wake_queue(ks->netdev);
spin_unlock(&ks->statelock);
spin_unlock_bh(&ks->statelock);
}
if (status & IRQ_SPIBEI) {
@ -482,6 +482,7 @@ static int ks8851_net_open(struct net_device *dev)
ks8851_wrreg16(ks, KS_IER, ks->rc_ier);
ks->queued_len = 0;
ks->tx_space = ks8851_rdreg16(ks, KS_TXMIR);
netif_start_queue(ks->netdev);
netif_dbg(ks, ifup, ks->netdev, "network device up\n");
@ -635,14 +636,14 @@ static void ks8851_set_rx_mode(struct net_device *dev)
/* schedule work to do the actual set of the data if needed */
spin_lock(&ks->statelock);
spin_lock_bh(&ks->statelock);
if (memcmp(&rxctrl, &ks->rxctrl, sizeof(rxctrl)) != 0) {
memcpy(&ks->rxctrl, &rxctrl, sizeof(ks->rxctrl));
schedule_work(&ks->rxctrl_work);
}
spin_unlock(&ks->statelock);
spin_unlock_bh(&ks->statelock);
}
static int ks8851_set_mac_address(struct net_device *dev, void *addr)
@ -1101,7 +1102,6 @@ int ks8851_probe_common(struct net_device *netdev, struct device *dev,
int ret;
ks->netdev = netdev;
ks->tx_space = 6144;
ks->gpio = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_HIGH);
ret = PTR_ERR_OR_ZERO(ks->gpio);


@ -340,10 +340,10 @@ static void ks8851_tx_work(struct work_struct *work)
tx_space = ks8851_rdreg16_spi(ks, KS_TXMIR);
spin_lock(&ks->statelock);
spin_lock_bh(&ks->statelock);
ks->queued_len -= dequeued_len;
ks->tx_space = tx_space;
spin_unlock(&ks->statelock);
spin_unlock_bh(&ks->statelock);
ks8851_unlock_spi(ks, &flags);
}


@ -748,7 +748,7 @@ static int lan87xx_cable_test_report(struct phy_device *phydev)
ethnl_cable_test_result(phydev, ETHTOOL_A_CABLE_PAIR_A,
lan87xx_cable_test_report_trans(detect));
return 0;
return phy_init_hw(phydev);
}
static int lan87xx_cable_test_get_status(struct phy_device *phydev,


@ -70,6 +70,7 @@
#define MPHDRLEN_SSN 4 /* ditto with short sequence numbers */
#define PPP_PROTO_LEN 2
#define PPP_LCP_HDRLEN 4
/*
* An instance of /dev/ppp can be associated with either a ppp
@ -493,6 +494,15 @@ static ssize_t ppp_read(struct file *file, char __user *buf,
return ret;
}
static bool ppp_check_packet(struct sk_buff *skb, size_t count)
{
/* LCP packets must include LCP header which 4 bytes long:
* 1-byte code, 1-byte identifier, and 2-byte length.
*/
return get_unaligned_be16(skb->data) != PPP_LCP ||
count >= PPP_PROTO_LEN + PPP_LCP_HDRLEN;
}
static ssize_t ppp_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
@ -515,6 +525,11 @@ static ssize_t ppp_write(struct file *file, const char __user *buf,
kfree_skb(skb);
goto out;
}
ret = -EINVAL;
if (unlikely(!ppp_check_packet(skb, count))) {
kfree_skb(skb);
goto out;
}
switch (pf->kind) {
case INTERFACE:
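
For reference, a minimal user-space sketch of the length check added above: a frame whose protocol field claims LCP (0xc021, per RFC 1661) must be long enough to carry the 4-byte LCP header (code, identifier, 16-bit length) after the 2-byte protocol field. The constants mirror PPP_PROTO_LEN/PPP_LCP_HDRLEN from the patch; the helper name is made up for the example.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define PPP_LCP_PROTO   0xc021
#define PPP_PROTO_LEN   2
#define PPP_LCP_HDRLEN  4

/* Assumes count >= PPP_PROTO_LEN, i.e. the protocol field is present. */
static bool ppp_write_len_ok(const uint8_t *data, size_t count)
{
	uint16_t proto = (uint16_t)((data[0] << 8) | data[1]); /* big-endian */

	return proto != PPP_LCP_PROTO ||
	       count >= PPP_PROTO_LEN + PPP_LCP_HDRLEN;
}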


@ -15,8 +15,8 @@ static void swap_endian(u8 *dst, const u8 *src, u8 bits)
if (bits == 32) {
*(u32 *)dst = be32_to_cpu(*(const __be32 *)src);
} else if (bits == 128) {
((u64 *)dst)[0] = be64_to_cpu(((const __be64 *)src)[0]);
((u64 *)dst)[1] = be64_to_cpu(((const __be64 *)src)[1]);
((u64 *)dst)[0] = get_unaligned_be64(src);
((u64 *)dst)[1] = get_unaligned_be64(src + 8);
}
}
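
A hedged user-space illustration of why the fix above switches to get_unaligned_be64(): a memcpy-based load places no alignment requirement on the source pointer, which is the guarantee that helper provides in the kernel. The function names below are invented for the example, and the byte swap assumes a little-endian host.

#include <stdint.h>
#include <string.h>

/* Load a big-endian 64-bit value from a possibly unaligned pointer. */
static uint64_t load_be64_unaligned(const uint8_t *src)
{
	uint64_t v;

	memcpy(&v, src, sizeof(v));	/* valid for any alignment of src */
	return __builtin_bswap64(v);	/* assumes a little-endian host */
}

/* Shape of the fixed 128-bit branch of swap_endian(). */
static void swap_endian_128(uint8_t *dst, const uint8_t *src)
{
	uint64_t hi = load_be64_unaligned(src);
	uint64_t lo = load_be64_unaligned(src + 8);

	memcpy(dst, &hi, sizeof(hi));
	memcpy(dst + 8, &lo, sizeof(lo));
}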


@ -124,10 +124,10 @@ static inline int wg_cpumask_choose_online(int *stored_cpu, unsigned int id)
*/
static inline int wg_cpumask_next_online(int *last_cpu)
{
int cpu = cpumask_next(*last_cpu, cpu_online_mask);
int cpu = cpumask_next(READ_ONCE(*last_cpu), cpu_online_mask);
if (cpu >= nr_cpu_ids)
cpu = cpumask_first(cpu_online_mask);
*last_cpu = cpu;
WRITE_ONCE(*last_cpu, cpu);
return cpu;
}


@ -222,7 +222,7 @@ void wg_packet_send_keepalive(struct wg_peer *peer)
{
struct sk_buff *skb;
if (skb_queue_empty(&peer->staged_packet_queue)) {
if (skb_queue_empty_lockless(&peer->staged_packet_queue)) {
skb = alloc_skb(DATA_PACKET_HEAD_ROOM + MESSAGE_MINIMUM_LENGTH,
GFP_ATOMIC);
if (unlikely(!skb))


@ -13,7 +13,7 @@ struct mini_Qdisc;
struct tcx_entry {
struct mini_Qdisc __rcu *miniq;
struct bpf_mprog_bundle bundle;
bool miniq_active;
u32 miniq_active;
struct rcu_head rcu;
};
@ -125,11 +125,16 @@ static inline void tcx_skeys_dec(bool ingress)
tcx_dec();
}
static inline void tcx_miniq_set_active(struct bpf_mprog_entry *entry,
const bool active)
static inline void tcx_miniq_inc(struct bpf_mprog_entry *entry)
{
ASSERT_RTNL();
tcx_entry(entry)->miniq_active = active;
tcx_entry(entry)->miniq_active++;
}
static inline void tcx_miniq_dec(struct bpf_mprog_entry *entry)
{
ASSERT_RTNL();
tcx_entry(entry)->miniq_active--;
}
static inline bool tcx_entry_is_active(struct bpf_mprog_entry *entry)


@ -782,8 +782,8 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
nbuckets = max_t(u32, 2, nbuckets);
smap->bucket_log = ilog2(nbuckets);
smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets),
nbuckets, GFP_USER | __GFP_NOWARN);
smap->buckets = bpf_map_kvcalloc(&smap->map, nbuckets,
sizeof(*smap->buckets), GFP_USER | __GFP_NOWARN);
if (!smap->buckets) {
err = -ENOMEM;
goto free_smap;


@ -1084,7 +1084,10 @@ struct bpf_async_cb {
struct bpf_prog *prog;
void __rcu *callback_fn;
void *value;
struct rcu_head rcu;
union {
struct rcu_head rcu;
struct work_struct delete_work;
};
u64 flags;
};
@ -1107,6 +1110,7 @@ struct bpf_async_cb {
struct bpf_hrtimer {
struct bpf_async_cb cb;
struct hrtimer timer;
atomic_t cancelling;
};
struct bpf_work {
@ -1219,6 +1223,21 @@ static void bpf_wq_delete_work(struct work_struct *work)
kfree_rcu(w, cb.rcu);
}
static void bpf_timer_delete_work(struct work_struct *work)
{
struct bpf_hrtimer *t = container_of(work, struct bpf_hrtimer, cb.delete_work);
/* Cancel the timer and wait for callback to complete if it was running.
* If hrtimer_cancel() can be safely called it's safe to call
* kfree_rcu(t) right after for both preallocated and non-preallocated
* maps. The async->cb = NULL was already done and no code path can see
* address 't' anymore. Timer if armed for existing bpf_hrtimer before
* bpf_timer_cancel_and_free will have been cancelled.
*/
hrtimer_cancel(&t->timer);
kfree_rcu(t, cb.rcu);
}
static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
enum bpf_async_type type)
{
@ -1262,6 +1281,8 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
clockid = flags & (MAX_CLOCKS - 1);
t = (struct bpf_hrtimer *)cb;
atomic_set(&t->cancelling, 0);
INIT_WORK(&t->cb.delete_work, bpf_timer_delete_work);
hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
t->timer.function = bpf_timer_cb;
cb->value = (void *)async - map->record->timer_off;
@ -1440,7 +1461,8 @@ static void drop_prog_refcnt(struct bpf_async_cb *async)
BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
{
struct bpf_hrtimer *t;
struct bpf_hrtimer *t, *cur_t;
bool inc = false;
int ret = 0;
if (in_nmi())
@ -1452,14 +1474,41 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
ret = -EINVAL;
goto out;
}
if (this_cpu_read(hrtimer_running) == t) {
cur_t = this_cpu_read(hrtimer_running);
if (cur_t == t) {
/* If bpf callback_fn is trying to bpf_timer_cancel()
* its own timer the hrtimer_cancel() will deadlock
* since it waits for callback_fn to finish
* since it waits for callback_fn to finish.
*/
ret = -EDEADLK;
goto out;
}
/* Only account in-flight cancellations when invoked from a timer
* callback, since we want to avoid waiting only if other _callbacks_
* are waiting on us, to avoid introducing lockups. Non-callback paths
* are ok, since nobody would synchronously wait for their completion.
*/
if (!cur_t)
goto drop;
atomic_inc(&t->cancelling);
/* Need full barrier after relaxed atomic_inc */
smp_mb__after_atomic();
inc = true;
if (atomic_read(&cur_t->cancelling)) {
/* We're cancelling timer t, while some other timer callback is
* attempting to cancel us. In such a case, it might be possible
* that timer t belongs to the other callback, or some other
* callback waiting upon it (creating transitive dependencies
* upon us), and we will enter a deadlock if we continue
* cancelling and waiting for it synchronously, since it might
* do the same. Bail!
*/
ret = -EDEADLK;
goto out;
}
drop:
drop_prog_refcnt(&t->cb);
out:
__bpf_spin_unlock_irqrestore(&timer->lock);
@ -1467,6 +1516,8 @@ out:
* if it was running.
*/
ret = ret ?: hrtimer_cancel(&t->timer);
if (inc)
atomic_dec(&t->cancelling);
rcu_read_unlock();
return ret;
}
@ -1512,25 +1563,39 @@ void bpf_timer_cancel_and_free(void *val)
if (!t)
return;
/* Cancel the timer and wait for callback to complete if it was running.
* If hrtimer_cancel() can be safely called it's safe to call kfree(t)
* right after for both preallocated and non-preallocated maps.
* The async->cb = NULL was already done and no code path can
* see address 't' anymore.
*
* Check that bpf_map_delete/update_elem() wasn't called from timer
* callback_fn. In such case don't call hrtimer_cancel() (since it will
* deadlock) and don't call hrtimer_try_to_cancel() (since it will just
* return -1). Though callback_fn is still running on this cpu it's
/* We check that bpf_map_delete/update_elem() was called from timer
* callback_fn. In such case we don't call hrtimer_cancel() (since it
* will deadlock) and don't call hrtimer_try_to_cancel() (since it will
* just return -1). Though callback_fn is still running on this cpu it's
* safe to do kfree(t) because bpf_timer_cb() read everything it needed
* from 't'. The bpf subprog callback_fn won't be able to access 't',
* since async->cb = NULL was already done. The timer will be
* effectively cancelled because bpf_timer_cb() will return
* HRTIMER_NORESTART.
*
* However, it is possible the timer callback_fn calling us armed the
* timer _before_ calling us, such that failing to cancel it here will
* cause it to possibly use struct hrtimer after freeing bpf_hrtimer.
* Therefore, we _need_ to cancel any outstanding timers before we do
* kfree_rcu, even though no more timers can be armed.
*
* Moreover, we need to schedule work even if timer does not belong to
* the calling callback_fn, as on two different CPUs, we can end up in a
* situation where both sides run in parallel, try to cancel one
* another, and we end up waiting on both sides in hrtimer_cancel
* without making forward progress, since timer1 depends on time2
* callback to finish, and vice versa.
*
* CPU 1 (timer1_cb) CPU 2 (timer2_cb)
* bpf_timer_cancel_and_free(timer2) bpf_timer_cancel_and_free(timer1)
*
* To avoid these issues, punt to workqueue context when we are in a
* timer callback.
*/
if (this_cpu_read(hrtimer_running) != t)
hrtimer_cancel(&t->timer);
kfree_rcu(t, cb.rcu);
if (this_cpu_read(hrtimer_running))
queue_work(system_unbound_wq, &t->cb.delete_work);
else
bpf_timer_delete_work(&t->cb.delete_work);
}
/* This function is called by map_delete/update_elem for individual element and


@ -423,11 +423,12 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset,
if (copy > len)
copy = len;
n = 0;
skb_frag_foreach_page(frag,
skb_frag_off(frag) + offset - start,
copy, p, p_off, p_len, copied) {
vaddr = kmap_local_page(p);
n = INDIRECT_CALL_1(cb, simple_copy_to_iter,
n += INDIRECT_CALL_1(cb, simple_copy_to_iter,
vaddr + p_off, p_len, data, to);
kunmap_local(vaddr);
}


@ -434,7 +434,8 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
page = sg_page(sge);
if (copied + copy > len)
copy = len - copied;
copy = copy_page_to_iter(page, sge->offset, copy, iter);
if (copy)
copy = copy_page_to_iter(page, sge->offset, copy, iter);
if (!copy) {
copied = copied ? copied : -EFAULT;
goto out;


@ -37,6 +37,8 @@ static int linkstate_get_sqi(struct net_device *dev)
mutex_lock(&phydev->lock);
if (!phydev->drv || !phydev->drv->get_sqi)
ret = -EOPNOTSUPP;
else if (!phydev->link)
ret = -ENETDOWN;
else
ret = phydev->drv->get_sqi(phydev);
mutex_unlock(&phydev->lock);
@ -55,6 +57,8 @@ static int linkstate_get_sqi_max(struct net_device *dev)
mutex_lock(&phydev->lock);
if (!phydev->drv || !phydev->drv->get_sqi_max)
ret = -EOPNOTSUPP;
else if (!phydev->link)
ret = -ENETDOWN;
else
ret = phydev->drv->get_sqi_max(phydev);
mutex_unlock(&phydev->lock);
@ -62,6 +66,17 @@ static int linkstate_get_sqi_max(struct net_device *dev)
return ret;
};
static bool linkstate_sqi_critical_error(int sqi)
{
return sqi < 0 && sqi != -EOPNOTSUPP && sqi != -ENETDOWN;
}
static bool linkstate_sqi_valid(struct linkstate_reply_data *data)
{
return data->sqi >= 0 && data->sqi_max >= 0 &&
data->sqi <= data->sqi_max;
}
static int linkstate_get_link_ext_state(struct net_device *dev,
struct linkstate_reply_data *data)
{
@ -93,12 +108,12 @@ static int linkstate_prepare_data(const struct ethnl_req_info *req_base,
data->link = __ethtool_get_link(dev);
ret = linkstate_get_sqi(dev);
if (ret < 0 && ret != -EOPNOTSUPP)
if (linkstate_sqi_critical_error(ret))
goto out;
data->sqi = ret;
ret = linkstate_get_sqi_max(dev);
if (ret < 0 && ret != -EOPNOTSUPP)
if (linkstate_sqi_critical_error(ret))
goto out;
data->sqi_max = ret;
@ -136,11 +151,10 @@ static int linkstate_reply_size(const struct ethnl_req_info *req_base,
len = nla_total_size(sizeof(u8)) /* LINKSTATE_LINK */
+ 0;
if (data->sqi != -EOPNOTSUPP)
len += nla_total_size(sizeof(u32));
if (data->sqi_max != -EOPNOTSUPP)
len += nla_total_size(sizeof(u32));
if (linkstate_sqi_valid(data)) {
len += nla_total_size(sizeof(u32)); /* LINKSTATE_SQI */
len += nla_total_size(sizeof(u32)); /* LINKSTATE_SQI_MAX */
}
if (data->link_ext_state_provided)
len += nla_total_size(sizeof(u8)); /* LINKSTATE_EXT_STATE */
@ -164,13 +178,14 @@ static int linkstate_fill_reply(struct sk_buff *skb,
nla_put_u8(skb, ETHTOOL_A_LINKSTATE_LINK, !!data->link))
return -EMSGSIZE;
if (data->sqi != -EOPNOTSUPP &&
nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI, data->sqi))
return -EMSGSIZE;
if (linkstate_sqi_valid(data)) {
if (nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI, data->sqi))
return -EMSGSIZE;
if (data->sqi_max != -EOPNOTSUPP &&
nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI_MAX, data->sqi_max))
return -EMSGSIZE;
if (nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI_MAX,
data->sqi_max))
return -EMSGSIZE;
}
if (data->link_ext_state_provided) {
if (nla_put_u8(skb, ETHTOOL_A_LINKSTATE_EXT_STATE,


@ -2129,8 +2129,16 @@ void tcp_clear_retrans(struct tcp_sock *tp)
static inline void tcp_init_undo(struct tcp_sock *tp)
{
tp->undo_marker = tp->snd_una;
/* Retransmission still in flight may cause DSACKs later. */
tp->undo_retrans = tp->retrans_out ? : -1;
/* First, account for regular retransmits in flight: */
tp->undo_retrans = tp->retrans_out;
/* Next, account for TLP retransmits in flight: */
if (tp->tlp_high_seq && tp->tlp_retrans)
tp->undo_retrans++;
/* Finally, avoid 0, because undo_retrans==0 means "can undo now": */
if (!tp->undo_retrans)
tp->undo_retrans = -1;
}
static bool tcp_is_rack(const struct sock *sk)
@ -2209,6 +2217,7 @@ void tcp_enter_loss(struct sock *sk)
tcp_set_ca_state(sk, TCP_CA_Loss);
tp->high_seq = tp->snd_nxt;
tp->tlp_high_seq = 0;
tcp_ecn_queue_cwr(tp);
/* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous


@ -483,15 +483,26 @@ static bool tcp_rtx_probe0_timed_out(const struct sock *sk,
const struct sk_buff *skb,
u32 rtx_delta)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout);
const struct tcp_sock *tp = tcp_sk(sk);
const int timeout = TCP_RTO_MAX * 2;
int timeout = TCP_RTO_MAX * 2;
s32 rcv_delta;
if (user_timeout) {
/* If user application specified a TCP_USER_TIMEOUT,
* it does not want win 0 packets to 'reset the timer'
* while retransmits are not making progress.
*/
if (rtx_delta > user_timeout)
return true;
timeout = min_t(u32, timeout, msecs_to_jiffies(user_timeout));
}
/* Note: timer interrupt might have been delayed by at least one jiffy,
* and tp->rcv_tstamp might very well have been written recently.
* rcv_delta can thus be negative.
*/
rcv_delta = inet_csk(sk)->icsk_timeout - tp->rcv_tstamp;
rcv_delta = icsk->icsk_timeout - tp->rcv_tstamp;
if (rcv_delta <= timeout)
return false;
@ -536,8 +547,6 @@ void tcp_retransmit_timer(struct sock *sk)
if (WARN_ON_ONCE(!skb))
return;
tp->tlp_high_seq = 0;
if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
!((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
/* Receiver dastardly shrinks window. Our retransmits
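
For context, the user_timeout value consulted above comes from the TCP_USER_TIMEOUT socket option, expressed in milliseconds. A minimal sketch of how an application sets it (socket setup and error handling omitted); with the fix, a peer that keeps answering zero-window probes can no longer hold the connection open past this bound while retransmissions make no progress.

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* Abort the connection once transmitted data has been unacknowledged for
 * roughly timeout_ms milliseconds.
 */
static int set_tcp_user_timeout(int fd, unsigned int timeout_ms)
{
	return setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT,
			  &timeout_ms, sizeof(timeout_ms));
}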


@ -326,6 +326,8 @@ found:
goto fail_unlock;
}
sock_set_flag(sk, SOCK_RCU_FREE);
sk_add_node_rcu(sk, &hslot->head);
hslot->count++;
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
@ -342,7 +344,7 @@ found:
hslot2->count++;
spin_unlock(&hslot2->lock);
}
sock_set_flag(sk, SOCK_RCU_FREE);
error = 0;
fail_unlock:
spin_unlock_bh(&hslot->lock);


@ -3823,6 +3823,15 @@ static void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *r
nf_tables_rule_destroy(ctx, rule);
}
/** nft_chain_validate - loop detection and hook validation
*
* @ctx: context containing call depth and base chain
* @chain: chain to validate
*
* Walk through the rules of the given chain and chase all jumps/gotos
* and set lookups until either the jump limit is hit or all reachable
* chains have been validated.
*/
int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain)
{
struct nft_expr *expr, *last;
@ -3844,6 +3853,9 @@ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain)
if (!expr->ops->validate)
continue;
/* This may call nft_chain_validate() recursively,
* callers that do so must increment ctx->level.
*/
err = expr->ops->validate(ctx, expr, &data);
if (err < 0)
return err;
@ -10809,150 +10821,6 @@ int nft_chain_validate_hooks(const struct nft_chain *chain,
}
EXPORT_SYMBOL_GPL(nft_chain_validate_hooks);
/*
* Loop detection - walk through the ruleset beginning at the destination chain
* of a new jump until either the source chain is reached (loop) or all
* reachable chains have been traversed.
*
* The loop check is performed whenever a new jump verdict is added to an
* expression or verdict map or a verdict map is bound to a new chain.
*/
static int nf_tables_check_loops(const struct nft_ctx *ctx,
const struct nft_chain *chain);
static int nft_check_loops(const struct nft_ctx *ctx,
const struct nft_set_ext *ext)
{
const struct nft_data *data;
int ret;
data = nft_set_ext_data(ext);
switch (data->verdict.code) {
case NFT_JUMP:
case NFT_GOTO:
ret = nf_tables_check_loops(ctx, data->verdict.chain);
break;
default:
ret = 0;
break;
}
return ret;
}
static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx,
struct nft_set *set,
const struct nft_set_iter *iter,
struct nft_elem_priv *elem_priv)
{
const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
if (!nft_set_elem_active(ext, iter->genmask))
return 0;
if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
*nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END)
return 0;
return nft_check_loops(ctx, ext);
}
static int nft_set_catchall_loops(const struct nft_ctx *ctx,
struct nft_set *set)
{
u8 genmask = nft_genmask_next(ctx->net);
struct nft_set_elem_catchall *catchall;
struct nft_set_ext *ext;
int ret = 0;
list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
ext = nft_set_elem_ext(set, catchall->elem);
if (!nft_set_elem_active(ext, genmask))
continue;
ret = nft_check_loops(ctx, ext);
if (ret < 0)
return ret;
}
return ret;
}
static int nf_tables_check_loops(const struct nft_ctx *ctx,
const struct nft_chain *chain)
{
const struct nft_rule *rule;
const struct nft_expr *expr, *last;
struct nft_set *set;
struct nft_set_binding *binding;
struct nft_set_iter iter;
if (ctx->chain == chain)
return -ELOOP;
if (fatal_signal_pending(current))
return -EINTR;
list_for_each_entry(rule, &chain->rules, list) {
nft_rule_for_each_expr(expr, last, rule) {
struct nft_immediate_expr *priv;
const struct nft_data *data;
int err;
if (strcmp(expr->ops->type->name, "immediate"))
continue;
priv = nft_expr_priv(expr);
if (priv->dreg != NFT_REG_VERDICT)
continue;
data = &priv->data;
switch (data->verdict.code) {
case NFT_JUMP:
case NFT_GOTO:
err = nf_tables_check_loops(ctx,
data->verdict.chain);
if (err < 0)
return err;
break;
default:
break;
}
}
}
list_for_each_entry(set, &ctx->table->sets, list) {
if (!nft_is_active_next(ctx->net, set))
continue;
if (!(set->flags & NFT_SET_MAP) ||
set->dtype != NFT_DATA_VERDICT)
continue;
list_for_each_entry(binding, &set->bindings, list) {
if (!(binding->flags & NFT_SET_MAP) ||
binding->chain != chain)
continue;
iter.genmask = nft_genmask_next(ctx->net);
iter.type = NFT_ITER_UPDATE;
iter.skip = 0;
iter.count = 0;
iter.err = 0;
iter.fn = nf_tables_loop_check_setelem;
set->ops->walk(ctx, set, &iter);
if (!iter.err)
iter.err = nft_set_catchall_loops(ctx, set);
if (iter.err < 0)
return iter.err;
}
}
return 0;
}
/**
* nft_parse_u32_check - fetch u32 attribute and check for maximum value
*
@ -11065,7 +10933,7 @@ static int nft_validate_register_store(const struct nft_ctx *ctx,
if (data != NULL &&
(data->verdict.code == NFT_GOTO ||
data->verdict.code == NFT_JUMP)) {
err = nf_tables_check_loops(ctx, data->verdict.chain);
err = nft_chain_validate(ctx, data->verdict.chain);
if (err < 0)
return err;
}
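
The kerneldoc added above describes nft_chain_validate() as a depth-limited walk that chases every reachable jump/goto target, which is what lets it replace the dedicated nf_tables_check_loops() walker removed in this hunk: a ruleset loop simply exhausts the jump limit. A rough, self-contained C sketch of that shape; the types and the depth limit are illustrative, not the kernel's.

#define MAX_JUMP_DEPTH 16	/* illustrative; the kernel has its own jump stack limit */

struct chain;

struct rule {
	const struct chain *jump_target;	/* NULL if no jump/goto verdict */
	const struct rule *next;
};

struct chain {
	const struct rule *rules;
};

/* Recursively validate every chain reachable from 'c'.  A loop (or an
 * excessively deep jump sequence) hits the depth limit and fails instead
 * of recursing forever.
 */
static int validate_chain(const struct chain *c, int depth)
{
	const struct rule *r;

	if (depth >= MAX_JUMP_DEPTH)
		return -1;
	for (r = c->rules; r; r = r->next) {
		if (r->jump_target &&
		    validate_chain(r->jump_target, depth + 1) < 0)
			return -1;
	}
	return 0;
}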


@ -325,7 +325,7 @@ static void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
hooks = nf_hook_entries_head(net, pf, entry->state.hook);
i = entry->hook_index;
if (WARN_ON_ONCE(!hooks || i >= hooks->num_hook_entries)) {
if (!hooks || i >= hooks->num_hook_entries) {
kfree_skb_reason(skb, SKB_DROP_REASON_NETFILTER_DROP);
nf_queue_entry_free(entry);
return;


@ -1077,6 +1077,14 @@ do_nat:
*/
if (nf_conntrack_confirm(skb) != NF_ACCEPT)
goto drop;
/* The ct may be dropped if a clash has been resolved,
* so it's necessary to retrieve it from skb again to
* prevent UAF.
*/
ct = nf_ct_get(skb, &ctinfo);
if (!ct)
skip_add = true;
}
if (!skip_add)


@ -91,7 +91,7 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt,
entry = tcx_entry_fetch_or_create(dev, true, &created);
if (!entry)
return -ENOMEM;
tcx_miniq_set_active(entry, true);
tcx_miniq_inc(entry);
mini_qdisc_pair_init(&q->miniqp, sch, &tcx_entry(entry)->miniq);
if (created)
tcx_entry_update(dev, entry, true);
@ -121,7 +121,7 @@ static void ingress_destroy(struct Qdisc *sch)
tcf_block_put_ext(q->block, sch, &q->block_info);
if (entry) {
tcx_miniq_set_active(entry, false);
tcx_miniq_dec(entry);
if (!tcx_entry_is_active(entry)) {
tcx_entry_update(dev, NULL, true);
tcx_entry_free(entry);
@ -257,7 +257,7 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt,
entry = tcx_entry_fetch_or_create(dev, true, &created);
if (!entry)
return -ENOMEM;
tcx_miniq_set_active(entry, true);
tcx_miniq_inc(entry);
mini_qdisc_pair_init(&q->miniqp_ingress, sch, &tcx_entry(entry)->miniq);
if (created)
tcx_entry_update(dev, entry, true);
@ -276,7 +276,7 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt,
entry = tcx_entry_fetch_or_create(dev, false, &created);
if (!entry)
return -ENOMEM;
tcx_miniq_set_active(entry, true);
tcx_miniq_inc(entry);
mini_qdisc_pair_init(&q->miniqp_egress, sch, &tcx_entry(entry)->miniq);
if (created)
tcx_entry_update(dev, entry, false);
@ -302,7 +302,7 @@ static void clsact_destroy(struct Qdisc *sch)
tcf_block_put_ext(q->egress_block, sch, &q->egress_block_info);
if (ingress_entry) {
tcx_miniq_set_active(ingress_entry, false);
tcx_miniq_dec(ingress_entry);
if (!tcx_entry_is_active(ingress_entry)) {
tcx_entry_update(dev, NULL, true);
tcx_entry_free(ingress_entry);
@ -310,7 +310,7 @@ static void clsact_destroy(struct Qdisc *sch)
}
if (egress_entry) {
tcx_miniq_set_active(egress_entry, false);
tcx_miniq_dec(egress_entry);
if (!tcx_entry_is_active(egress_entry)) {
tcx_entry_update(dev, NULL, false);
tcx_entry_free(egress_entry);


@ -2441,6 +2441,13 @@ static void xs_tcp_setup_socket(struct work_struct *work)
transport->srcport = 0;
status = -EAGAIN;
break;
case -EPERM:
/* Happens, for instance, if a BPF program is preventing
* the connect. Remap the error so upper layers can better
* deal with it.
*/
status = -ECONNREFUSED;
fallthrough;
case -EINVAL:
/* Happens, for instance, if the user specified a link
* local IPv6 address without a scope-id.


@ -58,9 +58,12 @@ CONFIG_MPLS=y
CONFIG_MPLS_IPTUNNEL=y
CONFIG_MPLS_ROUTING=y
CONFIG_MPTCP=y
CONFIG_NET_ACT_SKBMOD=y
CONFIG_NET_CLS=y
CONFIG_NET_CLS_ACT=y
CONFIG_NET_CLS_BPF=y
CONFIG_NET_CLS_FLOWER=y
CONFIG_NET_CLS_MATCHALL=y
CONFIG_NET_FOU=y
CONFIG_NET_FOU_IP_TUNNELS=y
CONFIG_NET_IPGRE=y


@ -9,6 +9,8 @@
#define ping_cmd "ping -q -c1 -w1 127.0.0.1 > /dev/null"
#include "test_tc_link.skel.h"
#include "netlink_helpers.h"
#include "tc_helpers.h"
void serial_test_tc_links_basic(void)
@ -1787,6 +1789,65 @@ void serial_test_tc_links_ingress(void)
test_tc_links_ingress(BPF_TCX_INGRESS, false, false);
}
struct qdisc_req {
struct nlmsghdr n;
struct tcmsg t;
char buf[1024];
};
static int qdisc_replace(int ifindex, const char *kind, bool block)
{
struct rtnl_handle rth = { .fd = -1 };
struct qdisc_req req;
int err;
err = rtnl_open(&rth, 0);
if (!ASSERT_OK(err, "open_rtnetlink"))
return err;
memset(&req, 0, sizeof(req));
req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg));
req.n.nlmsg_flags = NLM_F_CREATE | NLM_F_REPLACE | NLM_F_REQUEST;
req.n.nlmsg_type = RTM_NEWQDISC;
req.t.tcm_family = AF_UNSPEC;
req.t.tcm_ifindex = ifindex;
req.t.tcm_parent = 0xfffffff1;
addattr_l(&req.n, sizeof(req), TCA_KIND, kind, strlen(kind) + 1);
if (block)
addattr32(&req.n, sizeof(req), TCA_INGRESS_BLOCK, 1);
err = rtnl_talk(&rth, &req.n, NULL);
ASSERT_OK(err, "talk_rtnetlink");
rtnl_close(&rth);
return err;
}
void serial_test_tc_links_dev_chain0(void)
{
int err, ifindex;
ASSERT_OK(system("ip link add dev foo type veth peer name bar"), "add veth");
ifindex = if_nametoindex("foo");
ASSERT_NEQ(ifindex, 0, "non_zero_ifindex");
err = qdisc_replace(ifindex, "ingress", true);
if (!ASSERT_OK(err, "attaching ingress"))
goto cleanup;
ASSERT_OK(system("tc filter add block 1 matchall action skbmod swap mac"), "add block");
err = qdisc_replace(ifindex, "clsact", false);
if (!ASSERT_OK(err, "attaching clsact"))
goto cleanup;
/* Heuristic: kern_sync_rcu() alone does not work; a wait-time of ~5s
* triggered the issue without the fix reliably 100% of the time.
*/
sleep(5);
ASSERT_OK(system("tc filter add dev foo ingress matchall action skbmod swap mac"), "add filter");
cleanup:
ASSERT_OK(system("ip link del dev foo"), "del veth");
ASSERT_EQ(if_nametoindex("foo"), 0, "foo removed");
ASSERT_EQ(if_nametoindex("bar"), 0, "bar removed");
}
static void test_tc_links_dev_mixed(int target)
{
LIBBPF_OPTS(bpf_tc_opts, tc_opts, .handle = 1, .priority = 1);


@ -0,0 +1,91 @@
// SPDX-License-Identifier: GPL-2.0
#define _GNU_SOURCE
#include <sched.h>
#include <test_progs.h>
#include <pthread.h>
#include <network_helpers.h>
#include "timer_lockup.skel.h"
static long cpu;
static int *timer1_err;
static int *timer2_err;
static bool skip;
volatile int k = 0;
static void *timer_lockup_thread(void *arg)
{
LIBBPF_OPTS(bpf_test_run_opts, opts,
.data_in = &pkt_v4,
.data_size_in = sizeof(pkt_v4),
.repeat = 1000,
);
int i, prog_fd = *(int *)arg;
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
CPU_SET(__sync_fetch_and_add(&cpu, 1), &cpuset);
ASSERT_OK(pthread_setaffinity_np(pthread_self(), sizeof(cpuset),
&cpuset),
"cpu affinity");
for (i = 0; !READ_ONCE(*timer1_err) && !READ_ONCE(*timer2_err); i++) {
bpf_prog_test_run_opts(prog_fd, &opts);
/* Skip the test if we can't reproduce the race in a reasonable
* amount of time.
*/
if (i > 50) {
WRITE_ONCE(skip, true);
break;
}
}
return NULL;
}
void test_timer_lockup(void)
{
int timer1_prog, timer2_prog;
struct timer_lockup *skel;
pthread_t thrds[2];
void *ret;
skel = timer_lockup__open_and_load();
if (!ASSERT_OK_PTR(skel, "timer_lockup__open_and_load"))
return;
timer1_prog = bpf_program__fd(skel->progs.timer1_prog);
timer2_prog = bpf_program__fd(skel->progs.timer2_prog);
timer1_err = &skel->bss->timer1_err;
timer2_err = &skel->bss->timer2_err;
if (!ASSERT_OK(pthread_create(&thrds[0], NULL, timer_lockup_thread,
&timer1_prog),
"pthread_create thread1"))
goto out;
if (!ASSERT_OK(pthread_create(&thrds[1], NULL, timer_lockup_thread,
&timer2_prog),
"pthread_create thread2")) {
pthread_exit(&thrds[0]);
goto out;
}
pthread_join(thrds[1], &ret);
pthread_join(thrds[0], &ret);
if (skip) {
test__skip();
goto out;
}
if (*timer1_err != -EDEADLK && *timer1_err != 0)
ASSERT_FAIL("timer1_err bad value");
if (*timer2_err != -EDEADLK && *timer2_err != 0)
ASSERT_FAIL("timer2_err bad value");
out:
timer_lockup__destroy(skel);
return;
}


@ -0,0 +1,87 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <time.h>
#include <errno.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include "bpf_misc.h"
char _license[] SEC("license") = "GPL";
struct elem {
struct bpf_timer t;
};
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(max_entries, 1);
__type(key, int);
__type(value, struct elem);
} timer1_map SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(max_entries, 1);
__type(key, int);
__type(value, struct elem);
} timer2_map SEC(".maps");
int timer1_err;
int timer2_err;
static int timer_cb1(void *map, int *k, struct elem *v)
{
struct bpf_timer *timer;
int key = 0;
timer = bpf_map_lookup_elem(&timer2_map, &key);
if (timer)
timer2_err = bpf_timer_cancel(timer);
return 0;
}
static int timer_cb2(void *map, int *k, struct elem *v)
{
struct bpf_timer *timer;
int key = 0;
timer = bpf_map_lookup_elem(&timer1_map, &key);
if (timer)
timer1_err = bpf_timer_cancel(timer);
return 0;
}
SEC("tc")
int timer1_prog(void *ctx)
{
struct bpf_timer *timer;
int key = 0;
timer = bpf_map_lookup_elem(&timer1_map, &key);
if (timer) {
bpf_timer_init(timer, &timer1_map, CLOCK_BOOTTIME);
bpf_timer_set_callback(timer, timer_cb1);
bpf_timer_start(timer, 1, BPF_F_TIMER_CPU_PIN);
}
return 0;
}
SEC("tc")
int timer2_prog(void *ctx)
{
struct bpf_timer *timer;
int key = 0;
timer = bpf_map_lookup_elem(&timer2_map, &key);
if (timer) {
bpf_timer_init(timer, &timer2_map, CLOCK_BOOTTIME);
bpf_timer_set_callback(timer, timer_cb2);
bpf_timer_start(timer, 1, BPF_F_TIMER_CPU_PIN);
}
return 0;
}


@ -109,9 +109,9 @@ KERNEL_ARCH := x86_64
KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/x86/boot/bzImage
QEMU_VPORT_RESULT := virtio-serial-device
ifeq ($(HOST_ARCH),$(ARCH))
QEMU_MACHINE := -cpu host -machine microvm,accel=kvm,pit=off,pic=off,rtc=off -no-acpi
QEMU_MACHINE := -cpu host -machine microvm,accel=kvm,pit=off,pic=off,rtc=off,acpi=off
else
QEMU_MACHINE := -cpu max -machine microvm -no-acpi
QEMU_MACHINE := -cpu max -machine microvm,acpi=off
endif
else ifeq ($(ARCH),i686)
CHOST := i686-linux-musl
@ -120,9 +120,9 @@ KERNEL_ARCH := x86
KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/x86/boot/bzImage
QEMU_VPORT_RESULT := virtio-serial-device
ifeq ($(subst x86_64,i686,$(HOST_ARCH)),$(ARCH))
QEMU_MACHINE := -cpu host -machine microvm,accel=kvm,pit=off,pic=off,rtc=off -no-acpi
QEMU_MACHINE := -cpu host -machine microvm,accel=kvm,pit=off,pic=off,rtc=off,acpi=off
else
QEMU_MACHINE := -cpu coreduo -machine microvm -no-acpi
QEMU_MACHINE := -cpu coreduo -machine microvm,acpi=off
endif
else ifeq ($(ARCH),mips64)
CHOST := mips64-linux-musl