linux/include/net/ip6_tunnel.h
Peilin Ye 31c417c948 ip_gre, ip6_gre: Fix race condition on o_seqno in collect_md mode
As pointed out by Jakub Kicinski, currently using TUNNEL_SEQ in
collect_md mode is racy for [IP6]GRE[TAP] devices.  Consider the
following sequence of events:

1. An [IP6]GRE[TAP] device is created in collect_md mode using "ip link
   add ... external".  "ip" ignores "[o]seq" if "external" is specified,
   so TUNNEL_SEQ is off, and the device is marked as NETIF_F_LLTX (i.e.
   it uses lockless TX);
2. Someone sets TUNNEL_SEQ on outgoing skb's, using e.g.
   bpf_skb_set_tunnel_key() in an eBPF program attached to this device;
3. gre_fb_xmit() or __gre6_xmit() processes these skb's:

	gre_build_header(skb, tun_hlen,
			 flags, protocol,
			 tunnel_id_to_key32(tun_info->key.tun_id),
			 (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++)
					      : 0);   ^^^^^^^^^^^^^^^^^

Since we are not using the TX lock (&txq->_xmit_lock), multiple CPUs may
try to do this tunnel->o_seqno++ in parallel, which is racy.  Fix it by
making o_seqno atomic_t.

As mentioned by Eric Dumazet in commit b790e01aee ("ip_gre: lockless
xmit"), making o_seqno atomic_t increases "chance for packets being out
of order at receiver" when NETIF_F_LLTX is on.

Maybe a better fix would be:

1. Do not ignore "oseq" in external mode.  Users MUST specify "oseq" if
   they want the kernel to allow sequencing of outgoing packets;
2. Reject all outgoing TUNNEL_SEQ packets if the device was not created
   with "oseq".

Unfortunately, that would break userspace.

We could now make [IP6]GRE[TAP] devices always NETIF_F_LLTX, but let us
do it in separate patches to keep this fix minimal.

Suggested-by: Jakub Kicinski <kuba@kernel.org>
Fixes: 77a5196a80 ("gre: add sequence number for collect md mode.")
Signed-off-by: Peilin Ye <peilin.ye@bytedance.com>
Acked-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-04-25 11:40:45 +01:00

171 lines
5.0 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NET_IP6_TUNNEL_H
#define _NET_IP6_TUNNEL_H
#include <linux/ipv6.h>
#include <linux/netdevice.h>
#include <linux/if_tunnel.h>
#include <linux/ip6_tunnel.h>
#include <net/ip_tunnels.h>
#include <net/dst_cache.h>
#define IP6TUNNEL_ERR_TIMEO (30*HZ)
/* capable of sending packets */
#define IP6_TNL_F_CAP_XMIT 0x10000
/* capable of receiving packets */
#define IP6_TNL_F_CAP_RCV 0x20000
/* determine capability on a per-packet basis */
#define IP6_TNL_F_CAP_PER_PACKET 0x40000
struct __ip6_tnl_parm {
char name[IFNAMSIZ]; /* name of tunnel device */
int link; /* ifindex of underlying L2 interface */
__u8 proto; /* tunnel protocol */
__u8 encap_limit; /* encapsulation limit for tunnel */
__u8 hop_limit; /* hop limit for tunnel */
bool collect_md;
__be32 flowinfo; /* traffic class and flowlabel for tunnel */
__u32 flags; /* tunnel flags */
struct in6_addr laddr; /* local tunnel end-point address */
struct in6_addr raddr; /* remote tunnel end-point address */
__be16 i_flags;
__be16 o_flags;
__be32 i_key;
__be32 o_key;
__u32 fwmark;
__u32 index; /* ERSPAN type II index */
__u8 erspan_ver; /* ERSPAN version */
__u8 dir; /* direction */
__u16 hwid; /* hwid */
};
/* IPv6 tunnel */
struct ip6_tnl {
struct ip6_tnl __rcu *next; /* next tunnel in list */
struct net_device *dev; /* virtual device associated with tunnel */
netdevice_tracker dev_tracker;
struct net *net; /* netns for packet i/o */
struct __ip6_tnl_parm parms; /* tunnel configuration parameters */
struct flowi fl; /* flowi template for xmit */
struct dst_cache dst_cache; /* cached dst */
struct gro_cells gro_cells;
int err_count;
unsigned long err_time;
/* These fields used only by GRE */
__u32 i_seqno; /* The last seen seqno */
atomic_t o_seqno; /* The last output seqno */
int hlen; /* tun_hlen + encap_hlen */
int tun_hlen; /* Precalculated header length */
int encap_hlen; /* Encap header length (FOU,GUE) */
struct ip_tunnel_encap encap;
int mlink;
};
struct ip6_tnl_encap_ops {
size_t (*encap_hlen)(struct ip_tunnel_encap *e);
int (*build_header)(struct sk_buff *skb, struct ip_tunnel_encap *e,
u8 *protocol, struct flowi6 *fl6);
int (*err_handler)(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info);
};
#ifdef CONFIG_INET
extern const struct ip6_tnl_encap_ops __rcu *
ip6tun_encaps[MAX_IPTUN_ENCAP_OPS];
int ip6_tnl_encap_add_ops(const struct ip6_tnl_encap_ops *ops,
unsigned int num);
int ip6_tnl_encap_del_ops(const struct ip6_tnl_encap_ops *ops,
unsigned int num);
int ip6_tnl_encap_setup(struct ip6_tnl *t,
struct ip_tunnel_encap *ipencap);
static inline int ip6_encap_hlen(struct ip_tunnel_encap *e)
{
const struct ip6_tnl_encap_ops *ops;
int hlen = -EINVAL;
if (e->type == TUNNEL_ENCAP_NONE)
return 0;
if (e->type >= MAX_IPTUN_ENCAP_OPS)
return -EINVAL;
rcu_read_lock();
ops = rcu_dereference(ip6tun_encaps[e->type]);
if (likely(ops && ops->encap_hlen))
hlen = ops->encap_hlen(e);
rcu_read_unlock();
return hlen;
}
static inline int ip6_tnl_encap(struct sk_buff *skb, struct ip6_tnl *t,
u8 *protocol, struct flowi6 *fl6)
{
const struct ip6_tnl_encap_ops *ops;
int ret = -EINVAL;
if (t->encap.type == TUNNEL_ENCAP_NONE)
return 0;
if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
return -EINVAL;
rcu_read_lock();
ops = rcu_dereference(ip6tun_encaps[t->encap.type]);
if (likely(ops && ops->build_header))
ret = ops->build_header(skb, &t->encap, protocol, fl6);
rcu_read_unlock();
return ret;
}
/* Tunnel encapsulation limit destination sub-option */
struct ipv6_tlv_tnl_enc_lim {
__u8 type; /* type-code for option */
__u8 length; /* option length */
__u8 encap_limit; /* tunnel encapsulation limit */
} __packed;
int ip6_tnl_rcv_ctl(struct ip6_tnl *t, const struct in6_addr *laddr,
const struct in6_addr *raddr);
int ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb,
const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
bool log_ecn_error);
int ip6_tnl_xmit_ctl(struct ip6_tnl *t, const struct in6_addr *laddr,
const struct in6_addr *raddr);
int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
struct flowi6 *fl6, int encap_limit, __u32 *pmtu, __u8 proto);
__u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw);
__u32 ip6_tnl_get_cap(struct ip6_tnl *t, const struct in6_addr *laddr,
const struct in6_addr *raddr);
struct net *ip6_tnl_get_link_net(const struct net_device *dev);
int ip6_tnl_get_iflink(const struct net_device *dev);
int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu);
static inline void ip6tunnel_xmit(struct sock *sk, struct sk_buff *skb,
struct net_device *dev)
{
int pkt_len, err;
memset(skb->cb, 0, sizeof(struct inet6_skb_parm));
pkt_len = skb->len - skb_inner_network_offset(skb);
err = ip6_local_out(dev_net(skb_dst(skb)->dev), sk, skb);
if (dev) {
if (unlikely(net_xmit_eval(err)))
pkt_len = -1;
iptunnel_xmit_stats(dev, pkt_len);
}
}
#endif
#endif