Merge branch 'rco_correctness'

Tom Herbert says:

====================
net: Fixes to remote checksum offload and CHECKSUM_PARTIAL

This patch set fixes a correctness problem with remote checksum
offload, clarifies the meaning of CHECKSUM_PARTIAL, and allows
remote checksum offload to set CHECKSUM_PARTIAL instead of
calling csum_partial and modifying the checksum.

Specifically:
  - In the GRO remote checksum path, restore the checksum after
    calling lower layer GRO functions. This is needed if the
    packet is forwarded off host with the Remote Checksum Offload
    option still present.
  - Clarify meaning of CHECKSUM PARTIAL in the receive path. Only
    the checksums referred to by checksum partial and any preceding
    checksums can be considered verified.
  - Fixes to UDP tunnel GRO complete. Need to set SKB_GSO_UDP_TUNNEL_*,
    SKB_GSO_TUNNEL_REMCSUM, and skb->encapsulation for forwarding
    case.
  - Infrastructure to allow setting of CHECKSUM_PARTIAL in remote
    checksum offload. This a potential performance benefit instead
    of calling csum_partial (potentially twice, once in GRO path
    and once in normal path). The downside of using CHECKSUM_PARTIAL
    and not actually writing the checksum is that we aren't verifying
    that the sender correctly wrote the pseudo checksum into the
    checksum field, or that the start/offset values actually point
    to a checksum. If the sender did not set up these fields correctly,
    a packet might be accepted locally, but not accepted by a peer
    when the packet is forwarded off host. Verifying these fields
    seems non-trivial, and because the fields can only be incorrect
    due to sender error and not corruption (outer checksum protects
    against that) we'll make use of CHECKSUM_PARTIAL the default. This
    behavior can be reverted as an netlink option on the encapsulation
    socket.
  - Change VXLAN and GUE to set CHECKSUM_PARTIAL in remote checksum
    offload by default, configuration hooks can revert to using
    csum_partial.

Testing:

I ran performance numbers using netperf TCP_STREAM and TCP_RR with 200
streams for GRE/GUE and for VXLAN. This compares before the fixes,
the fixes with not setting checksum partial in remote checksum offload,
and with the fixes setting checksum partial. The overall effect seems
be that using checksum partial is a slight performance win, perf
definitely shows a significant reduction of time in csum_partial on
the receive CPUs.

GRE/GUE
    TCP_STREAM
      Before fixes
        9.22% TX CPU utilization
        13.57% RX CPU utilization
        9133 Mbps
      Not using checksum partial
        9.59% TX CPU utilization
        14.95% RX CPU utilization
        9132 Mbps
      Using checksum partial
        9.37% TX CPU utilization
        13.89% RX CPU utilization
        9132 Mbps
    TCP_RR
      Before fixes
        CPU utilization
        159/251/447 90/95/99% latencies
        1.1462e+06 tps
      Not using checksum partial
        92.94% CPU utilization
        158/253/445 90/95/99% latencies
        1.12988e+06 tps
      Using checksum partial
        92.78% CPU utilization
        158/250/450 90/95/99% latencies
        1.15343e+06 tps

VXLAN
    TCP_STREAM
      Before fixes
        9.24% TX CPU utilization
        13.74% RX CPU utilization
        9093 Mbps
      Not using checksum partial
        9.95% TX CPU utilization
        14.66% RX CPU utilization
        9094 Mbps
      Using checksum partial
        10.24% TX CPU utilization
        13.32% RX CPU utilization
        9093 Mbps
    TCP_RR
      Before fixes
        92.91% CPU utilization
        151/241/437 90/95/99% latencies
        1.15939e+06 tps
      Not using checksum partial
        93.07% CPU utilization
        156/246/425 90/95/99% latencies
        1.1451e+06 tps
      Using checksum partial
        95.51% CPU utilization
        156/249/459 90/95/99% latencies
        1.17004e+06 tps
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller 2015-02-11 15:12:19 -08:00
commit 777b3e930a
11 changed files with 160 additions and 45 deletions

View File

@ -555,12 +555,13 @@ static int vxlan_fdb_append(struct vxlan_fdb *f,
static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb,
unsigned int off,
struct vxlanhdr *vh, size_t hdrlen,
u32 data)
u32 data, struct gro_remcsum *grc,
bool nopartial)
{
size_t start, offset, plen;
if (skb->remcsum_offload)
return vh;
return NULL;
if (!NAPI_GRO_CB(skb)->csum_valid)
return NULL;
@ -579,7 +580,8 @@ static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb,
return NULL;
}
skb_gro_remcsum_process(skb, (void *)vh + hdrlen, start, offset);
skb_gro_remcsum_process(skb, (void *)vh + hdrlen,
start, offset, grc, nopartial);
skb->remcsum_offload = 1;
@ -597,6 +599,9 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head,
struct vxlan_sock *vs = container_of(uoff, struct vxlan_sock,
udp_offloads);
u32 flags;
struct gro_remcsum grc;
skb_gro_remcsum_init(&grc);
off_vx = skb_gro_offset(skb);
hlen = off_vx + sizeof(*vh);
@ -614,7 +619,9 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head,
if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) {
vh = vxlan_gro_remcsum(skb, off_vx, vh, sizeof(struct vxlanhdr),
ntohl(vh->vx_vni));
ntohl(vh->vx_vni), &grc,
!!(vs->flags &
VXLAN_F_REMCSUM_NOPARTIAL));
if (!vh)
goto out;
@ -637,6 +644,7 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head,
pp = eth_gro_receive(head, skb);
out:
skb_gro_remcsum_cleanup(skb, &grc);
NAPI_GRO_CB(skb)->flush |= flush;
return pp;
@ -1150,16 +1158,10 @@ static void vxlan_igmp_leave(struct work_struct *work)
}
static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh,
size_t hdrlen, u32 data)
size_t hdrlen, u32 data, bool nopartial)
{
size_t start, offset, plen;
if (skb->remcsum_offload) {
/* Already processed in GRO path */
skb->remcsum_offload = 0;
return vh;
}
start = (data & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT;
offset = start + ((data & VXLAN_RCO_UDP) ?
offsetof(struct udphdr, check) :
@ -1172,7 +1174,8 @@ static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh,
vh = (struct vxlanhdr *)(udp_hdr(skb) + 1);
skb_remcsum_process(skb, (void *)vh + hdrlen, start, offset);
skb_remcsum_process(skb, (void *)vh + hdrlen, start, offset,
nopartial);
return vh;
}
@ -1209,7 +1212,8 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
goto drop;
if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) {
vxh = vxlan_remcsum(skb, vxh, sizeof(struct vxlanhdr), vni);
vxh = vxlan_remcsum(skb, vxh, sizeof(struct vxlanhdr), vni,
!!(vs->flags & VXLAN_F_REMCSUM_NOPARTIAL));
if (!vxh)
goto drop;
@ -2438,6 +2442,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
[IFLA_VXLAN_REMCSUM_TX] = { .type = NLA_U8 },
[IFLA_VXLAN_REMCSUM_RX] = { .type = NLA_U8 },
[IFLA_VXLAN_GBP] = { .type = NLA_FLAG, },
[IFLA_VXLAN_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG },
};
static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
@ -2761,6 +2766,9 @@ static int vxlan_newlink(struct net *src_net, struct net_device *dev,
if (data[IFLA_VXLAN_GBP])
vxlan->flags |= VXLAN_F_GBP;
if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL])
vxlan->flags |= VXLAN_F_REMCSUM_NOPARTIAL;
if (vxlan_find_vni(src_net, vni, use_ipv6 ? AF_INET6 : AF_INET,
vxlan->dst_port, vxlan->flags)) {
pr_info("duplicate VNI %u\n", vni);
@ -2910,6 +2918,10 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
nla_put_flag(skb, IFLA_VXLAN_GBP))
goto nla_put_failure;
if (vxlan->flags & VXLAN_F_REMCSUM_NOPARTIAL &&
nla_put_flag(skb, IFLA_VXLAN_REMCSUM_NOPARTIAL))
goto nla_put_failure;
return 0;
nla_put_failure:

View File

@ -1923,13 +1923,8 @@ struct napi_gro_cb {
/* Number of segments aggregated. */
u16 count;
/* This is non-zero if the packet may be of the same flow. */
u8 same_flow;
/* Free the skb? */
u8 free;
#define NAPI_GRO_FREE 1
#define NAPI_GRO_FREE_STOLEN_HEAD 2
/* Start offset for remote checksum offload */
u16 gro_remcsum_start;
/* jiffies when first packet was created/queued */
unsigned long age;
@ -1937,6 +1932,9 @@ struct napi_gro_cb {
/* Used in ipv6_gro_receive() and foo-over-udp */
u16 proto;
/* This is non-zero if the packet may be of the same flow. */
u8 same_flow:1;
/* Used in udp_gro_receive */
u8 udp_mark:1;
@ -1946,9 +1944,16 @@ struct napi_gro_cb {
/* Number of checksums via CHECKSUM_UNNECESSARY */
u8 csum_cnt:3;
/* Free the skb? */
u8 free:2;
#define NAPI_GRO_FREE 1
#define NAPI_GRO_FREE_STOLEN_HEAD 2
/* Used in foo-over-udp, set in udp[46]_gro_receive */
u8 is_ipv6:1;
/* 7 bit hole */
/* used to support CHECKSUM_COMPLETE for tunneling protocols */
__wsum csum;
@ -2242,11 +2247,20 @@ static inline void skb_gro_postpull_rcsum(struct sk_buff *skb,
__sum16 __skb_gro_checksum_complete(struct sk_buff *skb);
static inline bool skb_at_gro_remcsum_start(struct sk_buff *skb)
{
return (NAPI_GRO_CB(skb)->gro_remcsum_start - skb_headroom(skb) ==
skb_gro_offset(skb));
}
static inline bool __skb_gro_checksum_validate_needed(struct sk_buff *skb,
bool zero_okay,
__sum16 check)
{
return (skb->ip_summed != CHECKSUM_PARTIAL &&
return ((skb->ip_summed != CHECKSUM_PARTIAL ||
skb_checksum_start_offset(skb) <
skb_gro_offset(skb)) &&
!skb_at_gro_remcsum_start(skb) &&
NAPI_GRO_CB(skb)->csum_cnt == 0 &&
(!zero_okay || check));
}
@ -2321,20 +2335,48 @@ do { \
compute_pseudo(skb, proto)); \
} while (0)
struct gro_remcsum {
int offset;
__wsum delta;
};
static inline void skb_gro_remcsum_init(struct gro_remcsum *grc)
{
grc->delta = 0;
}
static inline void skb_gro_remcsum_process(struct sk_buff *skb, void *ptr,
int start, int offset)
int start, int offset,
struct gro_remcsum *grc,
bool nopartial)
{
__wsum delta;
BUG_ON(!NAPI_GRO_CB(skb)->csum_valid);
if (!nopartial) {
NAPI_GRO_CB(skb)->gro_remcsum_start =
((unsigned char *)ptr + start) - skb->head;
return;
}
delta = remcsum_adjust(ptr, NAPI_GRO_CB(skb)->csum, start, offset);
/* Adjust skb->csum since we changed the packet */
skb->csum = csum_add(skb->csum, delta);
NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta);
grc->offset = (ptr + offset) - (void *)skb->head;
grc->delta = delta;
}
static inline void skb_gro_remcsum_cleanup(struct sk_buff *skb,
struct gro_remcsum *grc)
{
if (!grc->delta)
return;
remcsum_unadjust((__sum16 *)(skb->head + grc->offset), grc->delta);
}
static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev,
unsigned short type,

View File

@ -83,11 +83,15 @@
*
* CHECKSUM_PARTIAL:
*
* This is identical to the case for output below. This may occur on a packet
* A checksum is set up to be offloaded to a device as described in the
* output description for CHECKSUM_PARTIAL. This may occur on a packet
* received directly from another Linux OS, e.g., a virtualized Linux kernel
* on the same host. The packet can be treated in the same way as
* CHECKSUM_UNNECESSARY, except that on output (i.e., forwarding) the
* checksum must be filled in by the OS or the hardware.
* on the same host, or it may be set in the input path in GRO or remote
* checksum offload. For the purposes of checksum verification, the checksum
* referred to by skb->csum_start + skb->csum_offset and any preceding
* checksums in the packet are considered verified. Any checksums in the
* packet that are after the checksum being offloaded are not considered to
* be verified.
*
* B. Checksumming on output.
*
@ -2915,7 +2919,10 @@ __sum16 __skb_checksum_complete(struct sk_buff *skb);
static inline int skb_csum_unnecessary(const struct sk_buff *skb)
{
return ((skb->ip_summed & CHECKSUM_UNNECESSARY) || skb->csum_valid);
return ((skb->ip_summed == CHECKSUM_UNNECESSARY) ||
skb->csum_valid ||
(skb->ip_summed == CHECKSUM_PARTIAL &&
skb_checksum_start_offset(skb) >= 0));
}
/**
@ -3097,16 +3104,29 @@ do { \
compute_pseudo(skb, proto)); \
} while (0)
static inline void skb_remcsum_adjust_partial(struct sk_buff *skb, void *ptr,
u16 start, u16 offset)
{
skb->ip_summed = CHECKSUM_PARTIAL;
skb->csum_start = ((unsigned char *)ptr + start) - skb->head;
skb->csum_offset = offset - start;
}
/* Update skbuf and packet to reflect the remote checksum offload operation.
* When called, ptr indicates the starting point for skb->csum when
* ip_summed is CHECKSUM_COMPLETE. If we need create checksum complete
* here, skb_postpull_rcsum is done so skb->csum start is ptr.
*/
static inline void skb_remcsum_process(struct sk_buff *skb, void *ptr,
int start, int offset)
int start, int offset, bool nopartial)
{
__wsum delta;
if (!nopartial) {
skb_remcsum_adjust_partial(skb, ptr, start, offset);
return;
}
if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) {
__skb_checksum_complete(skb);
skb_postpull_rcsum(skb, skb->data, ptr - (void *)skb->data);

View File

@ -167,4 +167,9 @@ static inline __wsum remcsum_adjust(void *ptr, __wsum csum,
return delta;
}
static inline void remcsum_unadjust(__sum16 *psum, __wsum delta)
{
*psum = csum_fold(csum_sub(delta, *psum));
}
#endif

View File

@ -128,13 +128,15 @@ struct vxlan_sock {
#define VXLAN_F_REMCSUM_TX 0x200
#define VXLAN_F_REMCSUM_RX 0x400
#define VXLAN_F_GBP 0x800
#define VXLAN_F_REMCSUM_NOPARTIAL 0x1000
/* Flags that are used in the receive patch. These flags must match in
* order for a socket to be shareable
*/
#define VXLAN_F_RCV_FLAGS (VXLAN_F_GBP | \
VXLAN_F_UDP_ZERO_CSUM6_RX | \
VXLAN_F_REMCSUM_RX)
VXLAN_F_REMCSUM_RX | \
VXLAN_F_REMCSUM_NOPARTIAL)
struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port,
vxlan_rcv_t *rcv, void *data,

View File

@ -14,6 +14,7 @@ enum {
FOU_ATTR_AF, /* u8 */
FOU_ATTR_IPPROTO, /* u8 */
FOU_ATTR_TYPE, /* u8 */
FOU_ATTR_REMCSUM_NOPARTIAL, /* flag */
__FOU_ATTR_MAX,
};

View File

@ -374,6 +374,7 @@ enum {
IFLA_VXLAN_REMCSUM_TX,
IFLA_VXLAN_REMCSUM_RX,
IFLA_VXLAN_GBP,
IFLA_VXLAN_REMCSUM_NOPARTIAL,
__IFLA_VXLAN_MAX
};
#define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1)

View File

@ -4024,6 +4024,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
NAPI_GRO_CB(skb)->flush = 0;
NAPI_GRO_CB(skb)->free = 0;
NAPI_GRO_CB(skb)->udp_mark = 0;
NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
/* Setup for GRO checksum validation */
switch (skb->ip_summed) {

View File

@ -22,14 +22,18 @@ static LIST_HEAD(fou_list);
struct fou {
struct socket *sock;
u8 protocol;
u8 flags;
u16 port;
struct udp_offload udp_offloads;
struct list_head list;
};
#define FOU_F_REMCSUM_NOPARTIAL BIT(0)
struct fou_cfg {
u16 type;
u8 protocol;
u8 flags;
struct udp_port_cfg udp_config;
};
@ -64,24 +68,20 @@ static int fou_udp_recv(struct sock *sk, struct sk_buff *skb)
}
static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr,
void *data, size_t hdrlen, u8 ipproto)
void *data, size_t hdrlen, u8 ipproto,
bool nopartial)
{
__be16 *pd = data;
size_t start = ntohs(pd[0]);
size_t offset = ntohs(pd[1]);
size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start);
if (skb->remcsum_offload) {
/* Already processed in GRO path */
skb->remcsum_offload = 0;
return guehdr;
}
if (!pskb_may_pull(skb, plen))
return NULL;
guehdr = (struct guehdr *)&udp_hdr(skb)[1];
skb_remcsum_process(skb, (void *)guehdr + hdrlen, start, offset);
skb_remcsum_process(skb, (void *)guehdr + hdrlen,
start, offset, nopartial);
return guehdr;
}
@ -142,7 +142,9 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb)
if (flags & GUE_PFLAG_REMCSUM) {
guehdr = gue_remcsum(skb, guehdr, data + doffset,
hdrlen, guehdr->proto_ctype);
hdrlen, guehdr->proto_ctype,
!!(fou->flags &
FOU_F_REMCSUM_NOPARTIAL));
if (!guehdr)
goto drop;
@ -214,7 +216,8 @@ out_unlock:
static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off,
struct guehdr *guehdr, void *data,
size_t hdrlen, u8 ipproto)
size_t hdrlen, u8 ipproto,
struct gro_remcsum *grc, bool nopartial)
{
__be16 *pd = data;
size_t start = ntohs(pd[0]);
@ -222,7 +225,7 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off,
size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start);
if (skb->remcsum_offload)
return guehdr;
return NULL;
if (!NAPI_GRO_CB(skb)->csum_valid)
return NULL;
@ -234,7 +237,8 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off,
return NULL;
}
skb_gro_remcsum_process(skb, (void *)guehdr + hdrlen, start, offset);
skb_gro_remcsum_process(skb, (void *)guehdr + hdrlen,
start, offset, grc, nopartial);
skb->remcsum_offload = 1;
@ -254,6 +258,10 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head,
void *data;
u16 doffset = 0;
int flush = 1;
struct fou *fou = container_of(uoff, struct fou, udp_offloads);
struct gro_remcsum grc;
skb_gro_remcsum_init(&grc);
off = skb_gro_offset(skb);
len = off + sizeof(*guehdr);
@ -295,7 +303,9 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head,
if (flags & GUE_PFLAG_REMCSUM) {
guehdr = gue_gro_remcsum(skb, off, guehdr,
data + doffset, hdrlen,
guehdr->proto_ctype);
guehdr->proto_ctype, &grc,
!!(fou->flags &
FOU_F_REMCSUM_NOPARTIAL));
if (!guehdr)
goto out;
@ -345,6 +355,7 @@ out_unlock:
rcu_read_unlock();
out:
NAPI_GRO_CB(skb)->flush |= flush;
skb_gro_remcsum_cleanup(skb, &grc);
return pp;
}
@ -455,6 +466,7 @@ static int fou_create(struct net *net, struct fou_cfg *cfg,
sk = sock->sk;
fou->flags = cfg->flags;
fou->port = cfg->udp_config.local_udp_port;
/* Initial for fou type */
@ -541,6 +553,7 @@ static struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = {
[FOU_ATTR_AF] = { .type = NLA_U8, },
[FOU_ATTR_IPPROTO] = { .type = NLA_U8, },
[FOU_ATTR_TYPE] = { .type = NLA_U8, },
[FOU_ATTR_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG, },
};
static int parse_nl_config(struct genl_info *info,
@ -571,6 +584,9 @@ static int parse_nl_config(struct genl_info *info,
if (info->attrs[FOU_ATTR_TYPE])
cfg->type = nla_get_u8(info->attrs[FOU_ATTR_TYPE]);
if (info->attrs[FOU_ATTR_REMCSUM_NOPARTIAL])
cfg->flags |= FOU_F_REMCSUM_NOPARTIAL;
return 0;
}

View File

@ -402,6 +402,13 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff)
}
rcu_read_unlock();
if (skb->remcsum_offload)
skb_shinfo(skb)->gso_type |= SKB_GSO_TUNNEL_REMCSUM;
skb->encapsulation = 1;
skb_set_inner_mac_header(skb, nhoff + sizeof(struct udphdr));
return err;
}
@ -410,9 +417,13 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff)
const struct iphdr *iph = ip_hdr(skb);
struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
if (uh->check)
if (uh->check) {
skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
uh->check = ~udp_v4_check(skb->len - nhoff, iph->saddr,
iph->daddr, 0);
} else {
skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
}
return udp_gro_complete(skb, nhoff);
}

View File

@ -161,9 +161,13 @@ static int udp6_gro_complete(struct sk_buff *skb, int nhoff)
const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
if (uh->check)
if (uh->check) {
skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
uh->check = ~udp_v6_check(skb->len - nhoff, &ipv6h->saddr,
&ipv6h->daddr, 0);
} else {
skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
}
return udp_gro_complete(skb, nhoff);
}