Files
linux/drivers/net/ethernet/rocker/rocker_ofdpa.c

2823 lines
75 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0-or-later
/*
* drivers/net/ethernet/rocker/rocker_ofdpa.c - Rocker switch OF-DPA-like
* implementation
* Copyright (c) 2014 Scott Feldman <sfeldma@gmail.com>
* Copyright (c) 2014-2016 Jiri Pirko <jiri@mellanox.com>
*/
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/hashtable.h>
#include <linux/crc32.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/if_vlan.h>
#include <linux/if_bridge.h>
#include <net/neighbour.h>
#include <net/switchdev.h>
#include <net/ip_fib.h>
#include <net/nexthop.h>
#include <net/arp.h>
#include "rocker.h"
#include "rocker_tlv.h"
/* Match key of a flow table entry.
 *
 * ofdpa_flow_tbl_add() and ofdpa_flow_tbl_del() run crc32() over the leading
 * bytes of this structure and ofdpa_flow_tbl_find() compares them with
 * memcmp(), so the field order and the raw bytes of the key are significant.
 * The union member that is filled in corresponds to tbl_id; see the switch
 * in ofdpa_cmd_flow_tbl_add().
 */
struct ofdpa_flow_tbl_key {
u32 priority;
enum rocker_of_dpa_table_id tbl_id;
union {
/* tbl_id == ROCKER_OF_DPA_TABLE_ID_INGRESS_PORT */
struct {
u32 in_pport;
u32 in_pport_mask;
enum rocker_of_dpa_table_id goto_tbl;
} ig_port;
/* tbl_id == ROCKER_OF_DPA_TABLE_ID_VLAN; new_vlan_id is only sent to
 * the device when untagged is set.
 */
struct {
u32 in_pport;
__be16 vlan_id;
__be16 vlan_id_mask;
enum rocker_of_dpa_table_id goto_tbl;
bool untagged;
__be16 new_vlan_id;
} vlan;
/* tbl_id == ROCKER_OF_DPA_TABLE_ID_TERMINATION_MAC */
struct {
u32 in_pport;
u32 in_pport_mask;
__be16 eth_type;
u8 eth_dst[ETH_ALEN];
u8 eth_dst_mask[ETH_ALEN];
__be16 vlan_id;
__be16 vlan_id_mask;
enum rocker_of_dpa_table_id goto_tbl;
bool copy_to_cpu;
} term_mac;
/* tbl_id == ROCKER_OF_DPA_TABLE_ID_UNICAST_ROUTING.
 * ofdpa_flow_tbl_ucast4_routing() sets key_len to the offset of
 * group_id, so group_id is not part of the hashed/compared key.
 */
struct {
__be16 eth_type;
__be32 dst4;
__be32 dst4_mask;
enum rocker_of_dpa_table_id goto_tbl;
u32 group_id;
} ucast_routing;
/* tbl_id == ROCKER_OF_DPA_TABLE_ID_BRIDGING.
 * has_eth_dst / has_eth_dst_mask record whether the caller supplied
 * the address / mask and gate the matching TLVs; vlan_id and
 * tunnel_id are only sent when non-zero.
 */
struct {
u8 eth_dst[ETH_ALEN];
u8 eth_dst_mask[ETH_ALEN];
int has_eth_dst;
int has_eth_dst_mask;
__be16 vlan_id;
u32 tunnel_id;
enum rocker_of_dpa_table_id goto_tbl;
u32 group_id;
bool copy_to_cpu;
} bridge;
/* tbl_id == ROCKER_OF_DPA_TABLE_ID_ACL_POLICY.
 * The ip_* fields are only sent for ETH_P_IP / ETH_P_IPV6, and
 * group_id only when it differs from ROCKER_GROUP_NONE.
 */
struct {
u32 in_pport;
u32 in_pport_mask;
u8 eth_src[ETH_ALEN];
u8 eth_src_mask[ETH_ALEN];
u8 eth_dst[ETH_ALEN];
u8 eth_dst_mask[ETH_ALEN];
__be16 eth_type;
__be16 vlan_id;
__be16 vlan_id_mask;
u8 ip_proto;
u8 ip_proto_mask;
u8 ip_tos;
u8 ip_tos_mask;
u32 group_id;
} acl;
};
};
/* One flow table entry, linked into ofdpa->flow_tbl by key_crc32.
 *
 * cmd is set to ROCKER_TLV_CMD_TYPE_OF_DPA_FLOW_{ADD,MOD,DEL} by
 * ofdpa_flow_tbl_add() / ofdpa_flow_tbl_del() before the command is issued.
 * cookie is taken from ofdpa->flow_tbl_next_cookie for a new key, or copied
 * from the entry being replaced.
 * key_len is the number of leading key bytes to hash and compare; 0 means
 * the whole key.
 * fi is only set by ofdpa_flow_tbl_ucast4_routing() in the visible code.
 */
struct ofdpa_flow_tbl_entry {
struct hlist_node entry;
u32 cmd;
u64 cookie;
struct ofdpa_flow_tbl_key key;
size_t key_len;
u32 key_crc32; /* key */
struct fib_info *fi;
};
/* One group table entry, linked into ofdpa->group_tbl by group_id.
 *
 * cmd is set to ROCKER_TLV_CMD_TYPE_OF_DPA_GROUP_{ADD,MOD,DEL} by
 * ofdpa_group_tbl_add() / ofdpa_group_tbl_del().
 * group_count / group_ids are filled by ofdpa_group_l2_fan_out();
 * ofdpa_group_tbl_entry_free() releases group_ids only for the L2_FLOOD and
 * L2_MCAST group types.
 * The union member in use follows ROCKER_GROUP_TYPE_GET(group_id); see
 * ofdpa_cmd_group_tbl_add().
 */
struct ofdpa_group_tbl_entry {
struct hlist_node entry;
u32 cmd;
u32 group_id; /* key */
u16 group_count;
u32 *group_ids;
union {
/* ROCKER_OF_DPA_GROUP_TYPE_L2_INTERFACE */
struct {
u8 pop_vlan;
} l2_interface;
/* ROCKER_OF_DPA_GROUP_TYPE_L2_REWRITE; group_id here is sent as the
 * lower (referenced) group.
 */
struct {
u8 eth_src[ETH_ALEN];
u8 eth_dst[ETH_ALEN];
__be16 vlan_id;
u32 group_id;
} l2_rewrite;
/* ROCKER_OF_DPA_GROUP_TYPE_L3_UCAST; group_id here is sent as the
 * lower (referenced) group.
 */
struct {
u8 eth_src[ETH_ALEN];
u8 eth_dst[ETH_ALEN];
__be16 vlan_id;
bool ttl_check;
u32 group_id;
} l3_unicast;
};
};
/* One FDB table entry (hash node for ofdpa->fdb_tbl).
 * The code using this structure is outside the visible part of the file;
 * NOTE(review): key_crc32 is presumably a crc32 of the embedded key, by
 * analogy with struct ofdpa_flow_tbl_entry - confirm against the users.
 */
struct ofdpa_fdb_tbl_entry {
struct hlist_node entry;
u32 key_crc32; /* key */
bool learned;
unsigned long touched;
/* Lookup key: owning port, MAC address and VLAN. */
struct ofdpa_fdb_tbl_key {
struct ofdpa_port *ofdpa_port;
u8 addr[ETH_ALEN];
__be16 vlan_id;
} key;
};
/* One internal VLAN table entry (hash node for ofdpa->internal_vlan_tbl),
 * keyed by ifindex and reference counted.  The code using it is outside
 * the visible part of the file.
 */
struct ofdpa_internal_vlan_tbl_entry {
struct hlist_node entry;
int ifindex; /* key */
u32 ref_count;
__be16 vlan_id;
};
/* One neighbour table entry, hashed into ofdpa->neigh_tbl by
 * be32_to_cpu(ip_addr).
 *
 * index is assigned from ofdpa->neigh_tbl_next_index in ofdpa_neigh_add().
 * ref_count is raised by ofdpa_neigh_add() and by ofdpa_neigh_update() when
 * no MAC is supplied; ofdpa_neigh_del() frees the entry when it drops to 0.
 */
struct ofdpa_neigh_tbl_entry {
struct hlist_node entry;
__be32 ip_addr; /* key */
struct net_device *dev;
u32 ref_count;
u32 index;
u8 eth_dst[ETH_ALEN];
bool ttl_check;
};
/* Indices into ofdpa_port.ctrls[]; OFDPA_CTRL_MAX is the array size and
 * must stay last.
 */
enum {
OFDPA_CTRL_LINK_LOCAL_MCAST,
OFDPA_CTRL_LOCAL_ARP,
OFDPA_CTRL_IPV4_MCAST,
OFDPA_CTRL_IPV6_MCAST,
OFDPA_CTRL_DFLT_BRIDGING,
OFDPA_CTRL_DFLT_OVS,
OFDPA_CTRL_MAX,
};
/* First VLAN ID of the internal range tested by
 * ofdpa_vlan_id_is_internal().
 */
#define OFDPA_INTERNAL_VLAN_ID_BASE 0x0f00
/* Number of internal VLAN IDs; sizes ofdpa.internal_vlan_bitmap. */
#define OFDPA_N_INTERNAL_VLANS 255
/* Bitmap lengths in longs: one bit per VLAN ID (ofdpa_port.vlan_bitmap)
 * and one bit per internal VLAN (ofdpa.internal_vlan_bitmap).
 */
#define OFDPA_VLAN_BITMAP_LEN BITS_TO_LONGS(VLAN_N_VID)
#define OFDPA_INTERNAL_VLAN_BITMAP_LEN BITS_TO_LONGS(OFDPA_N_INTERNAL_VLANS)
/* NOTE(review): not referenced in the visible part of the file; presumably
 * the VID that stands for untagged traffic - confirm against its users.
 */
#define OFDPA_UNTAGGED_VID 0
/* Switch-wide OF-DPA state; every struct ofdpa_port points back to it
 * through its ofdpa member.  It holds the flow, group, FDB, internal VLAN
 * and neigh hash tables, each guarded by its own spinlock.
 *
 * flow_tbl_next_cookie is the next cookie handed out by
 * ofdpa_flow_tbl_add(); neigh_tbl_next_index is the next index handed out
 * by ofdpa_neigh_add().  internal_vlan_bitmap has room for
 * OFDPA_N_INTERNAL_VLANS bits.
 */
struct ofdpa {
struct rocker *rocker;
DECLARE_HASHTABLE(flow_tbl, 16);
spinlock_t flow_tbl_lock; /* for flow tbl accesses */
u64 flow_tbl_next_cookie;
DECLARE_HASHTABLE(group_tbl, 16);
spinlock_t group_tbl_lock; /* for group tbl accesses */
struct timer_list fdb_cleanup_timer;
DECLARE_HASHTABLE(fdb_tbl, 16);
spinlock_t fdb_tbl_lock; /* for fdb tbl accesses */
unsigned long internal_vlan_bitmap[OFDPA_INTERNAL_VLAN_BITMAP_LEN];
DECLARE_HASHTABLE(internal_vlan_tbl, 8);
spinlock_t internal_vlan_tbl_lock; /* for vlan tbl accesses */
DECLARE_HASHTABLE(neigh_tbl, 16);
spinlock_t neigh_tbl_lock; /* for neigh tbl accesses */
u32 neigh_tbl_next_index;
unsigned long ageing_time;
bool fib_aborted;
};
/* Per-port OF-DPA state.
 *
 * bridge_dev is the master device examined by ofdpa_port_is_slave(); it is
 * NULL-checked there before use.
 * internal_vlan_id is the VLAN substituted for VID 0 by
 * ofdpa_port_vid_to_vlan().
 * ctrls[] is indexed by the OFDPA_CTRL_* enumerators and vlan_bitmap has
 * one bit per possible VLAN ID (VLAN_N_VID).
 */
struct ofdpa_port {
struct ofdpa *ofdpa;
struct rocker_port *rocker_port;
struct net_device *dev;
u32 pport;
struct net_device *bridge_dev;
__be16 internal_vlan_id;
int stp_state;
u32 brport_flags;
unsigned long ageing_time;
bool ctrls[OFDPA_CTRL_MAX];
unsigned long vlan_bitmap[OFDPA_VLAN_BITMAP_LEN];
};
/* Constant MAC addresses and masks.
 * ff_mac is the all-ones mask that ofdpa_flow_tbl_bridge() treats as an
 * exact (non-wildcard) match; mcast_mac is the mask that
 * ofdpa_flow_tbl_acl() maps to the default ACL priority.
 * The remaining values are address/mask pairs (ll_*, ipv4_*, ipv6_*) whose
 * users are outside the visible part of the file.
 */
static const u8 zero_mac[ETH_ALEN] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
static const u8 ff_mac[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
static const u8 ll_mac[ETH_ALEN] = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 };
static const u8 ll_mask[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xf0 };
static const u8 mcast_mac[ETH_ALEN] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00 };
static const u8 ipv4_mcast[ETH_ALEN] = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x00 };
static const u8 ipv4_mask[ETH_ALEN] = { 0xff, 0xff, 0xff, 0x80, 0x00, 0x00 };
static const u8 ipv6_mcast[ETH_ALEN] = { 0x33, 0x33, 0x00, 0x00, 0x00, 0x00 };
static const u8 ipv6_mask[ETH_ALEN] = { 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 };
/* Rocker priority levels for flow table entries. Higher
 * priority match takes precedence over lower priority match.
 *
 * The values are stored in ofdpa_flow_tbl_key.priority by the
 * ofdpa_flow_tbl_*() constructors.  Several enumerators share a value,
 * since each group of names (IG_PORT, VLAN, TERM_MAC, BRIDGING, ACL) is
 * used with a different table.
 */
enum {
OFDPA_PRIORITY_UNKNOWN = 0,
OFDPA_PRIORITY_IG_PORT = 1,
OFDPA_PRIORITY_VLAN = 1,
OFDPA_PRIORITY_TERM_MAC_UCAST = 0,
OFDPA_PRIORITY_TERM_MAC_MCAST = 1,
OFDPA_PRIORITY_BRIDGING_VLAN_DFLT_EXACT = 1,
OFDPA_PRIORITY_BRIDGING_VLAN_DFLT_WILD = 2,
OFDPA_PRIORITY_BRIDGING_VLAN = 3,
OFDPA_PRIORITY_BRIDGING_TENANT_DFLT_EXACT = 1,
OFDPA_PRIORITY_BRIDGING_TENANT_DFLT_WILD = 2,
OFDPA_PRIORITY_BRIDGING_TENANT = 3,
OFDPA_PRIORITY_ACL_CTRL = 3,
OFDPA_PRIORITY_ACL_NORMAL = 2,
OFDPA_PRIORITY_ACL_DFLT = 1,
};
/* Tell whether @vlan_id (network byte order) lies in the internal VLAN
 * range, i.e. from OFDPA_INTERNAL_VLAN_ID_BASE up to and including 0xffe.
 */
static bool ofdpa_vlan_id_is_internal(__be16 vlan_id)
{
	u16 host_vid = ntohs(vlan_id);

	if (host_vid < OFDPA_INTERNAL_VLAN_ID_BASE)
		return false;

	return host_vid <= 0xffe;
}
/* Translate a host-order VID into the VLAN ID used for @ofdpa_port.
 *
 * VID 0 maps to the port's internal VLAN ID; any other VID is returned in
 * network byte order.  When @pop_vlan is non-NULL it is set to true for the
 * VID 0 case and to false otherwise.
 */
static __be16 ofdpa_port_vid_to_vlan(const struct ofdpa_port *ofdpa_port,
				     u16 vid, bool *pop_vlan)
{
	__be16 vlan_id = htons(vid);
	bool use_internal = !vlan_id;

	if (pop_vlan)
		*pop_vlan = use_internal;

	return use_internal ? ofdpa_port->internal_vlan_id : vlan_id;
}
/* Translate a network-order VLAN ID back into a host-order VID.
 * Internal VLAN IDs map to VID 0.  @ofdpa_port is not used.
 */
static u16 ofdpa_port_vlan_to_vid(const struct ofdpa_port *ofdpa_port,
				  __be16 vlan_id)
{
	return ofdpa_vlan_id_is_internal(vlan_id) ? 0 : ntohs(vlan_id);
}
/* Tell whether @ofdpa_port has a master device (bridge_dev) whose
 * rtnl_link_ops kind equals @kind.
 *
 * Returns false when the port has no master, or when the master carries no
 * rtnl_link_ops; the latter is checked so the kind string is never read
 * through a NULL pointer.
 */
static bool ofdpa_port_is_slave(const struct ofdpa_port *ofdpa_port,
				const char *kind)
{
	if (!ofdpa_port->bridge_dev ||
	    !ofdpa_port->bridge_dev->rtnl_link_ops)
		return false;

	return !strcmp(ofdpa_port->bridge_dev->rtnl_link_ops->kind, kind);
}
/* Tell whether @ofdpa_port's master device is of link kind "bridge". */
static bool ofdpa_port_is_bridged(const struct ofdpa_port *ofdpa_port)
{
	bool bridged = ofdpa_port_is_slave(ofdpa_port, "bridge");

	return bridged;
}
/* Tell whether @ofdpa_port's master device is of link kind "openvswitch". */
static bool ofdpa_port_is_ovsed(const struct ofdpa_port *ofdpa_port)
{
	bool ovsed = ofdpa_port_is_slave(ofdpa_port, "openvswitch");

	return ovsed;
}
/* Operation flags passed through the table helpers as "int flags".
 * REMOVE selects the delete path in ofdpa_flow_tbl_do() and
 * ofdpa_group_tbl_do(); NOWAIT is forwarded to rocker_cmd_exec() through
 * ofdpa_flags_nowait().  LEARNED and REFRESH are consumed outside the
 * visible part of the file.
 */
#define OFDPA_OP_FLAG_REMOVE BIT(0)
#define OFDPA_OP_FLAG_NOWAIT BIT(1)
#define OFDPA_OP_FLAG_LEARNED BIT(2)
#define OFDPA_OP_FLAG_REFRESH BIT(3)
/* Tell whether OFDPA_OP_FLAG_NOWAIT is set in @flags. */
static bool ofdpa_flags_nowait(int flags)
{
	return (flags & OFDPA_OP_FLAG_NOWAIT) != 0;
}
/*************************************************************
* Flow, group, FDB, internal VLAN and neigh command prepares
*************************************************************/
/* Append the ingress-port table fields of @entry to @desc_info.
 *
 * Emits IN_PPORT, IN_PPORT_MASK and GOTO_TABLE_ID, in that order.
 * Returns 0, or -EMSGSIZE as soon as a rocker_tlv_put_*() call fails.
 */
static int
ofdpa_cmd_flow_tbl_add_ig_port(struct rocker_desc_info *desc_info,
			       const struct ofdpa_flow_tbl_entry *entry)
{
	/* || short-circuits, so nothing is emitted after the first failure. */
	if (rocker_tlv_put_u32(desc_info, ROCKER_TLV_OF_DPA_IN_PPORT,
			       entry->key.ig_port.in_pport) ||
	    rocker_tlv_put_u32(desc_info, ROCKER_TLV_OF_DPA_IN_PPORT_MASK,
			       entry->key.ig_port.in_pport_mask) ||
	    rocker_tlv_put_u16(desc_info, ROCKER_TLV_OF_DPA_GOTO_TABLE_ID,
			       entry->key.ig_port.goto_tbl))
		return -EMSGSIZE;

	return 0;
}
/* Append the VLAN table fields of @entry to @desc_info.
 *
 * Emits IN_PPORT, VLAN_ID, VLAN_ID_MASK and GOTO_TABLE_ID, followed by
 * NEW_VLAN_ID only when the entry is marked untagged.
 * Returns 0, or -EMSGSIZE as soon as a rocker_tlv_put_*() call fails.
 */
static int
ofdpa_cmd_flow_tbl_add_vlan(struct rocker_desc_info *desc_info,
			    const struct ofdpa_flow_tbl_entry *entry)
{
	/* || short-circuits, so nothing is emitted after the first failure. */
	if (rocker_tlv_put_u32(desc_info, ROCKER_TLV_OF_DPA_IN_PPORT,
			       entry->key.vlan.in_pport) ||
	    rocker_tlv_put_be16(desc_info, ROCKER_TLV_OF_DPA_VLAN_ID,
				entry->key.vlan.vlan_id) ||
	    rocker_tlv_put_be16(desc_info, ROCKER_TLV_OF_DPA_VLAN_ID_MASK,
				entry->key.vlan.vlan_id_mask) ||
	    rocker_tlv_put_u16(desc_info, ROCKER_TLV_OF_DPA_GOTO_TABLE_ID,
			       entry->key.vlan.goto_tbl))
		return -EMSGSIZE;

	if (!entry->key.vlan.untagged)
		return 0;

	if (rocker_tlv_put_be16(desc_info, ROCKER_TLV_OF_DPA_NEW_VLAN_ID,
				entry->key.vlan.new_vlan_id))
		return -EMSGSIZE;

	return 0;
}
/* Append the termination-MAC table fields of @entry to @desc_info.
 *
 * Emits IN_PPORT, IN_PPORT_MASK, ETHERTYPE, DST_MAC, DST_MAC_MASK, VLAN_ID,
 * VLAN_ID_MASK and GOTO_TABLE_ID, followed by COPY_CPU_ACTION only when
 * copy_to_cpu is set.
 * Returns 0, or -EMSGSIZE as soon as a rocker_tlv_put*() call fails.
 */
static int
ofdpa_cmd_flow_tbl_add_term_mac(struct rocker_desc_info *desc_info,
				const struct ofdpa_flow_tbl_entry *entry)
{
	/* || short-circuits, so nothing is emitted after the first failure. */
	if (rocker_tlv_put_u32(desc_info, ROCKER_TLV_OF_DPA_IN_PPORT,
			       entry->key.term_mac.in_pport) ||
	    rocker_tlv_put_u32(desc_info, ROCKER_TLV_OF_DPA_IN_PPORT_MASK,
			       entry->key.term_mac.in_pport_mask) ||
	    rocker_tlv_put_be16(desc_info, ROCKER_TLV_OF_DPA_ETHERTYPE,
				entry->key.term_mac.eth_type) ||
	    rocker_tlv_put(desc_info, ROCKER_TLV_OF_DPA_DST_MAC,
			   ETH_ALEN, entry->key.term_mac.eth_dst) ||
	    rocker_tlv_put(desc_info, ROCKER_TLV_OF_DPA_DST_MAC_MASK,
			   ETH_ALEN, entry->key.term_mac.eth_dst_mask) ||
	    rocker_tlv_put_be16(desc_info, ROCKER_TLV_OF_DPA_VLAN_ID,
				entry->key.term_mac.vlan_id) ||
	    rocker_tlv_put_be16(desc_info, ROCKER_TLV_OF_DPA_VLAN_ID_MASK,
				entry->key.term_mac.vlan_id_mask) ||
	    rocker_tlv_put_u16(desc_info, ROCKER_TLV_OF_DPA_GOTO_TABLE_ID,
			       entry->key.term_mac.goto_tbl))
		return -EMSGSIZE;

	if (!entry->key.term_mac.copy_to_cpu)
		return 0;

	if (rocker_tlv_put_u8(desc_info, ROCKER_TLV_OF_DPA_COPY_CPU_ACTION,
			      entry->key.term_mac.copy_to_cpu))
		return -EMSGSIZE;

	return 0;
}
/* Append the unicast-routing table fields of @entry to @desc_info.
 *
 * Emits ETHERTYPE, DST_IP, DST_IP_MASK, GOTO_TABLE_ID and GROUP_ID, in that
 * order.
 * Returns 0, or -EMSGSIZE as soon as a rocker_tlv_put_*() call fails.
 */
static int
ofdpa_cmd_flow_tbl_add_ucast_routing(struct rocker_desc_info *desc_info,
				     const struct ofdpa_flow_tbl_entry *entry)
{
	/* || short-circuits, so nothing is emitted after the first failure. */
	if (rocker_tlv_put_be16(desc_info, ROCKER_TLV_OF_DPA_ETHERTYPE,
				entry->key.ucast_routing.eth_type) ||
	    rocker_tlv_put_be32(desc_info, ROCKER_TLV_OF_DPA_DST_IP,
				entry->key.ucast_routing.dst4) ||
	    rocker_tlv_put_be32(desc_info, ROCKER_TLV_OF_DPA_DST_IP_MASK,
				entry->key.ucast_routing.dst4_mask) ||
	    rocker_tlv_put_u16(desc_info, ROCKER_TLV_OF_DPA_GOTO_TABLE_ID,
			       entry->key.ucast_routing.goto_tbl) ||
	    rocker_tlv_put_u32(desc_info, ROCKER_TLV_OF_DPA_GROUP_ID,
			       entry->key.ucast_routing.group_id))
		return -EMSGSIZE;

	return 0;
}
/* Append the bridging table fields of @entry to @desc_info.
 *
 * DST_MAC, DST_MAC_MASK, VLAN_ID and TUNNEL_ID are optional: each is
 * emitted only when its has_* flag (for the MAC fields) or the value itself
 * (for the IDs) is non-zero.  GOTO_TABLE_ID and GROUP_ID are always
 * emitted, and COPY_CPU_ACTION only when copy_to_cpu is set.
 * Returns 0, or -EMSGSIZE as soon as a rocker_tlv_put*() call fails.
 */
static int
ofdpa_cmd_flow_tbl_add_bridge(struct rocker_desc_info *desc_info,
			      const struct ofdpa_flow_tbl_entry *entry)
{
	if (entry->key.bridge.has_eth_dst) {
		if (rocker_tlv_put(desc_info, ROCKER_TLV_OF_DPA_DST_MAC,
				   ETH_ALEN, entry->key.bridge.eth_dst))
			return -EMSGSIZE;
	}

	if (entry->key.bridge.has_eth_dst_mask) {
		if (rocker_tlv_put(desc_info, ROCKER_TLV_OF_DPA_DST_MAC_MASK,
				   ETH_ALEN, entry->key.bridge.eth_dst_mask))
			return -EMSGSIZE;
	}

	if (entry->key.bridge.vlan_id) {
		if (rocker_tlv_put_be16(desc_info, ROCKER_TLV_OF_DPA_VLAN_ID,
					entry->key.bridge.vlan_id))
			return -EMSGSIZE;
	}

	if (entry->key.bridge.tunnel_id) {
		if (rocker_tlv_put_u32(desc_info, ROCKER_TLV_OF_DPA_TUNNEL_ID,
				       entry->key.bridge.tunnel_id))
			return -EMSGSIZE;
	}

	if (rocker_tlv_put_u16(desc_info, ROCKER_TLV_OF_DPA_GOTO_TABLE_ID,
			       entry->key.bridge.goto_tbl) ||
	    rocker_tlv_put_u32(desc_info, ROCKER_TLV_OF_DPA_GROUP_ID,
			       entry->key.bridge.group_id))
		return -EMSGSIZE;

	if (entry->key.bridge.copy_to_cpu) {
		if (rocker_tlv_put_u8(desc_info,
				      ROCKER_TLV_OF_DPA_COPY_CPU_ACTION,
				      entry->key.bridge.copy_to_cpu))
			return -EMSGSIZE;
	}

	return 0;
}
static int
ofdpa_cmd_flow_tbl_add_acl(struct rocker_desc_info *desc_info,
const struct ofdpa_flow_tbl_entry *entry)
{
if (rocker_tlv_put_u32(desc_info, ROCKER_TLV_OF_DPA_IN_PPORT,
entry->key.acl.in_pport))
return -EMSGSIZE;
if (rocker_tlv_put_u32(desc_info, ROCKER_TLV_OF_DPA_IN_PPORT_MASK,
entry->key.acl.in_pport_mask))
return -EMSGSIZE;
if (rocker_tlv_put(desc_info, ROCKER_TLV_OF_DPA_SRC_MAC,
ETH_ALEN, entry->key.acl.eth_src))
return -EMSGSIZE;
if (rocker_tlv_put(desc_info, ROCKER_TLV_OF_DPA_SRC_MAC_MASK,
ETH_ALEN, entry->key.acl.eth_src_mask))
return -EMSGSIZE;
if (rocker_tlv_put(desc_info, ROCKER_TLV_OF_DPA_DST_MAC,
ETH_ALEN, entry->key.acl.eth_dst))
return -EMSGSIZE;
if (rocker_tlv_put(desc_info, ROCKER_TLV_OF_DPA_DST_MAC_MASK,
ETH_ALEN, entry->key.acl.eth_dst_mask))
return -EMSGSIZE;
if (rocker_tlv_put_be16(desc_info, ROCKER_TLV_OF_DPA_ETHERTYPE,
entry->key.acl.eth_type))
return -EMSGSIZE;
if (rocker_tlv_put_be16(desc_info, ROCKER_TLV_OF_DPA_VLAN_ID,
entry->key.acl.vlan_id))
return -EMSGSIZE;
if (rocker_tlv_put_be16(desc_info, ROCKER_TLV_OF_DPA_VLAN_ID_MASK,
entry->key.acl.vlan_id_mask))
return -EMSGSIZE;
switch (ntohs(entry->key.acl.eth_type)) {
case ETH_P_IP:
case ETH_P_IPV6:
if (rocker_tlv_put_u8(desc_info, ROCKER_TLV_OF_DPA_IP_PROTO,
entry->key.acl.ip_proto))
return -EMSGSIZE;
if (rocker_tlv_put_u8(desc_info,
ROCKER_TLV_OF_DPA_IP_PROTO_MASK,
entry->key.acl.ip_proto_mask))
return -EMSGSIZE;
if (rocker_tlv_put_u8(desc_info, ROCKER_TLV_OF_DPA_IP_DSCP,
entry->key.acl.ip_tos & 0x3f))
return -EMSGSIZE;
if (rocker_tlv_put_u8(desc_info,
ROCKER_TLV_OF_DPA_IP_DSCP_MASK,
entry->key.acl.ip_tos_mask & 0x3f))
return -EMSGSIZE;
if (rocker_tlv_put_u8(desc_info, ROCKER_TLV_OF_DPA_IP_ECN,
(entry->key.acl.ip_tos & 0xc0) >> 6))
return -EMSGSIZE;
if (rocker_tlv_put_u8(desc_info,
ROCKER_TLV_OF_DPA_IP_ECN_MASK,
(entry->key.acl.ip_tos_mask & 0xc0) >> 6))
return -EMSGSIZE;
break;
}
if (entry->key.acl.group_id != ROCKER_GROUP_NONE &&
rocker_tlv_put_u32(desc_info, ROCKER_TLV_OF_DPA_GROUP_ID,
entry->key.acl.group_id))
return -EMSGSIZE;
return 0;
}
/* Command-prepare callback that serialises a flow add/modify.
 *
 * @priv is the struct ofdpa_flow_tbl_entry to encode; @rocker_port is not
 * used.  The command type is written first, then a CMD_INFO nest holding
 * table id, priority, a hard timeout of 0 and the cookie, followed by the
 * table-specific fields chosen by key.tbl_id.
 * Returns 0 on success, -EMSGSIZE when a TLV does not fit, -ENOTSUPP for a
 * table id without an encoder, or the encoder's own error.  The nest is
 * only closed on success.
 */
static int ofdpa_cmd_flow_tbl_add(const struct rocker_port *rocker_port,
				  struct rocker_desc_info *desc_info,
				  void *priv)
{
	const struct ofdpa_flow_tbl_entry *entry = priv;
	struct rocker_tlv *nest;
	int err;

	if (rocker_tlv_put_u16(desc_info, ROCKER_TLV_CMD_TYPE, entry->cmd))
		return -EMSGSIZE;

	nest = rocker_tlv_nest_start(desc_info, ROCKER_TLV_CMD_INFO);
	if (!nest)
		return -EMSGSIZE;

	if (rocker_tlv_put_u16(desc_info, ROCKER_TLV_OF_DPA_TABLE_ID,
			       entry->key.tbl_id) ||
	    rocker_tlv_put_u32(desc_info, ROCKER_TLV_OF_DPA_PRIORITY,
			       entry->key.priority) ||
	    rocker_tlv_put_u32(desc_info, ROCKER_TLV_OF_DPA_HARDTIME, 0) ||
	    rocker_tlv_put_u64(desc_info, ROCKER_TLV_OF_DPA_COOKIE,
			       entry->cookie))
		return -EMSGSIZE;

	switch (entry->key.tbl_id) {
	case ROCKER_OF_DPA_TABLE_ID_INGRESS_PORT:
		err = ofdpa_cmd_flow_tbl_add_ig_port(desc_info, entry);
		break;
	case ROCKER_OF_DPA_TABLE_ID_VLAN:
		err = ofdpa_cmd_flow_tbl_add_vlan(desc_info, entry);
		break;
	case ROCKER_OF_DPA_TABLE_ID_TERMINATION_MAC:
		err = ofdpa_cmd_flow_tbl_add_term_mac(desc_info, entry);
		break;
	case ROCKER_OF_DPA_TABLE_ID_UNICAST_ROUTING:
		err = ofdpa_cmd_flow_tbl_add_ucast_routing(desc_info, entry);
		break;
	case ROCKER_OF_DPA_TABLE_ID_BRIDGING:
		err = ofdpa_cmd_flow_tbl_add_bridge(desc_info, entry);
		break;
	case ROCKER_OF_DPA_TABLE_ID_ACL_POLICY:
		err = ofdpa_cmd_flow_tbl_add_acl(desc_info, entry);
		break;
	default:
		err = -ENOTSUPP;
		break;
	}

	if (!err)
		rocker_tlv_nest_end(desc_info, nest);

	return err;
}
/* Command-prepare callback that serialises a flow delete.
 *
 * @priv is the struct ofdpa_flow_tbl_entry being removed; only its cmd and
 * cookie are encoded.  @rocker_port is not used.
 * Returns 0, or -EMSGSIZE when a TLV does not fit.
 */
static int ofdpa_cmd_flow_tbl_del(const struct rocker_port *rocker_port,
				  struct rocker_desc_info *desc_info,
				  void *priv)
{
	const struct ofdpa_flow_tbl_entry *entry = priv;
	struct rocker_tlv *nest;

	if (rocker_tlv_put_u16(desc_info, ROCKER_TLV_CMD_TYPE, entry->cmd))
		return -EMSGSIZE;

	nest = rocker_tlv_nest_start(desc_info, ROCKER_TLV_CMD_INFO);
	if (!nest ||
	    rocker_tlv_put_u64(desc_info, ROCKER_TLV_OF_DPA_COOKIE,
			       entry->cookie))
		return -EMSGSIZE;

	rocker_tlv_nest_end(desc_info, nest);

	return 0;
}
static int
ofdpa_cmd_group_tbl_add_l2_interface(struct rocker_desc_info *desc_info,
struct ofdpa_group_tbl_entry *entry)
{
if (rocker_tlv_put_u32(desc_info, ROCKER_TLV_OF_DPA_OUT_PPORT,
ROCKER_GROUP_PORT_GET(entry->group_id)))
return -EMSGSIZE;
if (rocker_tlv_put_u8(desc_info, ROCKER_TLV_OF_DPA_POP_VLAN,
entry->l2_interface.pop_vlan))
return -EMSGSIZE;
return 0;
}
/* Append the L2 rewrite group fields of @entry to @desc_info.
 *
 * GROUP_ID_LOWER is always emitted; SRC_MAC and DST_MAC only when the
 * address is not all-zero, and VLAN_ID only when non-zero.
 * Returns 0, or -EMSGSIZE as soon as a rocker_tlv_put*() call fails.
 */
static int
ofdpa_cmd_group_tbl_add_l2_rewrite(struct rocker_desc_info *desc_info,
				   const struct ofdpa_group_tbl_entry *entry)
{
	if (rocker_tlv_put_u32(desc_info, ROCKER_TLV_OF_DPA_GROUP_ID_LOWER,
			       entry->l2_rewrite.group_id))
		return -EMSGSIZE;

	if (!is_zero_ether_addr(entry->l2_rewrite.eth_src)) {
		if (rocker_tlv_put(desc_info, ROCKER_TLV_OF_DPA_SRC_MAC,
				   ETH_ALEN, entry->l2_rewrite.eth_src))
			return -EMSGSIZE;
	}

	if (!is_zero_ether_addr(entry->l2_rewrite.eth_dst)) {
		if (rocker_tlv_put(desc_info, ROCKER_TLV_OF_DPA_DST_MAC,
				   ETH_ALEN, entry->l2_rewrite.eth_dst))
			return -EMSGSIZE;
	}

	if (entry->l2_rewrite.vlan_id) {
		if (rocker_tlv_put_be16(desc_info, ROCKER_TLV_OF_DPA_VLAN_ID,
					entry->l2_rewrite.vlan_id))
			return -EMSGSIZE;
	}

	return 0;
}
/* Append the member list of a fan-out group to @desc_info.
 *
 * Emits GROUP_COUNT and then a GROUP_IDS nest with one u32 TLV per member.
 * Returns 0, or -EMSGSIZE when a TLV does not fit; the nest is only closed
 * on success.
 */
static int
ofdpa_cmd_group_tbl_add_group_ids(struct rocker_desc_info *desc_info,
				  const struct ofdpa_group_tbl_entry *entry)
{
	struct rocker_tlv *nest;
	int tlv_type;

	if (rocker_tlv_put_u16(desc_info, ROCKER_TLV_OF_DPA_GROUP_COUNT,
			       entry->group_count))
		return -EMSGSIZE;

	nest = rocker_tlv_nest_start(desc_info, ROCKER_TLV_OF_DPA_GROUP_IDS);
	if (!nest)
		return -EMSGSIZE;

	/* Note TLV array is 1-based: type N carries group_ids[N - 1]. */
	for (tlv_type = 1; tlv_type <= entry->group_count; tlv_type++) {
		if (rocker_tlv_put_u32(desc_info, tlv_type,
				       entry->group_ids[tlv_type - 1]))
			return -EMSGSIZE;
	}

	rocker_tlv_nest_end(desc_info, nest);

	return 0;
}
/* Append the L3 unicast group fields of @entry to @desc_info.
 *
 * SRC_MAC and DST_MAC are emitted only when the address is not all-zero and
 * VLAN_ID only when non-zero; TTL_CHECK and GROUP_ID_LOWER always follow.
 * Returns 0, or -EMSGSIZE as soon as a rocker_tlv_put*() call fails.
 */
static int
ofdpa_cmd_group_tbl_add_l3_unicast(struct rocker_desc_info *desc_info,
				   const struct ofdpa_group_tbl_entry *entry)
{
	if (!is_zero_ether_addr(entry->l3_unicast.eth_src)) {
		if (rocker_tlv_put(desc_info, ROCKER_TLV_OF_DPA_SRC_MAC,
				   ETH_ALEN, entry->l3_unicast.eth_src))
			return -EMSGSIZE;
	}

	if (!is_zero_ether_addr(entry->l3_unicast.eth_dst)) {
		if (rocker_tlv_put(desc_info, ROCKER_TLV_OF_DPA_DST_MAC,
				   ETH_ALEN, entry->l3_unicast.eth_dst))
			return -EMSGSIZE;
	}

	if (entry->l3_unicast.vlan_id) {
		if (rocker_tlv_put_be16(desc_info, ROCKER_TLV_OF_DPA_VLAN_ID,
					entry->l3_unicast.vlan_id))
			return -EMSGSIZE;
	}

	if (rocker_tlv_put_u8(desc_info, ROCKER_TLV_OF_DPA_TTL_CHECK,
			      entry->l3_unicast.ttl_check) ||
	    rocker_tlv_put_u32(desc_info, ROCKER_TLV_OF_DPA_GROUP_ID_LOWER,
			       entry->l3_unicast.group_id))
		return -EMSGSIZE;

	return 0;
}
/* Command-prepare callback that serialises a group add/modify.
 *
 * @priv is the struct ofdpa_group_tbl_entry to encode; @rocker_port is not
 * used.  The command type is written first, then a CMD_INFO nest holding
 * the group id and the fields of the encoder selected by the group type.
 * Returns 0 on success, -EMSGSIZE when a TLV does not fit, -ENOTSUPP for a
 * group type without an encoder, or the encoder's own error.  The nest is
 * only closed on success.
 */
static int ofdpa_cmd_group_tbl_add(const struct rocker_port *rocker_port,
				   struct rocker_desc_info *desc_info,
				   void *priv)
{
	struct ofdpa_group_tbl_entry *entry = priv;
	struct rocker_tlv *nest;
	int err;

	if (rocker_tlv_put_u16(desc_info, ROCKER_TLV_CMD_TYPE, entry->cmd))
		return -EMSGSIZE;

	nest = rocker_tlv_nest_start(desc_info, ROCKER_TLV_CMD_INFO);
	if (!nest)
		return -EMSGSIZE;

	if (rocker_tlv_put_u32(desc_info, ROCKER_TLV_OF_DPA_GROUP_ID,
			       entry->group_id))
		return -EMSGSIZE;

	switch (ROCKER_GROUP_TYPE_GET(entry->group_id)) {
	case ROCKER_OF_DPA_GROUP_TYPE_L2_INTERFACE:
		err = ofdpa_cmd_group_tbl_add_l2_interface(desc_info, entry);
		break;
	case ROCKER_OF_DPA_GROUP_TYPE_L2_REWRITE:
		err = ofdpa_cmd_group_tbl_add_l2_rewrite(desc_info, entry);
		break;
	case ROCKER_OF_DPA_GROUP_TYPE_L2_FLOOD:
	case ROCKER_OF_DPA_GROUP_TYPE_L2_MCAST:
		err = ofdpa_cmd_group_tbl_add_group_ids(desc_info, entry);
		break;
	case ROCKER_OF_DPA_GROUP_TYPE_L3_UCAST:
		err = ofdpa_cmd_group_tbl_add_l3_unicast(desc_info, entry);
		break;
	default:
		err = -ENOTSUPP;
		break;
	}

	if (!err)
		rocker_tlv_nest_end(desc_info, nest);

	return err;
}
/* Command-prepare callback that serialises a group delete.
 *
 * @priv is the struct ofdpa_group_tbl_entry being removed; only its cmd and
 * group_id are encoded.  @rocker_port is not used.
 * Returns 0, or -EMSGSIZE when a TLV does not fit.
 */
static int ofdpa_cmd_group_tbl_del(const struct rocker_port *rocker_port,
				   struct rocker_desc_info *desc_info,
				   void *priv)
{
	const struct ofdpa_group_tbl_entry *entry = priv;
	struct rocker_tlv *nest;

	if (rocker_tlv_put_u16(desc_info, ROCKER_TLV_CMD_TYPE, entry->cmd))
		return -EMSGSIZE;

	nest = rocker_tlv_nest_start(desc_info, ROCKER_TLV_CMD_INFO);
	if (!nest ||
	    rocker_tlv_put_u32(desc_info, ROCKER_TLV_OF_DPA_GROUP_ID,
			       entry->group_id))
		return -EMSGSIZE;

	rocker_tlv_nest_end(desc_info, nest);

	return 0;
}
/***************************************************
* Flow, group, FDB, internal VLAN and neigh tables
***************************************************/
/* Look up the flow table entry whose key equals @match's key.
 *
 * Only the bucket selected by @match->key_crc32 is searched, and the first
 * match->key_len bytes of the keys are compared (the whole key when key_len
 * is 0).  Takes no lock itself; the callers in this file hold
 * ofdpa->flow_tbl_lock around it.
 * Returns the entry, or NULL when none matches.
 */
static struct ofdpa_flow_tbl_entry *
ofdpa_flow_tbl_find(const struct ofdpa *ofdpa,
		    const struct ofdpa_flow_tbl_entry *match)
{
	struct ofdpa_flow_tbl_entry *cur;
	size_t cmp_len = match->key_len;

	if (!cmp_len)
		cmp_len = sizeof(cur->key);

	hash_for_each_possible(ofdpa->flow_tbl, cur,
			       entry, match->key_crc32) {
		if (!memcmp(&cur->key, &match->key, cmp_len))
			return cur;
	}

	return NULL;
}
/* Insert @match into the flow table and push it to the device.
 *
 * The key checksum is computed first.  Under flow_tbl_lock an existing
 * entry with the same key is unlinked and freed, with @match inheriting its
 * cookie and going out as FLOW_MOD; otherwise @match gets a fresh cookie
 * and goes out as FLOW_ADD.  Either way @match ends up linked in the table.
 * Returns the result of rocker_cmd_exec().
 */
static int ofdpa_flow_tbl_add(struct ofdpa_port *ofdpa_port,
			      int flags, struct ofdpa_flow_tbl_entry *match)
{
	struct ofdpa *ofdpa = ofdpa_port->ofdpa;
	struct ofdpa_flow_tbl_entry *old;
	unsigned long irq_flags;
	size_t key_len = match->key_len;

	if (!key_len)
		key_len = sizeof(match->key);
	match->key_crc32 = crc32(~0, &match->key, key_len);

	spin_lock_irqsave(&ofdpa->flow_tbl_lock, irq_flags);

	old = ofdpa_flow_tbl_find(ofdpa, match);
	if (old) {
		/* Replace in place: keep the cookie the device knows. */
		match->cookie = old->cookie;
		hash_del(&old->entry);
		kfree(old);
		match->cmd = ROCKER_TLV_CMD_TYPE_OF_DPA_FLOW_MOD;
	} else {
		match->cookie = ofdpa->flow_tbl_next_cookie++;
		match->cmd = ROCKER_TLV_CMD_TYPE_OF_DPA_FLOW_ADD;
	}
	hash_add(ofdpa->flow_tbl, &match->entry, match->key_crc32);

	spin_unlock_irqrestore(&ofdpa->flow_tbl_lock, irq_flags);

	return rocker_cmd_exec(ofdpa_port->rocker_port,
			       ofdpa_flags_nowait(flags),
			       ofdpa_cmd_flow_tbl_add,
			       match, NULL, NULL);
}
/* Remove the flow table entry matching @match and tell the device.
 *
 * @match only serves as a lookup key and is always freed.  When an entry
 * with the same key exists it is unlinked under flow_tbl_lock, sent to the
 * device as FLOW_DEL and then freed.
 * Returns 0 when nothing matched, otherwise the result of
 * rocker_cmd_exec().
 */
static int ofdpa_flow_tbl_del(struct ofdpa_port *ofdpa_port,
			      int flags, struct ofdpa_flow_tbl_entry *match)
{
	struct ofdpa *ofdpa = ofdpa_port->ofdpa;
	struct ofdpa_flow_tbl_entry *victim;
	unsigned long irq_flags;
	size_t key_len = match->key_len;
	int err;

	if (!key_len)
		key_len = sizeof(match->key);
	match->key_crc32 = crc32(~0, &match->key, key_len);

	spin_lock_irqsave(&ofdpa->flow_tbl_lock, irq_flags);
	victim = ofdpa_flow_tbl_find(ofdpa, match);
	if (victim) {
		hash_del(&victim->entry);
		victim->cmd = ROCKER_TLV_CMD_TYPE_OF_DPA_FLOW_DEL;
	}
	spin_unlock_irqrestore(&ofdpa->flow_tbl_lock, irq_flags);

	kfree(match);

	if (!victim)
		return 0;

	err = rocker_cmd_exec(ofdpa_port->rocker_port,
			      ofdpa_flags_nowait(flags),
			      ofdpa_cmd_flow_tbl_del,
			      victim, NULL, NULL);
	kfree(victim);

	return err;
}
/* Add @entry to the flow table, or delete its match when
 * OFDPA_OP_FLAG_REMOVE is set in @flags.
 */
static int ofdpa_flow_tbl_do(struct ofdpa_port *ofdpa_port, int flags,
			     struct ofdpa_flow_tbl_entry *entry)
{
	if (!(flags & OFDPA_OP_FLAG_REMOVE))
		return ofdpa_flow_tbl_add(ofdpa_port, flags, entry);

	return ofdpa_flow_tbl_del(ofdpa_port, flags, entry);
}
/* Build an ingress-port flow entry and add or remove it per @flags.
 *
 * The entry is allocated zeroed; its key bytes are later checksummed and
 * compared as raw memory by the flow table helpers.
 * Returns -ENOMEM on allocation failure, otherwise the result of
 * ofdpa_flow_tbl_do(), which takes over the entry.
 */
static int ofdpa_flow_tbl_ig_port(struct ofdpa_port *ofdpa_port, int flags,
				  u32 in_pport, u32 in_pport_mask,
				  enum rocker_of_dpa_table_id goto_tbl)
{
	struct ofdpa_flow_tbl_entry *entry = kzalloc(sizeof(*entry),
						     GFP_KERNEL);

	if (!entry)
		return -ENOMEM;

	entry->key.tbl_id = ROCKER_OF_DPA_TABLE_ID_INGRESS_PORT;
	entry->key.priority = OFDPA_PRIORITY_IG_PORT;
	entry->key.ig_port.goto_tbl = goto_tbl;
	entry->key.ig_port.in_pport_mask = in_pport_mask;
	entry->key.ig_port.in_pport = in_pport;

	return ofdpa_flow_tbl_do(ofdpa_port, flags, entry);
}
/* Build a VLAN table flow entry and add or remove it per @flags.
 *
 * The entry is allocated zeroed; its key bytes are later checksummed and
 * compared as raw memory by the flow table helpers.
 * Returns -ENOMEM on allocation failure, otherwise the result of
 * ofdpa_flow_tbl_do(), which takes over the entry.
 */
static int ofdpa_flow_tbl_vlan(struct ofdpa_port *ofdpa_port,
			       int flags,
			       u32 in_pport, __be16 vlan_id,
			       __be16 vlan_id_mask,
			       enum rocker_of_dpa_table_id goto_tbl,
			       bool untagged, __be16 new_vlan_id)
{
	struct ofdpa_flow_tbl_entry *entry = kzalloc(sizeof(*entry),
						     GFP_KERNEL);

	if (!entry)
		return -ENOMEM;

	entry->key.tbl_id = ROCKER_OF_DPA_TABLE_ID_VLAN;
	entry->key.priority = OFDPA_PRIORITY_VLAN;
	entry->key.vlan.goto_tbl = goto_tbl;
	entry->key.vlan.in_pport = in_pport;
	entry->key.vlan.vlan_id_mask = vlan_id_mask;
	entry->key.vlan.vlan_id = vlan_id;
	entry->key.vlan.new_vlan_id = new_vlan_id;
	entry->key.vlan.untagged = untagged;

	return ofdpa_flow_tbl_do(ofdpa_port, flags, entry);
}
/* Build a termination-MAC flow entry and add or remove it per @flags.
 *
 * A multicast @eth_dst selects the multicast priority and the multicast
 * routing table as goto target; any other address selects the unicast
 * counterparts.  @eth_dst and @eth_dst_mask must be valid pointers.
 * Returns -ENOMEM on allocation failure, otherwise the result of
 * ofdpa_flow_tbl_do(), which takes over the entry.
 */
static int ofdpa_flow_tbl_term_mac(struct ofdpa_port *ofdpa_port,
				   u32 in_pport, u32 in_pport_mask,
				   __be16 eth_type, const u8 *eth_dst,
				   const u8 *eth_dst_mask, __be16 vlan_id,
				   __be16 vlan_id_mask, bool copy_to_cpu,
				   int flags)
{
	struct ofdpa_flow_tbl_entry *entry = kzalloc(sizeof(*entry),
						     GFP_KERNEL);
	bool mcast;

	if (!entry)
		return -ENOMEM;

	mcast = is_multicast_ether_addr(eth_dst);

	entry->key.tbl_id = ROCKER_OF_DPA_TABLE_ID_TERMINATION_MAC;
	entry->key.priority = mcast ? OFDPA_PRIORITY_TERM_MAC_MCAST :
				      OFDPA_PRIORITY_TERM_MAC_UCAST;
	entry->key.term_mac.goto_tbl =
		mcast ? ROCKER_OF_DPA_TABLE_ID_MULTICAST_ROUTING :
			ROCKER_OF_DPA_TABLE_ID_UNICAST_ROUTING;

	entry->key.term_mac.in_pport = in_pport;
	entry->key.term_mac.in_pport_mask = in_pport_mask;
	entry->key.term_mac.eth_type = eth_type;
	entry->key.term_mac.vlan_id = vlan_id;
	entry->key.term_mac.vlan_id_mask = vlan_id_mask;
	entry->key.term_mac.copy_to_cpu = copy_to_cpu;
	ether_addr_copy(entry->key.term_mac.eth_dst, eth_dst);
	ether_addr_copy(entry->key.term_mac.eth_dst_mask, eth_dst_mask);

	return ofdpa_flow_tbl_do(ofdpa_port, flags, entry);
}
/* Build a bridging table flow entry and add or remove it per @flags.
 *
 * @eth_dst and @eth_dst_mask may each be NULL.  The priority is derived
 * from three properties:
 *   - VLAN vs tenant bridging: @vlan_id non-zero or zero;
 *   - default entry: no @eth_dst, or a mask supplied;
 *   - wildcard: a mask supplied that is not all-ones.
 * The entry is allocated with GFP_ATOMIC.
 * Returns -ENOMEM on allocation failure, otherwise the result of
 * ofdpa_flow_tbl_do(), which takes over the entry.
 */
static int ofdpa_flow_tbl_bridge(struct ofdpa_port *ofdpa_port,
				 int flags, const u8 *eth_dst,
				 const u8 *eth_dst_mask, __be16 vlan_id,
				 u32 tunnel_id,
				 enum rocker_of_dpa_table_id goto_tbl,
				 u32 group_id, bool copy_to_cpu)
{
	struct ofdpa_flow_tbl_entry *entry;
	bool vlan_bridging = vlan_id != 0;
	bool dflt = !eth_dst || eth_dst_mask;
	bool wild = false;
	u32 priority;

	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
	if (!entry)
		return -ENOMEM;

	if (eth_dst) {
		ether_addr_copy(entry->key.bridge.eth_dst, eth_dst);
		entry->key.bridge.has_eth_dst = 1;
	}

	if (eth_dst_mask) {
		ether_addr_copy(entry->key.bridge.eth_dst_mask, eth_dst_mask);
		entry->key.bridge.has_eth_dst_mask = 1;
		wild = !ether_addr_equal(eth_dst_mask, ff_mac);
	}

	if (!dflt)
		priority = vlan_bridging ?
			   OFDPA_PRIORITY_BRIDGING_VLAN :
			   OFDPA_PRIORITY_BRIDGING_TENANT;
	else if (wild)
		priority = vlan_bridging ?
			   OFDPA_PRIORITY_BRIDGING_VLAN_DFLT_WILD :
			   OFDPA_PRIORITY_BRIDGING_TENANT_DFLT_WILD;
	else
		priority = vlan_bridging ?
			   OFDPA_PRIORITY_BRIDGING_VLAN_DFLT_EXACT :
			   OFDPA_PRIORITY_BRIDGING_TENANT_DFLT_EXACT;

	entry->key.tbl_id = ROCKER_OF_DPA_TABLE_ID_BRIDGING;
	entry->key.priority = priority;
	entry->key.bridge.copy_to_cpu = copy_to_cpu;
	entry->key.bridge.group_id = group_id;
	entry->key.bridge.goto_tbl = goto_tbl;
	entry->key.bridge.tunnel_id = tunnel_id;
	entry->key.bridge.vlan_id = vlan_id;

	return ofdpa_flow_tbl_do(ofdpa_port, flags, entry);
}
/* Build an IPv4 unicast routing flow entry and add or remove it per @flags.
 *
 * key_len is cut off at ucast_routing.group_id, so two entries that differ
 * only in @group_id count as the same key for lookup and checksum.
 * Returns -ENOMEM on allocation failure, otherwise the result of
 * ofdpa_flow_tbl_do(), which takes over the entry.
 */
static int ofdpa_flow_tbl_ucast4_routing(struct ofdpa_port *ofdpa_port,
					 __be16 eth_type, __be32 dst,
					 __be32 dst_mask, u32 priority,
					 enum rocker_of_dpa_table_id goto_tbl,
					 u32 group_id, struct fib_info *fi,
					 int flags)
{
	struct ofdpa_flow_tbl_entry *entry = kzalloc(sizeof(*entry),
						     GFP_KERNEL);

	if (!entry)
		return -ENOMEM;

	entry->fi = fi;
	entry->key_len = offsetof(struct ofdpa_flow_tbl_key,
				  ucast_routing.group_id);

	entry->key.priority = priority;
	entry->key.tbl_id = ROCKER_OF_DPA_TABLE_ID_UNICAST_ROUTING;
	entry->key.ucast_routing.group_id = group_id;
	entry->key.ucast_routing.goto_tbl = goto_tbl;
	entry->key.ucast_routing.dst4_mask = dst_mask;
	entry->key.ucast_routing.dst4 = dst;
	entry->key.ucast_routing.eth_type = eth_type;

	return ofdpa_flow_tbl_do(ofdpa_port, flags, entry);
}
/* Build an ACL policy flow entry and add or remove it per @flags.
 *
 * Any of the four MAC pointers may be NULL; the matching key field then
 * stays all-zero.  The priority is ACL_NORMAL unless both @eth_dst and
 * @eth_dst_mask are given, in which case a mask equal to mcast_mac selects
 * ACL_DFLT and, failing that, a link-local @eth_dst selects ACL_CTRL.
 * Returns -ENOMEM on allocation failure, otherwise the result of
 * ofdpa_flow_tbl_do(), which takes over the entry.
 */
static int ofdpa_flow_tbl_acl(struct ofdpa_port *ofdpa_port, int flags,
			      u32 in_pport, u32 in_pport_mask,
			      const u8 *eth_src, const u8 *eth_src_mask,
			      const u8 *eth_dst, const u8 *eth_dst_mask,
			      __be16 eth_type, __be16 vlan_id,
			      __be16 vlan_id_mask, u8 ip_proto,
			      u8 ip_proto_mask, u8 ip_tos, u8 ip_tos_mask,
			      u32 group_id)
{
	struct ofdpa_flow_tbl_entry *entry = kzalloc(sizeof(*entry),
						     GFP_KERNEL);
	u32 priority;

	if (!entry)
		return -ENOMEM;

	if (!eth_dst || !eth_dst_mask)
		priority = OFDPA_PRIORITY_ACL_NORMAL;
	else if (ether_addr_equal(eth_dst_mask, mcast_mac))
		priority = OFDPA_PRIORITY_ACL_DFLT;
	else if (is_link_local_ether_addr(eth_dst))
		priority = OFDPA_PRIORITY_ACL_CTRL;
	else
		priority = OFDPA_PRIORITY_ACL_NORMAL;

	entry->key.tbl_id = ROCKER_OF_DPA_TABLE_ID_ACL_POLICY;
	entry->key.priority = priority;

	entry->key.acl.in_pport = in_pport;
	entry->key.acl.in_pport_mask = in_pport_mask;

	if (eth_src)
		ether_addr_copy(entry->key.acl.eth_src, eth_src);
	if (eth_src_mask)
		ether_addr_copy(entry->key.acl.eth_src_mask, eth_src_mask);
	if (eth_dst)
		ether_addr_copy(entry->key.acl.eth_dst, eth_dst);
	if (eth_dst_mask)
		ether_addr_copy(entry->key.acl.eth_dst_mask, eth_dst_mask);

	entry->key.acl.eth_type = eth_type;
	entry->key.acl.vlan_id = vlan_id;
	entry->key.acl.vlan_id_mask = vlan_id_mask;

	entry->key.acl.ip_proto = ip_proto;
	entry->key.acl.ip_proto_mask = ip_proto_mask;
	entry->key.acl.ip_tos = ip_tos;
	entry->key.acl.ip_tos_mask = ip_tos_mask;

	entry->key.acl.group_id = group_id;

	return ofdpa_flow_tbl_do(ofdpa_port, flags, entry);
}
/* Look up the group table entry with the same group_id as @match.
 * Takes no lock itself; the callers in this file hold
 * ofdpa->group_tbl_lock around it.
 * Returns the entry, or NULL when none matches.
 */
static struct ofdpa_group_tbl_entry *
ofdpa_group_tbl_find(const struct ofdpa *ofdpa,
		     const struct ofdpa_group_tbl_entry *match)
{
	struct ofdpa_group_tbl_entry *cur;

	hash_for_each_possible(ofdpa->group_tbl, cur,
			       entry, match->group_id) {
		if (cur->group_id != match->group_id)
			continue;
		return cur;
	}

	return NULL;
}
/* Free a group table entry.  The group_ids array is released as well, but
 * only for the L2 flood and L2 multicast group types.
 */
static void ofdpa_group_tbl_entry_free(struct ofdpa_group_tbl_entry *entry)
{
	unsigned int type = ROCKER_GROUP_TYPE_GET(entry->group_id);

	if (type == ROCKER_OF_DPA_GROUP_TYPE_L2_FLOOD ||
	    type == ROCKER_OF_DPA_GROUP_TYPE_L2_MCAST)
		kfree(entry->group_ids);

	kfree(entry);
}
/* Insert @match into the group table and push it to the device.
 *
 * Under group_tbl_lock an existing entry with the same group_id is unlinked
 * and freed, and @match goes out as GROUP_MOD; otherwise it goes out as
 * GROUP_ADD.  Either way @match ends up linked in the table.
 * Returns the result of rocker_cmd_exec().
 */
static int ofdpa_group_tbl_add(struct ofdpa_port *ofdpa_port, int flags,
			       struct ofdpa_group_tbl_entry *match)
{
	struct ofdpa *ofdpa = ofdpa_port->ofdpa;
	struct ofdpa_group_tbl_entry *old;
	unsigned long irq_flags;

	spin_lock_irqsave(&ofdpa->group_tbl_lock, irq_flags);

	old = ofdpa_group_tbl_find(ofdpa, match);
	if (old) {
		hash_del(&old->entry);
		ofdpa_group_tbl_entry_free(old);
		match->cmd = ROCKER_TLV_CMD_TYPE_OF_DPA_GROUP_MOD;
	} else {
		match->cmd = ROCKER_TLV_CMD_TYPE_OF_DPA_GROUP_ADD;
	}
	hash_add(ofdpa->group_tbl, &match->entry, match->group_id);

	spin_unlock_irqrestore(&ofdpa->group_tbl_lock, irq_flags);

	return rocker_cmd_exec(ofdpa_port->rocker_port,
			       ofdpa_flags_nowait(flags),
			       ofdpa_cmd_group_tbl_add,
			       match, NULL, NULL);
}
/* Remove the group table entry matching @match and tell the device.
 *
 * @match only serves as a lookup key and is always freed.  When an entry
 * with the same group_id exists it is unlinked under group_tbl_lock, sent
 * to the device as GROUP_DEL and then freed.
 * Returns 0 when nothing matched, otherwise the result of
 * rocker_cmd_exec().
 */
static int ofdpa_group_tbl_del(struct ofdpa_port *ofdpa_port, int flags,
			       struct ofdpa_group_tbl_entry *match)
{
	struct ofdpa *ofdpa = ofdpa_port->ofdpa;
	struct ofdpa_group_tbl_entry *victim;
	unsigned long irq_flags;
	int err;

	spin_lock_irqsave(&ofdpa->group_tbl_lock, irq_flags);
	victim = ofdpa_group_tbl_find(ofdpa, match);
	if (victim) {
		hash_del(&victim->entry);
		victim->cmd = ROCKER_TLV_CMD_TYPE_OF_DPA_GROUP_DEL;
	}
	spin_unlock_irqrestore(&ofdpa->group_tbl_lock, irq_flags);

	ofdpa_group_tbl_entry_free(match);

	if (!victim)
		return 0;

	err = rocker_cmd_exec(ofdpa_port->rocker_port,
			      ofdpa_flags_nowait(flags),
			      ofdpa_cmd_group_tbl_del,
			      victim, NULL, NULL);
	ofdpa_group_tbl_entry_free(victim);

	return err;
}
/* Add @entry to the group table, or delete its match when
 * OFDPA_OP_FLAG_REMOVE is set in @flags.
 */
static int ofdpa_group_tbl_do(struct ofdpa_port *ofdpa_port, int flags,
			      struct ofdpa_group_tbl_entry *entry)
{
	if (!(flags & OFDPA_OP_FLAG_REMOVE))
		return ofdpa_group_tbl_add(ofdpa_port, flags, entry);

	return ofdpa_group_tbl_del(ofdpa_port, flags, entry);
}
/* Build an L2 interface group entry for (@vlan_id, @out_pport) and add or
 * remove it per @flags.
 * Returns -ENOMEM on allocation failure, otherwise the result of
 * ofdpa_group_tbl_do(), which takes over the entry.
 */
static int ofdpa_group_l2_interface(struct ofdpa_port *ofdpa_port,
				    int flags, __be16 vlan_id,
				    u32 out_pport, int pop_vlan)
{
	struct ofdpa_group_tbl_entry *entry = kzalloc(sizeof(*entry),
						      GFP_KERNEL);

	if (!entry)
		return -ENOMEM;

	entry->l2_interface.pop_vlan = pop_vlan;
	entry->group_id = ROCKER_GROUP_L2_INTERFACE(vlan_id, out_pport);

	return ofdpa_group_tbl_do(ofdpa_port, flags, entry);
}
/* Build a fan-out group entry @group_id whose members are a private copy
 * of the @group_count ids in @group_ids, and add or remove it per @flags.
 * Returns -ENOMEM when either allocation fails, otherwise the result of
 * ofdpa_group_tbl_do(), which takes over the entry.
 */
static int ofdpa_group_l2_fan_out(struct ofdpa_port *ofdpa_port,
				  int flags, u8 group_count,
				  const u32 *group_ids, u32 group_id)
{
	struct ofdpa_group_tbl_entry *entry;
	u32 *ids_copy;

	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
	if (!entry)
		return -ENOMEM;

	entry->group_id = group_id;
	entry->group_count = group_count;

	ids_copy = kcalloc(group_count, sizeof(u32), GFP_KERNEL);
	entry->group_ids = ids_copy;
	if (!ids_copy) {
		kfree(entry);
		return -ENOMEM;
	}
	memcpy(ids_copy, group_ids, group_count * sizeof(u32));

	return ofdpa_group_tbl_do(ofdpa_port, flags, entry);
}
/* Add or remove an L2 flood group; a thin wrapper around
 * ofdpa_group_l2_fan_out().  @vlan_id is accepted but not used here.
 */
static int ofdpa_group_l2_flood(struct ofdpa_port *ofdpa_port,
				int flags, __be16 vlan_id,
				u8 group_count, const u32 *group_ids,
				u32 group_id)
{
	int err;

	err = ofdpa_group_l2_fan_out(ofdpa_port, flags,
				     group_count, group_ids,
				     group_id);

	return err;
}
/* Build an L3 unicast group entry for neighbour slot @index and submit it
 * through ofdpa_group_tbl_do().
 *
 * @src_mac and @dst_mac are optional; a NULL pointer leaves the matching
 * address in the entry zeroed. The entry chains to the L2 interface group
 * of (@vlan_id, @pport).
 *
 * Returns -ENOMEM if the entry cannot be allocated, otherwise whatever
 * ofdpa_group_tbl_do() returns.
 */
static int ofdpa_group_l3_unicast(struct ofdpa_port *ofdpa_port, int flags,
				  u32 index, const u8 *src_mac, const u8 *dst_mac,
				  __be16 vlan_id, bool ttl_check, u32 pport)
{
	struct ofdpa_group_tbl_entry *grp = kzalloc(sizeof(*grp), GFP_KERNEL);

	if (!grp)
		return -ENOMEM;

	grp->group_id = ROCKER_GROUP_L3_UNICAST(index);
	grp->l3_unicast.group_id = ROCKER_GROUP_L2_INTERFACE(vlan_id, pport);
	grp->l3_unicast.vlan_id = vlan_id;
	grp->l3_unicast.ttl_check = ttl_check;

	if (dst_mac)
		ether_addr_copy(grp->l3_unicast.eth_dst, dst_mac);
	if (src_mac)
		ether_addr_copy(grp->l3_unicast.eth_src, src_mac);

	return ofdpa_group_tbl_do(ofdpa_port, flags, grp);
}
/* Look up the neighbour table entry whose address equals @ip_addr.
 *
 * The table is hashed on the host-order value of the address. Returns the
 * matching entry or NULL. No locking is done here; the callers in this
 * file hold neigh_tbl_lock around the lookup.
 */
static struct ofdpa_neigh_tbl_entry *
ofdpa_neigh_tbl_find(const struct ofdpa *ofdpa, __be32 ip_addr)
{
	struct ofdpa_neigh_tbl_entry *nbr;

	hash_for_each_possible(ofdpa->neigh_tbl, nbr, entry,
			       be32_to_cpu(ip_addr)) {
		if (nbr->ip_addr == ip_addr)
			return nbr;
	}

	return NULL;
}
static void ofdpa_neigh_add(struct ofdpa *ofdpa,
struct ofdpa_neigh_tbl_entry *entry)
{
entry->index = ofdpa->neigh_tbl_next_index++;
entry->ref_count++;
hash_add(ofdpa->neigh_tbl, &entry->entry,
be32_to_cpu(entry->ip_addr));
}
/* Drop one reference on @entry.
 *
 * When the count reaches zero the entry is unlinked from the neighbour
 * table and freed, so the caller must not touch @entry afterwards.
 */
static void ofdpa_neigh_del(struct ofdpa_neigh_tbl_entry *entry)
{
	entry->ref_count--;
	if (entry->ref_count != 0)
		return;

	hash_del(&entry->entry);
	kfree(entry);
}
/* Update an existing neighbour entry in one of two ways.
 *
 * With @eth_dst == NULL the call only takes an additional reference and
 * @ttl_check is ignored. With a MAC address the destination MAC and
 * TTL-check setting are overwritten and the reference count is untouched.
 */
static void ofdpa_neigh_update(struct ofdpa_neigh_tbl_entry *entry,
			       const u8 *eth_dst, bool ttl_check)
{
	if (!eth_dst) {
		entry->ref_count++;
		return;
	}

	ether_addr_copy(entry->eth_dst, eth_dst);
	entry->ttl_check = ttl_check;
}
/* Add, refresh or remove the neighbour table entry for @ip_addr and push
 * the matching L3 unicast group (and, on add/remove, the /32 route) to the
 * device.
 *
 * OFDPA_OP_FLAG_REMOVE in @flags requests removal; otherwise the entry is
 * inserted if unknown or updated with @eth_dst if already present.
 *
 * Returns 0 on success, -ENOMEM if the scratch entry cannot be allocated,
 * -ENOENT when asked to remove an address that is not in the table, or the
 * error from the group / route helpers.
 */
static int ofdpa_port_ipv4_neigh(struct ofdpa_port *ofdpa_port,
int flags, __be32 ip_addr, const u8 *eth_dst)
{
struct ofdpa *ofdpa = ofdpa_port->ofdpa;
struct ofdpa_neigh_tbl_entry *entry;
struct ofdpa_neigh_tbl_entry *found;
unsigned long lock_flags;
__be16 eth_type = htons(ETH_P_IP);
enum rocker_of_dpa_table_id goto_tbl =
ROCKER_OF_DPA_TABLE_ID_ACL_POLICY;
u32 group_id;
u32 priority = 0;
bool adding = !(flags & OFDPA_OP_FLAG_REMOVE);
bool updating;
bool removing;
int err = 0;
/* Allocated up front, before the spinlock is taken. It becomes the
 * table entry on insert, otherwise it is a private snapshot.
 */
entry = kzalloc(sizeof(*entry), GFP_KERNEL);
if (!entry)
return -ENOMEM;
spin_lock_irqsave(&ofdpa->neigh_tbl_lock, lock_flags);
found = ofdpa_neigh_tbl_find(ofdpa, ip_addr);
/* From here on "adding" means "a new entry is being inserted", not
 * merely "the caller did not ask for removal".
 */
updating = found && adding;
removing = found && !adding;
adding = !found && adding;
if (adding) {
entry->ip_addr = ip_addr;
entry->dev = ofdpa_port->dev;
ether_addr_copy(entry->eth_dst, eth_dst);
entry->ttl_check = true;
ofdpa_neigh_add(ofdpa, entry);
} else if (removing) {
/* Snapshot first: ofdpa_neigh_del() may free "found". */
memcpy(entry, found, sizeof(*entry));
ofdpa_neigh_del(found);
} else if (updating) {
ofdpa_neigh_update(found, eth_dst, true);
memcpy(entry, found, sizeof(*entry));
} else {
/* Removal requested for an address that is not in the table. */
err = -ENOENT;
}
spin_unlock_irqrestore(&ofdpa->neigh_tbl_lock, lock_flags);
if (err)
goto err_out;
/* For each active neighbor, we have an L3 unicast group and
* a /32 route to the neighbor, which uses the L3 unicast
* group. The L3 unicast group can also be referred to by
* other routes' nexthops.
*/
err = ofdpa_group_l3_unicast(ofdpa_port, flags,
entry->index,
ofdpa_port->dev->dev_addr,
entry->eth_dst,
ofdpa_port->internal_vlan_id,
entry->ttl_check,
ofdpa_port->pport);
if (err) {
netdev_err(ofdpa_port->dev, "Error (%d) L3 unicast group index %d\n",
err, entry->index);
goto err_out;
}
/* The /32 route is only touched on insert or removal; a plain update
 * changes the group alone.
 */
if (adding || removing) {
group_id = ROCKER_GROUP_L3_UNICAST(entry->index);
err = ofdpa_flow_tbl_ucast4_routing(ofdpa_port,
eth_type, ip_addr,
inet_make_mask(32),
priority, goto_tbl,
group_id, NULL, flags);
if (err)
netdev_err(ofdpa_port->dev, "Error (%d) /32 unicast route %pI4 group 0x%08x\n",
err, &entry->ip_addr, group_id);
}
err_out:
/* On insert "entry" now lives in the neighbour table and must not be
 * freed here; in every other case it is only the local snapshot.
 */
if (!adding)
kfree(entry);
return err;
}
/* Make sure @ip_addr gets resolved to a MAC address on this port's netdev.
 *
 * An existing kernel neighbour is looked up, or a new one is created in
 * the ARP table. If the neighbour is already valid, the entry is installed
 * right away via ofdpa_port_ipv4_neigh(); otherwise resolution is kicked
 * off with neigh_event_send() and 0 is returned.
 *
 * Returns the neigh_create() error, the install error, or 0.
 */
static int ofdpa_port_ipv4_resolve(struct ofdpa_port *ofdpa_port,
				   __be32 ip_addr)
{
	struct net_device *dev = ofdpa_port->dev;
	struct neighbour *neigh;
	int err = 0;

	neigh = __ipv4_neigh_lookup(dev, (__force u32)ip_addr);
	if (!neigh) {
		neigh = neigh_create(&arp_tbl, &ip_addr, dev);
		if (IS_ERR(neigh))
			return PTR_ERR(neigh);
	}

	if (!(neigh->nud_state & NUD_VALID))
		neigh_event_send(neigh, NULL);
	else
		err = ofdpa_port_ipv4_neigh(ofdpa_port, 0,
					    ip_addr, neigh->ha);

	/* Drop the reference obtained from the lookup or the create. */
	neigh_release(neigh);

	return err;
}
/* Take or drop a nexthop reference on the neighbour entry for @ip_addr and
 * report the entry's index through @index.
 *
 * Without OFDPA_OP_FLAG_REMOVE: an unknown address gets a new, unresolved
 * entry; a known address gains one reference. With the flag: one reference
 * is dropped from a known address, and an unknown one yields -ENOENT.
 *
 * If the entry has no MAC address yet, resolution is started through
 * ofdpa_port_ipv4_resolve().
 *
 * Returns 0, -ENOMEM, -ENOENT, or the error from the resolve step.
 */
static int ofdpa_port_ipv4_nh(struct ofdpa_port *ofdpa_port,
			      int flags, __be32 ip_addr, u32 *index)
{
	struct ofdpa *ofdpa = ofdpa_port->ofdpa;
	const bool want_add = !(flags & OFDPA_OP_FLAG_REMOVE);
	struct ofdpa_neigh_tbl_entry *fresh;
	struct ofdpa_neigh_tbl_entry *cur;
	unsigned long irq_flags;
	bool inserted = false;
	bool resolved = true;
	int err = 0;

	/* Allocated before the spinlock is taken; only used on insert. */
	fresh = kzalloc(sizeof(*fresh), GFP_KERNEL);
	if (!fresh)
		return -ENOMEM;

	spin_lock_irqsave(&ofdpa->neigh_tbl_lock, irq_flags);

	cur = ofdpa_neigh_tbl_find(ofdpa, ip_addr);
	if (cur) {
		/* Read the index before a possible free in the del path. */
		*index = cur->index;
		if (want_add) {
			ofdpa_neigh_update(cur, NULL, false);
			resolved = !is_zero_ether_addr(cur->eth_dst);
		} else {
			ofdpa_neigh_del(cur);
		}
	} else if (want_add) {
		fresh->ip_addr = ip_addr;
		fresh->dev = ofdpa_port->dev;
		ofdpa_neigh_add(ofdpa, fresh);
		*index = fresh->index;
		resolved = false;
		inserted = true;
	} else {
		err = -ENOENT;
	}

	spin_unlock_irqrestore(&ofdpa->neigh_tbl_lock, irq_flags);

	/* The scratch entry is only kept when it went into the table. */
	if (!inserted)
		kfree(fresh);

	if (err)
		return err;

	/* Resolved means neigh ip_addr is resolved to neigh mac. */
	if (!resolved)
		err = ofdpa_port_ipv4_resolve(ofdpa_port, ip_addr);

	return err;
}
/* Return the OF-DPA private data of the port in slot @port_index, or NULL
 * when that slot holds no rocker port.
 *
 * @port_index is not range-checked here; callers in this file iterate up
 * to rocker->port_count.
 */
static struct ofdpa_port *ofdpa_port_get(const struct ofdpa *ofdpa,
					 int port_index)
{
	struct rocker_port *rp = ofdpa->rocker->ports[port_index];

	if (!rp)
		return NULL;

	return rp->wpriv;
}
/* Rebuild the L2 flood group of @vlan_id.
 *
 * Every bridged port that has @vlan_id set in its VLAN bitmap contributes
 * its L2 interface group. If no port qualifies, nothing is sent to the
 * device and 0 is returned.
 *
 * Returns -ENOMEM if the temporary id array cannot be allocated, otherwise
 * the result of ofdpa_group_l2_flood() (0 when it was not called).
 */
static int ofdpa_port_vlan_flood_group(struct ofdpa_port *ofdpa_port,
				       int flags, __be16 vlan_id)
{
	const struct ofdpa *ofdpa = ofdpa_port->ofdpa;
	unsigned int port_count = ofdpa->rocker->port_count;
	u32 group_id = ROCKER_GROUP_L2_FLOOD(vlan_id, 0);
	struct ofdpa_port *member;
	u8 group_count = 0;
	u32 *group_ids;
	int err = 0;
	int i;

	group_ids = kcalloc(port_count, sizeof(u32), GFP_KERNEL);
	if (!group_ids)
		return -ENOMEM;

	/* Adjust the flood group for this VLAN. The flood group
	 * references an L2 interface group for each port in this
	 * VLAN.
	 */
	for (i = 0; i < port_count; i++) {
		member = ofdpa_port_get(ofdpa, i);
		if (member && ofdpa_port_is_bridged(member) &&
		    test_bit(ntohs(vlan_id), member->vlan_bitmap))
			group_ids[group_count++] =
				ROCKER_GROUP_L2_INTERFACE(vlan_id,
							  member->pport);
	}

	/* If there are no bridged ports in this VLAN, we're done */
	if (group_count != 0) {
		err = ofdpa_group_l2_flood(ofdpa_port, flags, vlan_id,
					   group_count, group_ids, group_id);
		if (err)
			netdev_err(ofdpa_port->dev, "Error (%d) port VLAN l2 flood group\n", err);
	}

	kfree(group_ids);
	return err;
}
/* Maintain the L2 interface groups for this port in @vlan_id.
 *
 * Two groups are handled:
 *  - the group for this port's own pport, but only while the port's STP
 *    state is LEARNING or FORWARDING;
 *  - the group for pport 0 (the CPU port), which is added when the first
 *    port joins the VLAN and removed when the last one leaves.
 *
 * Returns 0 on success or the error from ofdpa_group_l2_interface().
 */
static int ofdpa_port_vlan_l2_groups(struct ofdpa_port *ofdpa_port, int flags,
				     __be16 vlan_id, bool pop_vlan)
{
	const struct ofdpa *ofdpa = ofdpa_port->ofdpa;
	unsigned int port_count = ofdpa->rocker->port_count;
	const bool adding = !(flags & OFDPA_OP_FLAG_REMOVE);
	struct ofdpa_port *member;
	bool first_join;
	bool last_leave;
	int members = 0;
	u32 out_pport;
	int err;
	int i;

	/* An L2 interface group for this port in this VLAN, but
	 * only when port STP state is LEARNING|FORWARDING.
	 */
	switch (ofdpa_port->stp_state) {
	case BR_STATE_LEARNING:
	case BR_STATE_FORWARDING:
		out_pport = ofdpa_port->pport;
		err = ofdpa_group_l2_interface(ofdpa_port, flags,
					       vlan_id, out_pport, pop_vlan);
		if (err) {
			netdev_err(ofdpa_port->dev, "Error (%d) port VLAN l2 group for pport %d\n",
				   err, out_pport);
			return err;
		}
		break;
	default:
		break;
	}

	/* An L2 interface group for this VLAN to CPU port.
	 * Add when first port joins this VLAN and destroy when
	 * last port leaves this VLAN.
	 */
	for (i = 0; i < port_count; i++) {
		member = ofdpa_port_get(ofdpa, i);
		if (member && test_bit(ntohs(vlan_id), member->vlan_bitmap))
			members++;
	}

	first_join = adding && members == 1;
	last_leave = !adding && members == 0;
	if (!first_join && !last_leave)
		return 0;

	out_pport = 0;
	err = ofdpa_group_l2_interface(ofdpa_port, flags,
				       vlan_id, out_pport, pop_vlan);
	if (err)
		netdev_err(ofdpa_port->dev, "Error (%d) port VLAN l2 group for CPU port\n", err);

	return err;
}
/* Table of control-traffic classes, indexed by OFDPA_CTRL_*.
 *
 * Each element describes one kind of traffic and which table it is
 * programmed into. ofdpa_port_ctrl_vlan() checks the acl, bridge and term
 * flags in that order and uses the first one that is set. The per-port
 * ctrls[] array is walked with the same index to decide which elements
 * apply to a port.
 */
static struct ofdpa_ctrl {
/* destination MAC and mask to match; left NULL when not set */
const u8 *eth_dst;
const u8 *eth_dst_mask;
/* ethertype to match (network byte order); zero when not set */
__be16 eth_type;
/* install through ofdpa_port_ctrl_vlan_acl() */
bool acl;
/* install through ofdpa_port_ctrl_vlan_bridge() */
bool bridge;
/* install through ofdpa_port_ctrl_vlan_term() */
bool term;
/* forwarded to the bridge / termination MAC flow helpers */
bool copy_to_cpu;
} ofdpa_ctrls[] = {
[OFDPA_CTRL_LINK_LOCAL_MCAST] = {
/* pass link local multicast pkts up to CPU for filtering */
.eth_dst = ll_mac,
.eth_dst_mask = ll_mask,
.acl = true,
},
[OFDPA_CTRL_LOCAL_ARP] = {
/* pass local ARP pkts up to CPU */
.eth_dst = zero_mac,
.eth_dst_mask = zero_mac,
.eth_type = htons(ETH_P_ARP),
.acl = true,
},
[OFDPA_CTRL_IPV4_MCAST] = {
/* pass IPv4 mcast pkts up to CPU, RFC 1112 */
.eth_dst = ipv4_mcast,
.eth_dst_mask = ipv4_mask,
.eth_type = htons(ETH_P_IP),
.term = true,
.copy_to_cpu = true,
},
[OFDPA_CTRL_IPV6_MCAST] = {
/* pass IPv6 mcast pkts up to CPU, RFC 2464 */
.eth_dst = ipv6_mcast,
.eth_dst_mask = ipv6_mask,
.eth_type = htons(ETH_P_IPV6),
.term = true,
.copy_to_cpu = true,
},
[OFDPA_CTRL_DFLT_BRIDGING] = {
/* flood any pkts on vlan */
.bridge = true,
.copy_to_cpu = true,
},
[OFDPA_CTRL_DFLT_OVS] = {
/* pass all pkts up to CPU */
.eth_dst = zero_mac,
.eth_dst_mask = zero_mac,
.acl = true,
},
};
/* Program the ACL flow for control class @ctrl on @vlan_id.
 *
 * The flow matches this port's pport exactly, the destination MAC/mask and
 * ethertype taken from @ctrl, and @vlan_id exactly. Source MAC, IP protocol
 * and IP TOS are left unmatched. The action group is the L2 interface
 * group of (@vlan_id, pport 0).
 *
 * Returns the result of ofdpa_flow_tbl_acl(); failures are logged.
 */
static int ofdpa_port_ctrl_vlan_acl(struct ofdpa_port *ofdpa_port, int flags,
				    const struct ofdpa_ctrl *ctrl, __be16 vlan_id)
{
	const u32 out_pport = 0;
	const u32 group_id = ROCKER_GROUP_L2_INTERFACE(vlan_id, out_pport);
	int err;

	err = ofdpa_flow_tbl_acl(ofdpa_port, flags,
				 ofdpa_port->pport, 0xffffffff, /* in_pport/mask */
				 NULL, NULL,			 /* eth_src/mask */
				 ctrl->eth_dst, ctrl->eth_dst_mask,
				 ctrl->eth_type,
				 vlan_id, htons(0xffff),	 /* vlan/mask */
				 0, 0,				 /* ip_proto/mask */
				 0, 0,				 /* ip_tos/mask */
				 group_id);
	if (err)
		netdev_err(ofdpa_port->dev, "Error (%d) ctrl ACL\n", err);

	return err;
}
/* Program the bridging flow for control class @ctrl on @vlan_id.
 *
 * Nothing is done (and 0 is returned) when the port is not bridged.
 * Otherwise the flow sends matching traffic to the VLAN's L2 flood group,
 * continues to the ACL policy table, and uses tunnel id 0.
 *
 * Returns the result of ofdpa_flow_tbl_bridge(); failures are logged.
 */
static int ofdpa_port_ctrl_vlan_bridge(struct ofdpa_port *ofdpa_port,
				       int flags, const struct ofdpa_ctrl *ctrl,
				       __be16 vlan_id)
{
	int err;

	if (!ofdpa_port_is_bridged(ofdpa_port))
		return 0;

	err = ofdpa_flow_tbl_bridge(ofdpa_port, flags,
				    ctrl->eth_dst, ctrl->eth_dst_mask,
				    vlan_id, 0 /* tunnel_id */,
				    ROCKER_OF_DPA_TABLE_ID_ACL_POLICY,
				    ROCKER_GROUP_L2_FLOOD(vlan_id, 0),
				    ctrl->copy_to_cpu);
	if (err)
		netdev_err(ofdpa_port->dev, "Error (%d) ctrl FLOOD\n", err);

	return err;
}
/* Program the termination MAC flow for control class @ctrl.
 *
 * A @vlan_id of zero is replaced by the port's internal VLAN id. The flow
 * matches this port's pport and the VLAN exactly, plus the ethertype and
 * destination MAC/mask from @ctrl.
 *
 * Returns the result of ofdpa_flow_tbl_term_mac(); failures are logged.
 */
static int ofdpa_port_ctrl_vlan_term(struct ofdpa_port *ofdpa_port, int flags,
				     const struct ofdpa_ctrl *ctrl, __be16 vlan_id)
{
	const __be16 match_vlan = vlan_id ? vlan_id :
					    ofdpa_port->internal_vlan_id;
	int err;

	err = ofdpa_flow_tbl_term_mac(ofdpa_port, ofdpa_port->pport,
				      0xffffffff /* in_pport_mask */,
				      ctrl->eth_type, ctrl->eth_dst,
				      ctrl->eth_dst_mask, match_vlan,
				      htons(0xffff) /* vlan_id_mask */,
				      ctrl->copy_to_cpu, flags);
	if (err)
		netdev_err(ofdpa_port->dev, "Error (%d) ctrl term\n", err);

	return err;
}
/* Apply control class @ctrl to @vlan_id using the handler that matches the
 * class kind.
 *
 * The kind flags are checked in the order acl, bridge, term, and the first
 * one set wins. A class with none of them set yields -EOPNOTSUPP.
 */
static int ofdpa_port_ctrl_vlan(struct ofdpa_port *ofdpa_port, int flags,
				const struct ofdpa_ctrl *ctrl, __be16 vlan_id)
{
	int (*handler)(struct ofdpa_port *ofdpa_port, int flags,
		       const struct ofdpa_ctrl *ctrl, __be16 vlan_id);

	if (ctrl->acl)
		handler = ofdpa_port_ctrl_vlan_acl;
	else if (ctrl->bridge)
		handler = ofdpa_port_ctrl_vlan_bridge;
	else if (ctrl->term)
		handler = ofdpa_port_ctrl_vlan_term;
	else
		return -EOPNOTSUPP;

	return handler(ofdpa_port, flags, ctrl, vlan_id);
}
/* Apply every control class that is enabled on this port to @vlan_id.
 *
 * Classes are processed in OFDPA_CTRL_* index order. Processing stops at
 * the first failure and that error is returned; classes already applied
 * are not rolled back here.
 */
static int ofdpa_port_ctrl_vlan_add(struct ofdpa_port *ofdpa_port, int flags,
				    __be16 vlan_id)
{
	int err;
	int i;

	for (i = 0; i < OFDPA_CTRL_MAX; i++) {
		if (!ofdpa_port->ctrls[i])
			continue;

		err = ofdpa_port_ctrl_vlan(ofdpa_port, flags,
					   &ofdpa_ctrls[i], vlan_id);
		if (err)
			return err;
	}

	return 0;
}
/* Apply control class @ctrl to every VLAN currently set in this port's
 * VLAN bitmap.
 *
 * VIDs 1 .. VLAN_N_VID - 1 are scanned in ascending order. The first
 * failure ends the scan and is returned; otherwise 0.
 */
static int ofdpa_port_ctrl(struct ofdpa_port *ofdpa_port, int flags,
			   const struct ofdpa_ctrl *ctrl)
{
	u16 vid;
	int err;

	for (vid = 1; vid < VLAN_N_VID; vid++) {
		if (test_bit(vid, ofdpa_port->vlan_bitmap)) {
			err = ofdpa_port_ctrl_vlan(ofdpa_port, flags,
						   ctrl, htons(vid));
			if (err)
				return err;
		}
	}

	return 0;
}
/* Add @vid to, or remove it from, this port.
 *
 * OFDPA_OP_FLAG_REMOVE in @flags selects removal. @vid is first mapped to
 * the port's internal VLAN id; if the port's VLAN bitmap already reflects
 * the requested state the call is a no-op returning 0.
 *
 * Otherwise the bitmap bit is flipped and, in order: the port's control
 * classes are applied (add only), the L2 interface groups, the L2 flood
 * group and finally the VLAN table flow are programmed.
 *
 * Returns 0 on success. On failure of any step the bitmap bit is flipped
 * back and the step's error is returned. Steps that already succeeded are
 * not undone here.
 */
static int ofdpa_port_vlan(struct ofdpa_port *ofdpa_port, int flags,
			   u16 vid)
{
	enum rocker_of_dpa_table_id goto_tbl =
		ROCKER_OF_DPA_TABLE_ID_TERMINATION_MAC;
	u32 in_pport = ofdpa_port->pport;
	__be16 vlan_id = htons(vid);
	__be16 vlan_id_mask = htons(0xffff);
	__be16 internal_vlan_id;
	bool untagged;
	bool adding = !(flags & OFDPA_OP_FLAG_REMOVE);
	int err;

	internal_vlan_id = ofdpa_port_vid_to_vlan(ofdpa_port, vid, &untagged);

	if (adding &&
	    test_bit(ntohs(internal_vlan_id), ofdpa_port->vlan_bitmap))
		return 0; /* already added */
	else if (!adding &&
		 !test_bit(ntohs(internal_vlan_id), ofdpa_port->vlan_bitmap))
		return 0; /* already removed */

	/* Flip the bit first: the group helpers below count bitmap members. */
	change_bit(ntohs(internal_vlan_id), ofdpa_port->vlan_bitmap);

	if (adding) {
		err = ofdpa_port_ctrl_vlan_add(ofdpa_port, flags,
					       internal_vlan_id);
		if (err) {
			netdev_err(ofdpa_port->dev, "Error (%d) port ctrl vlan add\n", err);
			goto err_vlan_add;
		}
	}

	err = ofdpa_port_vlan_l2_groups(ofdpa_port, flags,
					internal_vlan_id, untagged);
	if (err) {
		netdev_err(ofdpa_port->dev, "Error (%d) port VLAN l2 groups\n", err);
		goto err_vlan_l2_groups;
	}

	err = ofdpa_port_vlan_flood_group(ofdpa_port, flags,
					  internal_vlan_id);
	if (err) {
		netdev_err(ofdpa_port->dev, "Error (%d) port VLAN l2 flood group\n", err);
		goto err_flood_group;
	}

	err = ofdpa_flow_tbl_vlan(ofdpa_port, flags,
				  in_pport, vlan_id, vlan_id_mask,
				  goto_tbl, untagged, internal_vlan_id);
	if (err) {
		/* Report the failure instead of swallowing it, and restore
		 * the bitmap like the other failure paths so that a retry
		 * is not short-circuited by the "already added/removed"
		 * checks above.
		 */
		netdev_err(ofdpa_port->dev, "Error (%d) port VLAN table\n", err);
		goto err_flow_tbl_vlan;
	}

	return 0;

err_flow_tbl_vlan:
err_vlan_add:
err_vlan_l2_groups:
err_flood_group:
	change_bit(ntohs(internal_vlan_id), ofdpa_port->vlan_bitmap);
	return err;
}
/* Program the ingress port table entry for normal Ethernet frames.
 *
 * The entry matches in_pport 0 under mask 0xffff0000 (the existing file
 * comment describes this as "pkts from any local physical ports") and
 * continues processing in the VLAN table.
 *
 * Returns the result of ofdpa_flow_tbl_ig_port(); failures are logged.
 */
static int ofdpa_port_ig_tbl(struct ofdpa_port *ofdpa_port, int flags)
{
	const u32 in_pport = 0;
	const u32 in_pport_mask = 0xffff0000;
	int err;

	err = ofdpa_flow_tbl_ig_port(ofdpa_port, flags,
				     in_pport, in_pport_mask,
				     ROCKER_OF_DPA_TABLE_ID_VLAN);
	if (err)
		netdev_err(ofdpa_port->dev, "Error (%d) ingress port table entry\n", err);

	return err;
}
/* Deferred FDB notification, queued by ofdpa_port_fdb_learn() and consumed
 * by ofdpa_port_fdb_learn_work().
 */
struct ofdpa_fdb_learn_work {
/* Must stay the first member: the work handler frees this object by
 * passing the work_struct pointer itself to kfree().
 */
struct work_struct work;
/* port whose netdev the notification is sent for */
struct ofdpa_port *ofdpa_port;
/* OFDPA_OP_FLAG_* bits captured when the work was queued */
int flags;
/* private copy of the MAC address being reported */
u8 addr[ETH_ALEN];
/* VLAN id reported in the notification */
u16 vid;
};
static void ofdpa_port_fdb_learn_work(struct work_struct *work)
{
const struct ofdpa_fdb_learn_work *lw =
container_of(work, struct ofdpa_fdb_learn_work, work);
bool removing = (lw->flags & OFDPA_OP_FLAG_REMOVE);
bool learned = (lw->flags & OFDPA_OP_FLAG_LEARNED);
struct switchdev_notifier_fdb_info info = {};
info.addr = lw->addr;
info.vid = lw->vid;
rtnl_lock();
if (learned && removing)
call_switchdev_notifiers(SWITCHDEV_FDB_DEL_TO_BRIDGE,
lw->ofdpa_port->dev, &info.info, NULL);
else if (learned && !removing)
call_switchdev_notifiers(SWITCHDEV_FDB_ADD_TO_BRIDGE,
lw->ofdpa_port->dev, &info.info, NULL);
rtnl_unlock();
kfree(work);
}
/* Program (or remove) the bridging flow for @addr on @vlan_id and queue a
 * notification to the bridge.
 *
 * Unless OFDPA_OP_FLAG_REFRESH is set, the bridging flow is updated first.
 * On a bridged port it points at the L2 interface group of
 * (@vlan_id, this port's pport), on a non-bridged port at
 * ROCKER_GROUP_NONE. For a bridged port a work item is then scheduled to
 * deliver the switchdev notification; a non-bridged port returns right
 * after the flow update.
 *
 * Returns 0, the flow helper's error, or -ENOMEM if the work item cannot
 * be allocated.
 */
static int ofdpa_port_fdb_learn(struct ofdpa_port *ofdpa_port,
				int flags, const u8 *addr, __be16 vlan_id)
{
	u32 out_pport = ofdpa_port->pport;
	u32 group_id = ROCKER_GROUP_NONE;
	struct ofdpa_fdb_learn_work *lw;
	int err;

	if (ofdpa_port_is_bridged(ofdpa_port))
		group_id = ROCKER_GROUP_L2_INTERFACE(vlan_id, out_pport);

	if (!(flags & OFDPA_OP_FLAG_REFRESH)) {
		err = ofdpa_flow_tbl_bridge(ofdpa_port, flags, addr,
					    NULL, vlan_id, 0 /* tunnel_id */,
					    ROCKER_OF_DPA_TABLE_ID_ACL_POLICY,
					    group_id, false /* copy_to_cpu */);
		if (err)
			return err;
	}

	if (!ofdpa_port_is_bridged(ofdpa_port))
		return 0;

	/* GFP_ATOMIC: this allocation does not sleep. */
	lw = kzalloc(sizeof(*lw), GFP_ATOMIC);
	if (!lw)
		return -ENOMEM;

	INIT_WORK(&lw->work, ofdpa_port_fdb_learn_work);
	lw->ofdpa_port = ofdpa_port;
	lw->flags = flags;
	lw->vid = ofdpa_port_vlan_to_vid(ofdpa_port, vlan_id);
	ether_addr_copy(lw->addr, addr);

	schedule_work(&lw->work);
	return 0;
}
static struct ofdpa_fdb_tbl_entry *
ofdpa_fdb_tbl_find(const struct ofdpa *ofdpa,
const struct ofdpa_fdb_tbl_entry *match)
{
struct ofdpa_fdb_tbl_entry *found;
hash_for_each_possible(ofdpa->fdb_tbl, found, entry, match->key_crc32)
if (memcmp(&found->key, &match->key, sizeof(found->key)) == 0)
return found;
return NULL;
}
/* Add, refresh or remove the software FDB entry for (@ofdpa_port, @addr,
 * @vlan_id) and forward the change to ofdpa_port_fdb_learn().
 *
 * Outcomes, by whether the entry exists and OFDPA_OP_FLAG_REMOVE is set:
 *  - missing, add:     the new entry is inserted, then learned;
 *  - present, add:     only the timestamp is refreshed and the learn step
 *                      runs with OFDPA_OP_FLAG_REFRESH;
 *  - present, remove:  the entry is unlinked, the learn step runs with the
 *                      remove flag, and the unlinked entry is freed;
 *  - missing, remove:  nothing to do, returns 0.
 *
 * Returns -ENOMEM if the lookup template cannot be allocated, otherwise
 * the result of ofdpa_port_fdb_learn() (or 0 as described above).
 */
static int ofdpa_port_fdb(struct ofdpa_port *ofdpa_port,
			  const unsigned char *addr,
			  __be16 vlan_id, int flags)
{
	struct ofdpa *ofdpa = ofdpa_port->ofdpa;
	struct ofdpa_fdb_tbl_entry *fdb;
	struct ofdpa_fdb_tbl_entry *found;
	bool removing = (flags & OFDPA_OP_FLAG_REMOVE);
	unsigned long lock_flags;
	int err;

	/* Doubles as the lookup key and, on insert, the new table entry. */
	fdb = kzalloc(sizeof(*fdb), GFP_KERNEL);
	if (!fdb)
		return -ENOMEM;

	fdb->learned = (flags & OFDPA_OP_FLAG_LEARNED);
	fdb->touched = jiffies;
	fdb->key.ofdpa_port = ofdpa_port;
	ether_addr_copy(fdb->key.addr, addr);
	fdb->key.vlan_id = vlan_id;
	fdb->key_crc32 = crc32(~0, &fdb->key, sizeof(fdb->key));

	spin_lock_irqsave(&ofdpa->fdb_tbl_lock, lock_flags);

	found = ofdpa_fdb_tbl_find(ofdpa, fdb);

	if (found) {
		found->touched = jiffies;
		if (removing) {
			kfree(fdb);
			hash_del(&found->entry);
		}
	} else if (!removing) {
		hash_add(ofdpa->fdb_tbl, &fdb->entry,
			 fdb->key_crc32);
	}

	spin_unlock_irqrestore(&ofdpa->fdb_tbl_lock, lock_flags);

	/* Check if adding and already exists, or removing and can't find */
	if (!found != !removing) {
		kfree(fdb);
		if (!found && removing)
			return 0;
		/* Refreshing existing to update aging timers */
		flags |= OFDPA_OP_FLAG_REFRESH;
	}

	err = ofdpa_port_fdb_learn(ofdpa_port, flags, addr, vlan_id);

	/* The entry unlinked above is no longer reachable through the table,
	 * so it has to be released here or it is leaked. It is freed only
	 * after the learn step in case @addr points into it. In the
	 * non-removing case "found" is still owned by the table and is only
	 * tested for NULL, never dereferenced, once the lock is dropped.
	 */
	if (found && removing)
		kfree(found);

	return err;
}
/* Flush this port's learned FDB entries.
 *
 * Nothing is flushed while the port's STP state is LEARNING or FORWARDING.
 * Otherwise every entry that belongs to this port and is marked learned is
 * passed to ofdpa_port_fdb_learn() with OFDPA_OP_FLAG_NOWAIT and
 * OFDPA_OP_FLAG_REMOVE added to @flags, then unlinked from the table and
 * freed. Entries of other ports and non-learned entries are kept.
 *
 * The walk runs under fdb_tbl_lock and stops at the first failure; that
 * entry and the ones not yet visited stay in the table.
 *
 * Returns 0 or the first error from ofdpa_port_fdb_learn().
 */
static int ofdpa_port_fdb_flush(struct ofdpa_port *ofdpa_port, int flags)
{
	struct ofdpa *ofdpa = ofdpa_port->ofdpa;
	struct ofdpa_fdb_tbl_entry *found;
	unsigned long lock_flags;
	struct hlist_node *tmp;
	int bkt;
	int err = 0;

	if (ofdpa_port->stp_state == BR_STATE_LEARNING ||
	    ofdpa_port->stp_state == BR_STATE_FORWARDING)
		return 0;

	flags |= OFDPA_OP_FLAG_NOWAIT | OFDPA_OP_FLAG_REMOVE;

	spin_lock_irqsave(&ofdpa->fdb_tbl_lock, lock_flags);

	hash_for_each_safe(ofdpa->fdb_tbl, bkt, tmp, found, entry) {
		if (found->key.ofdpa_port != ofdpa_port)
			continue;
		if (!found->learned)
			continue;
		err = ofdpa_port_fdb_learn(ofdpa_port, flags,
					   found->key.addr,
					   found->key.vlan_id);
		if (err)
			goto err_out;
		/* Unlinking alone would leak the entry: once it is off the
		 * hash list nothing else references it. The _safe iterator
		 * already holds the next node, so freeing here is fine.
		 * NOTE(review): assumes the flow helper called by the learn
		 * step copies the MAC address rather than keeping the
		 * pointer - confirm.
		 */
		hash_del(&found->entry);
		kfree(found);
	}

err_out:
	spin_unlock_irqrestore(&ofdpa->fdb_tbl_lock, lock_flags);

	return err;
}
treewide: setup_timer() -> timer_setup() This converts all remaining cases of the old setup_timer() API into using timer_setup(), where the callback argument is the structure already holding the struct timer_list. These should have no behavioral changes, since they just change which pointer is passed into the callback with the same available pointers after conversion. It handles the following examples, in addition to some other variations. Casting from unsigned long: void my_callback(unsigned long data) { struct something *ptr = (struct something *)data; ... } ... setup_timer(&ptr->my_timer, my_callback, ptr); and forced object casts: void my_callback(struct something *ptr) { ... } ... setup_timer(&ptr->my_timer, my_callback, (unsigned long)ptr); become: void my_callback(struct timer_list *t) { struct something *ptr = from_timer(ptr, t, my_timer); ... } ... timer_setup(&ptr->my_timer, my_callback, 0); Direct function assignments: void my_callback(unsigned long data) { struct something *ptr = (struct something *)data; ... } ... ptr->my_timer.function = my_callback; have a temporary cast added, along with converting the args: void my_callback(struct timer_list *t) { struct something *ptr = from_timer(ptr, t, my_timer); ... } ... ptr->my_timer.function = (TIMER_FUNC_TYPE)my_callback; And finally, callbacks without a data assignment: void my_callback(unsigned long data) { ... } ... setup_timer(&ptr->my_timer, my_callback, 0); have their argument renamed to verify they're unused during conversion: void my_callback(struct timer_list *unused) { ... } ... timer_setup(&ptr->my_timer, my_callback, 0); The conversion is done with the following Coccinelle script: spatch --very-quiet --all-includes --include-headers \ -I ./arch/x86/include -I ./arch/x86/include/generated \ -I ./include -I ./arch/x86/include/uapi \ -I ./arch/x86/include/generated/uapi -I ./include/uapi \ -I ./include/generated/uapi --include ./include/linux/kconfig.h \ --dir . 
\ --cocci-file ~/src/data/timer_setup.cocci @fix_address_of@ expression e; @@ setup_timer( -&(e) +&e , ...) // Update any raw setup_timer() usages that have a NULL callback, but // would otherwise match change_timer_function_usage, since the latter // will update all function assignments done in the face of a NULL // function initialization in setup_timer(). @change_timer_function_usage_NULL@ expression _E; identifier _timer; type _cast_data; @@ ( -setup_timer(&_E->_timer, NULL, _E); +timer_setup(&_E->_timer, NULL, 0); | -setup_timer(&_E->_timer, NULL, (_cast_data)_E); +timer_setup(&_E->_timer, NULL, 0); | -setup_timer(&_E._timer, NULL, &_E); +timer_setup(&_E._timer, NULL, 0); | -setup_timer(&_E._timer, NULL, (_cast_data)&_E); +timer_setup(&_E._timer, NULL, 0); ) @change_timer_function_usage@ expression _E; identifier _timer; struct timer_list _stl; identifier _callback; type _cast_func, _cast_data; @@ ( -setup_timer(&_E->_timer, _callback, _E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, &_callback, _E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, _callback, (_cast_data)_E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, &_callback, (_cast_data)_E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, (_cast_func)_callback, _E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, (_cast_func)&_callback, _E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, (_cast_func)_callback, (_cast_data)_E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, (_cast_func)&_callback, (_cast_data)_E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E._timer, _callback, (_cast_data)_E); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, _callback, (_cast_data)&_E); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, &_callback, (_cast_data)_E); +timer_setup(&_E._timer, _callback, 0); | 
-setup_timer(&_E._timer, &_callback, (_cast_data)&_E); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, (_cast_func)_callback, (_cast_data)_E); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, (_cast_func)_callback, (_cast_data)&_E); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, (_cast_func)&_callback, (_cast_data)_E); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, (_cast_func)&_callback, (_cast_data)&_E); +timer_setup(&_E._timer, _callback, 0); | _E->_timer@_stl.function = _callback; | _E->_timer@_stl.function = &_callback; | _E->_timer@_stl.function = (_cast_func)_callback; | _E->_timer@_stl.function = (_cast_func)&_callback; | _E._timer@_stl.function = _callback; | _E._timer@_stl.function = &_callback; | _E._timer@_stl.function = (_cast_func)_callback; | _E._timer@_stl.function = (_cast_func)&_callback; ) // callback(unsigned long arg) @change_callback_handle_cast depends on change_timer_function_usage@ identifier change_timer_function_usage._callback; identifier change_timer_function_usage._timer; type _origtype; identifier _origarg; type _handletype; identifier _handle; @@ void _callback( -_origtype _origarg +struct timer_list *t ) { ( ... when != _origarg _handletype *_handle = -(_handletype *)_origarg; +from_timer(_handle, t, _timer); ... when != _origarg | ... when != _origarg _handletype *_handle = -(void *)_origarg; +from_timer(_handle, t, _timer); ... when != _origarg | ... when != _origarg _handletype *_handle; ... when != _handle _handle = -(_handletype *)_origarg; +from_timer(_handle, t, _timer); ... when != _origarg | ... when != _origarg _handletype *_handle; ... when != _handle _handle = -(void *)_origarg; +from_timer(_handle, t, _timer); ... 
when != _origarg ) } // callback(unsigned long arg) without existing variable @change_callback_handle_cast_no_arg depends on change_timer_function_usage && !change_callback_handle_cast@ identifier change_timer_function_usage._callback; identifier change_timer_function_usage._timer; type _origtype; identifier _origarg; type _handletype; @@ void _callback( -_origtype _origarg +struct timer_list *t ) { + _handletype *_origarg = from_timer(_origarg, t, _timer); + ... when != _origarg - (_handletype *)_origarg + _origarg ... when != _origarg } // Avoid already converted callbacks. @match_callback_converted depends on change_timer_function_usage && !change_callback_handle_cast && !change_callback_handle_cast_no_arg@ identifier change_timer_function_usage._callback; identifier t; @@ void _callback(struct timer_list *t) { ... } // callback(struct something *handle) @change_callback_handle_arg depends on change_timer_function_usage && !match_callback_converted && !change_callback_handle_cast && !change_callback_handle_cast_no_arg@ identifier change_timer_function_usage._callback; identifier change_timer_function_usage._timer; type _handletype; identifier _handle; @@ void _callback( -_handletype *_handle +struct timer_list *t ) { + _handletype *_handle = from_timer(_handle, t, _timer); ... } // If change_callback_handle_arg ran on an empty function, remove // the added handler. @unchange_callback_handle_arg depends on change_timer_function_usage && change_callback_handle_arg@ identifier change_timer_function_usage._callback; identifier change_timer_function_usage._timer; type _handletype; identifier _handle; identifier t; @@ void _callback(struct timer_list *t) { - _handletype *_handle = from_timer(_handle, t, _timer); } // We only want to refactor the setup_timer() data argument if we've found // the matching callback. This undoes changes in change_timer_function_usage. 
@unchange_timer_function_usage depends on change_timer_function_usage && !change_callback_handle_cast && !change_callback_handle_cast_no_arg && !change_callback_handle_arg@ expression change_timer_function_usage._E; identifier change_timer_function_usage._timer; identifier change_timer_function_usage._callback; type change_timer_function_usage._cast_data; @@ ( -timer_setup(&_E->_timer, _callback, 0); +setup_timer(&_E->_timer, _callback, (_cast_data)_E); | -timer_setup(&_E._timer, _callback, 0); +setup_timer(&_E._timer, _callback, (_cast_data)&_E); ) // If we fixed a callback from a .function assignment, fix the // assignment cast now. @change_timer_function_assignment depends on change_timer_function_usage && (change_callback_handle_cast || change_callback_handle_cast_no_arg || change_callback_handle_arg)@ expression change_timer_function_usage._E; identifier change_timer_function_usage._timer; identifier change_timer_function_usage._callback; type _cast_func; typedef TIMER_FUNC_TYPE; @@ ( _E->_timer.function = -_callback +(TIMER_FUNC_TYPE)_callback ; | _E->_timer.function = -&_callback +(TIMER_FUNC_TYPE)_callback ; | _E->_timer.function = -(_cast_func)_callback; +(TIMER_FUNC_TYPE)_callback ; | _E->_timer.function = -(_cast_func)&_callback +(TIMER_FUNC_TYPE)_callback ; | _E._timer.function = -_callback +(TIMER_FUNC_TYPE)_callback ; | _E._timer.function = -&_callback; +(TIMER_FUNC_TYPE)_callback ; | _E._timer.function = -(_cast_func)_callback +(TIMER_FUNC_TYPE)_callback ; | _E._timer.function = -(_cast_func)&_callback +(TIMER_FUNC_TYPE)_callback ; ) // Sometimes timer functions are called directly. Replace matched args. 
@change_timer_function_calls depends on change_timer_function_usage && (change_callback_handle_cast || change_callback_handle_cast_no_arg || change_callback_handle_arg)@ expression _E; identifier change_timer_function_usage._timer; identifier change_timer_function_usage._callback; type _cast_data; @@ _callback( ( -(_cast_data)_E +&_E->_timer | -(_cast_data)&_E +&_E._timer | -_E +&_E->_timer ) ) // If a timer has been configured without a data argument, it can be // converted without regard to the callback argument, since it is unused. @match_timer_function_unused_data@ expression _E; identifier _timer; identifier _callback; @@ ( -setup_timer(&_E->_timer, _callback, 0); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, _callback, 0L); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, _callback, 0UL); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E._timer, _callback, 0); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, _callback, 0L); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, _callback, 0UL); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_timer, _callback, 0); +timer_setup(&_timer, _callback, 0); | -setup_timer(&_timer, _callback, 0L); +timer_setup(&_timer, _callback, 0); | -setup_timer(&_timer, _callback, 0UL); +timer_setup(&_timer, _callback, 0); | -setup_timer(_timer, _callback, 0); +timer_setup(_timer, _callback, 0); | -setup_timer(_timer, _callback, 0L); +timer_setup(_timer, _callback, 0); | -setup_timer(_timer, _callback, 0UL); +timer_setup(_timer, _callback, 0); ) @change_callback_unused_data depends on match_timer_function_unused_data@ identifier match_timer_function_unused_data._callback; type _origtype; identifier _origarg; @@ void _callback( -_origtype _origarg +struct timer_list *unused ) { ... when != _origarg } Signed-off-by: Kees Cook <keescook@chromium.org>
2017-10-16 14:43:17 -07:00
static void ofdpa_fdb_cleanup(struct timer_list *t)
{
treewide: setup_timer() -> timer_setup() This converts all remaining cases of the old setup_timer() API into using timer_setup(), where the callback argument is the structure already holding the struct timer_list. These should have no behavioral changes, since they just change which pointer is passed into the callback with the same available pointers after conversion. It handles the following examples, in addition to some other variations. Casting from unsigned long: void my_callback(unsigned long data) { struct something *ptr = (struct something *)data; ... } ... setup_timer(&ptr->my_timer, my_callback, ptr); and forced object casts: void my_callback(struct something *ptr) { ... } ... setup_timer(&ptr->my_timer, my_callback, (unsigned long)ptr); become: void my_callback(struct timer_list *t) { struct something *ptr = from_timer(ptr, t, my_timer); ... } ... timer_setup(&ptr->my_timer, my_callback, 0); Direct function assignments: void my_callback(unsigned long data) { struct something *ptr = (struct something *)data; ... } ... ptr->my_timer.function = my_callback; have a temporary cast added, along with converting the args: void my_callback(struct timer_list *t) { struct something *ptr = from_timer(ptr, t, my_timer); ... } ... ptr->my_timer.function = (TIMER_FUNC_TYPE)my_callback; And finally, callbacks without a data assignment: void my_callback(unsigned long data) { ... } ... setup_timer(&ptr->my_timer, my_callback, 0); have their argument renamed to verify they're unused during conversion: void my_callback(struct timer_list *unused) { ... } ... timer_setup(&ptr->my_timer, my_callback, 0); The conversion is done with the following Coccinelle script: spatch --very-quiet --all-includes --include-headers \ -I ./arch/x86/include -I ./arch/x86/include/generated \ -I ./include -I ./arch/x86/include/uapi \ -I ./arch/x86/include/generated/uapi -I ./include/uapi \ -I ./include/generated/uapi --include ./include/linux/kconfig.h \ --dir . 
\ --cocci-file ~/src/data/timer_setup.cocci @fix_address_of@ expression e; @@ setup_timer( -&(e) +&e , ...) // Update any raw setup_timer() usages that have a NULL callback, but // would otherwise match change_timer_function_usage, since the latter // will update all function assignments done in the face of a NULL // function initialization in setup_timer(). @change_timer_function_usage_NULL@ expression _E; identifier _timer; type _cast_data; @@ ( -setup_timer(&_E->_timer, NULL, _E); +timer_setup(&_E->_timer, NULL, 0); | -setup_timer(&_E->_timer, NULL, (_cast_data)_E); +timer_setup(&_E->_timer, NULL, 0); | -setup_timer(&_E._timer, NULL, &_E); +timer_setup(&_E._timer, NULL, 0); | -setup_timer(&_E._timer, NULL, (_cast_data)&_E); +timer_setup(&_E._timer, NULL, 0); ) @change_timer_function_usage@ expression _E; identifier _timer; struct timer_list _stl; identifier _callback; type _cast_func, _cast_data; @@ ( -setup_timer(&_E->_timer, _callback, _E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, &_callback, _E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, _callback, (_cast_data)_E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, &_callback, (_cast_data)_E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, (_cast_func)_callback, _E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, (_cast_func)&_callback, _E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, (_cast_func)_callback, (_cast_data)_E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, (_cast_func)&_callback, (_cast_data)_E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E._timer, _callback, (_cast_data)_E); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, _callback, (_cast_data)&_E); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, &_callback, (_cast_data)_E); +timer_setup(&_E._timer, _callback, 0); | 
-setup_timer(&_E._timer, &_callback, (_cast_data)&_E); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, (_cast_func)_callback, (_cast_data)_E); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, (_cast_func)_callback, (_cast_data)&_E); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, (_cast_func)&_callback, (_cast_data)_E); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, (_cast_func)&_callback, (_cast_data)&_E); +timer_setup(&_E._timer, _callback, 0); | _E->_timer@_stl.function = _callback; | _E->_timer@_stl.function = &_callback; | _E->_timer@_stl.function = (_cast_func)_callback; | _E->_timer@_stl.function = (_cast_func)&_callback; | _E._timer@_stl.function = _callback; | _E._timer@_stl.function = &_callback; | _E._timer@_stl.function = (_cast_func)_callback; | _E._timer@_stl.function = (_cast_func)&_callback; ) // callback(unsigned long arg) @change_callback_handle_cast depends on change_timer_function_usage@ identifier change_timer_function_usage._callback; identifier change_timer_function_usage._timer; type _origtype; identifier _origarg; type _handletype; identifier _handle; @@ void _callback( -_origtype _origarg +struct timer_list *t ) { ( ... when != _origarg _handletype *_handle = -(_handletype *)_origarg; +from_timer(_handle, t, _timer); ... when != _origarg | ... when != _origarg _handletype *_handle = -(void *)_origarg; +from_timer(_handle, t, _timer); ... when != _origarg | ... when != _origarg _handletype *_handle; ... when != _handle _handle = -(_handletype *)_origarg; +from_timer(_handle, t, _timer); ... when != _origarg | ... when != _origarg _handletype *_handle; ... when != _handle _handle = -(void *)_origarg; +from_timer(_handle, t, _timer); ... 
when != _origarg ) } // callback(unsigned long arg) without existing variable @change_callback_handle_cast_no_arg depends on change_timer_function_usage && !change_callback_handle_cast@ identifier change_timer_function_usage._callback; identifier change_timer_function_usage._timer; type _origtype; identifier _origarg; type _handletype; @@ void _callback( -_origtype _origarg +struct timer_list *t ) { + _handletype *_origarg = from_timer(_origarg, t, _timer); + ... when != _origarg - (_handletype *)_origarg + _origarg ... when != _origarg } // Avoid already converted callbacks. @match_callback_converted depends on change_timer_function_usage && !change_callback_handle_cast && !change_callback_handle_cast_no_arg@ identifier change_timer_function_usage._callback; identifier t; @@ void _callback(struct timer_list *t) { ... } // callback(struct something *handle) @change_callback_handle_arg depends on change_timer_function_usage && !match_callback_converted && !change_callback_handle_cast && !change_callback_handle_cast_no_arg@ identifier change_timer_function_usage._callback; identifier change_timer_function_usage._timer; type _handletype; identifier _handle; @@ void _callback( -_handletype *_handle +struct timer_list *t ) { + _handletype *_handle = from_timer(_handle, t, _timer); ... } // If change_callback_handle_arg ran on an empty function, remove // the added handler. @unchange_callback_handle_arg depends on change_timer_function_usage && change_callback_handle_arg@ identifier change_timer_function_usage._callback; identifier change_timer_function_usage._timer; type _handletype; identifier _handle; identifier t; @@ void _callback(struct timer_list *t) { - _handletype *_handle = from_timer(_handle, t, _timer); } // We only want to refactor the setup_timer() data argument if we've found // the matching callback. This undoes changes in change_timer_function_usage. 
@unchange_timer_function_usage depends on change_timer_function_usage && !change_callback_handle_cast && !change_callback_handle_cast_no_arg && !change_callback_handle_arg@ expression change_timer_function_usage._E; identifier change_timer_function_usage._timer; identifier change_timer_function_usage._callback; type change_timer_function_usage._cast_data; @@ ( -timer_setup(&_E->_timer, _callback, 0); +setup_timer(&_E->_timer, _callback, (_cast_data)_E); | -timer_setup(&_E._timer, _callback, 0); +setup_timer(&_E._timer, _callback, (_cast_data)&_E); ) // If we fixed a callback from a .function assignment, fix the // assignment cast now. @change_timer_function_assignment depends on change_timer_function_usage && (change_callback_handle_cast || change_callback_handle_cast_no_arg || change_callback_handle_arg)@ expression change_timer_function_usage._E; identifier change_timer_function_usage._timer; identifier change_timer_function_usage._callback; type _cast_func; typedef TIMER_FUNC_TYPE; @@ ( _E->_timer.function = -_callback +(TIMER_FUNC_TYPE)_callback ; | _E->_timer.function = -&_callback +(TIMER_FUNC_TYPE)_callback ; | _E->_timer.function = -(_cast_func)_callback; +(TIMER_FUNC_TYPE)_callback ; | _E->_timer.function = -(_cast_func)&_callback +(TIMER_FUNC_TYPE)_callback ; | _E._timer.function = -_callback +(TIMER_FUNC_TYPE)_callback ; | _E._timer.function = -&_callback; +(TIMER_FUNC_TYPE)_callback ; | _E._timer.function = -(_cast_func)_callback +(TIMER_FUNC_TYPE)_callback ; | _E._timer.function = -(_cast_func)&_callback +(TIMER_FUNC_TYPE)_callback ; ) // Sometimes timer functions are called directly. Replace matched args. 
@change_timer_function_calls depends on change_timer_function_usage && (change_callback_handle_cast || change_callback_handle_cast_no_arg || change_callback_handle_arg)@ expression _E; identifier change_timer_function_usage._timer; identifier change_timer_function_usage._callback; type _cast_data; @@ _callback( ( -(_cast_data)_E +&_E->_timer | -(_cast_data)&_E +&_E._timer | -_E +&_E->_timer ) ) // If a timer has been configured without a data argument, it can be // converted without regard to the callback argument, since it is unused. @match_timer_function_unused_data@ expression _E; identifier _timer; identifier _callback; @@ ( -setup_timer(&_E->_timer, _callback, 0); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, _callback, 0L); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, _callback, 0UL); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E._timer, _callback, 0); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, _callback, 0L); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, _callback, 0UL); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_timer, _callback, 0); +timer_setup(&_timer, _callback, 0); | -setup_timer(&_timer, _callback, 0L); +timer_setup(&_timer, _callback, 0); | -setup_timer(&_timer, _callback, 0UL); +timer_setup(&_timer, _callback, 0); | -setup_timer(_timer, _callback, 0); +timer_setup(_timer, _callback, 0); | -setup_timer(_timer, _callback, 0L); +timer_setup(_timer, _callback, 0); | -setup_timer(_timer, _callback, 0UL); +timer_setup(_timer, _callback, 0); ) @change_callback_unused_data depends on match_timer_function_unused_data@ identifier match_timer_function_unused_data._callback; type _origtype; identifier _origarg; @@ void _callback( -_origtype _origarg +struct timer_list *unused ) { ... when != _origarg } Signed-off-by: Kees Cook <keescook@chromium.org>
2017-10-16 14:43:17 -07:00
struct ofdpa *ofdpa = from_timer(ofdpa, t, fdb_cleanup_timer);
struct ofdpa_port *ofdpa_port;
struct ofdpa_fdb_tbl_entry *entry;
struct hlist_node *tmp;
unsigned long next_timer = jiffies + ofdpa->ageing_time;
unsigned long expires;
unsigned long lock_flags;
int flags = OFDPA_OP_FLAG_NOWAIT | OFDPA_OP_FLAG_REMOVE |
OFDPA_OP_FLAG_LEARNED;
int bkt;
spin_lock_irqsave(&ofdpa->fdb_tbl_lock, lock_flags);
hash_for_each_safe(ofdpa->fdb_tbl, bkt, tmp, entry, entry) {
if (!entry->learned)
continue;
ofdpa_port = entry->key.ofdpa_port;
expires = entry->touched + ofdpa_port->ageing_time;
if (time_before_eq(expires, jiffies)) {
ofdpa_port_fdb_learn(ofdpa_port, flags,
entry->key.addr,
entry->key.vlan_id);
hash_del(&entry->entry);
} else if (time_before(expires, next_timer)) {
next_timer = expires;
}
}
spin_unlock_irqrestore(&ofdpa->fdb_tbl_lock, lock_flags);
mod_timer(&ofdpa->fdb_cleanup_timer, round_jiffies_up(next_timer));
}
/* Program the two termination-MAC flow entries for a port/VLAN pair: one
 * matching ETH_P_IP and one matching ETH_P_IPV6. Both match the port's own
 * pport and its netdev MAC address (masked with ff_mac).
 *
 * @flags is handed unchanged to ofdpa_flow_tbl_term_mac(); callers in this
 * file pass 0 or OFDPA_OP_FLAG_REMOVE.
 * A @vlan_id of zero is replaced by the port's internal VLAN ID.
 *
 * Returns 0 on success, otherwise the first error reported by
 * ofdpa_flow_tbl_term_mac(). If the ETH_P_IP entry fails, the ETH_P_IPV6
 * entry is not attempted.
 */
static int ofdpa_port_router_mac(struct ofdpa_port *ofdpa_port,
				 int flags, __be16 vlan_id)
{
	const u32 pport_mask = 0xffffffff;
	const __be16 vid_mask = htons(0xffff);
	int err;

	if (!ntohs(vlan_id))
		vlan_id = ofdpa_port->internal_vlan_id;

	err = ofdpa_flow_tbl_term_mac(ofdpa_port, ofdpa_port->pport,
				      pport_mask, htons(ETH_P_IP),
				      ofdpa_port->dev->dev_addr,
				      ff_mac, vlan_id, vid_mask,
				      false, flags);
	if (err)
		return err;

	return ofdpa_flow_tbl_term_mac(ofdpa_port, ofdpa_port->pport,
				       pport_mask, htons(ETH_P_IPV6),
				       ofdpa_port->dev->dev_addr,
				       ff_mac, vlan_id, vid_mask,
				       false, flags);
}
/* Apply the port's current STP state to the L2 interface group of every
 * VLAN set in the port's vlan_bitmap (VIDs 1 .. VLAN_N_VID - 1).
 *
 * Returns 0 on success, or the first error from ofdpa_group_l2_interface();
 * the walk stops at that VLAN.
 */
static int ofdpa_port_fwding(struct ofdpa_port *ofdpa_port, int flags)
{
	const u32 out_pport = ofdpa_port->pport;
	bool fwd_enabled;
	u16 vid;

	/* Port will be forwarding-enabled if its STP state is LEARNING
	 * or FORWARDING.  Traffic from CPU can still egress, regardless of
	 * port STP state.  Use L2 interface group on port VLANs as a way
	 * to toggle port forwarding: if forwarding is disabled, L2
	 * interface group will not exist.
	 */
	fwd_enabled = ofdpa_port->stp_state == BR_STATE_LEARNING ||
		      ofdpa_port->stp_state == BR_STATE_FORWARDING;
	if (!fwd_enabled)
		flags |= OFDPA_OP_FLAG_REMOVE;

	for (vid = 1; vid < VLAN_N_VID; vid++) {
		__be16 vlan_id;
		int err;

		if (!test_bit(vid, ofdpa_port->vlan_bitmap))
			continue;

		vlan_id = htons(vid);
		err = ofdpa_group_l2_interface(ofdpa_port, flags,
					       vlan_id, out_pport,
					       ofdpa_vlan_id_is_internal(vlan_id));
		if (!err)
			continue;

		netdev_err(ofdpa_port->dev, "Error (%d) port VLAN l2 group for pport %d\n",
			   err, out_pport);
		return err;
	}

	return 0;
}
/* Move the port to STP state @state.
 *
 * Works out which control entries (indexed like ofdpa_ctrls[]) the new
 * state wants, adds or removes the ones that differ from the port's current
 * ctrls[] bookkeeping, then flushes the port's FDB and refreshes forwarding
 * via ofdpa_port_fwding().
 *
 * Returns 0 on success or if the port is already in @state. On any failure
 * the port's ctrls[] array and stp_state are put back to their previous
 * values and the error is returned. Only this software bookkeeping is
 * restored here; ofdpa_port_ctrl() calls that already succeeded are not
 * reverted by this function.
 */
static int ofdpa_port_stp_update(struct ofdpa_port *ofdpa_port,
				 int flags, u8 state)
{
	bool want[OFDPA_CTRL_MAX] = { 0, };
	bool saved_ctrls[OFDPA_CTRL_MAX];
	const u8 saved_state = ofdpa_port->stp_state;
	int err;
	int i;

	if (saved_state == state)
		return 0;

	/* Snapshot for rollback on error */
	memcpy(saved_ctrls, ofdpa_port->ctrls, sizeof(saved_ctrls));
	ofdpa_port->stp_state = state;

	if (state == BR_STATE_LISTENING || state == BR_STATE_BLOCKING) {
		want[OFDPA_CTRL_LINK_LOCAL_MCAST] = true;
	} else if (state == BR_STATE_LEARNING ||
		   state == BR_STATE_FORWARDING) {
		if (!ofdpa_port_is_ovsed(ofdpa_port))
			want[OFDPA_CTRL_LINK_LOCAL_MCAST] = true;
		want[OFDPA_CTRL_IPV4_MCAST] = true;
		want[OFDPA_CTRL_IPV6_MCAST] = true;
		if (ofdpa_port_is_bridged(ofdpa_port))
			want[OFDPA_CTRL_DFLT_BRIDGING] = true;
		else if (ofdpa_port_is_ovsed(ofdpa_port))
			want[OFDPA_CTRL_DFLT_OVS] = true;
		else
			want[OFDPA_CTRL_LOCAL_ARP] = true;
	}
	/* BR_STATE_DISABLED (or any other value): port is completely
	 * disabled, nothing is wanted.
	 */

	for (i = 0; i < OFDPA_CTRL_MAX; i++) {
		int ctrl_flags = flags;

		if (want[i] == ofdpa_port->ctrls[i])
			continue;

		if (!want[i])
			ctrl_flags |= OFDPA_OP_FLAG_REMOVE;

		err = ofdpa_port_ctrl(ofdpa_port, ctrl_flags,
				      &ofdpa_ctrls[i]);
		if (err)
			goto err_restore;

		ofdpa_port->ctrls[i] = want[i];
	}

	err = ofdpa_port_fdb_flush(ofdpa_port, flags);
	if (err)
		goto err_restore;

	err = ofdpa_port_fwding(ofdpa_port, flags);
	if (err)
		goto err_restore;

	return 0;

err_restore:
	memcpy(ofdpa_port->ctrls, saved_ctrls, sizeof(saved_ctrls));
	ofdpa_port->stp_state = saved_state;
	return err;
}
/* Enable forwarding on a port that is not a bridge member by driving it to
 * BR_STATE_FORWARDING. For a bridged port this does nothing and returns 0.
 *
 * Returns 0 or the error from ofdpa_port_stp_update().
 */
static int ofdpa_port_fwd_enable(struct ofdpa_port *ofdpa_port, int flags)
{
	/* port is not bridged, so simulate going to FORWARDING state */
	if (!ofdpa_port_is_bridged(ofdpa_port))
		return ofdpa_port_stp_update(ofdpa_port, flags,
					     BR_STATE_FORWARDING);

	/* bridge STP will enable port */
	return 0;
}
/* Disable forwarding on a port that is not a bridge member by driving it to
 * BR_STATE_DISABLED. For a bridged port this does nothing and returns 0.
 *
 * Returns 0 or the error from ofdpa_port_stp_update().
 */
static int ofdpa_port_fwd_disable(struct ofdpa_port *ofdpa_port, int flags)
{
	/* port is not bridged, so simulate going to DISABLED state */
	if (!ofdpa_port_is_bridged(ofdpa_port))
		return ofdpa_port_stp_update(ofdpa_port, flags,
					     BR_STATE_DISABLED);

	/* bridge STP will disable port */
	return 0;
}
/* Add VLAN @vid to the port, then install the router-MAC entries for it.
 * If the router-MAC step fails, the VLAN is removed again (the result of
 * that removal is ignored) and the router-MAC error is returned.
 *
 * @flags is currently unused by this function.
 *
 * Returns 0 on success or a negative error from the failing step.
 */
static int ofdpa_port_vlan_add(struct ofdpa_port *ofdpa_port,
			       u16 vid, u16 flags)
{
	int err;

	/* XXX deal with flags for PVID and untagged */

	err = ofdpa_port_vlan(ofdpa_port, 0, vid);
	if (err)
		return err;

	err = ofdpa_port_router_mac(ofdpa_port, 0, htons(vid));
	if (!err)
		return 0;

	/* Undo the VLAN add done above */
	ofdpa_port_vlan(ofdpa_port, OFDPA_OP_FLAG_REMOVE, vid);

	return err;
}
/* Remove VLAN @vid from the port: first the router-MAC entries, then the
 * VLAN itself. If removing the router-MAC entries fails, the VLAN is left
 * in place and that error is returned.
 *
 * @flags is unused by this function.
 */
static int ofdpa_port_vlan_del(struct ofdpa_port *ofdpa_port,
			       u16 vid, u16 flags)
{
	int err = ofdpa_port_router_mac(ofdpa_port, OFDPA_OP_FLAG_REMOVE,
					htons(vid));

	return err ? err : ofdpa_port_vlan(ofdpa_port, OFDPA_OP_FLAG_REMOVE,
					   vid);
}
static struct ofdpa_internal_vlan_tbl_entry *
ofdpa_internal_vlan_tbl_find(const struct ofdpa *ofdpa, int ifindex)
{
struct ofdpa_internal_vlan_tbl_entry *found;
hash_for_each_possible(ofdpa->internal_vlan_tbl, found,
entry, ifindex) {
if (found->ifindex == ifindex)
return found;
}
return NULL;
}
static __be16 ofdpa_port_internal_vlan_id_get(struct ofdpa_port *ofdpa_port,
int ifindex)
{
struct ofdpa *ofdpa = ofdpa_port->ofdpa;
struct ofdpa_internal_vlan_tbl_entry *entry;
struct ofdpa_internal_vlan_tbl_entry *found;
unsigned long lock_flags;
int i;
entry = kzalloc(sizeof(*entry), GFP_KERNEL);
if (!entry)
return 0;
entry->ifindex = ifindex;
spin_lock_irqsave(&ofdpa->internal_vlan_tbl_lock, lock_flags);
found = ofdpa_internal_vlan_tbl_find(ofdpa, ifindex);
if (found) {
kfree(entry);
goto found;
}
found = entry;
hash_add(ofdpa->internal_vlan_tbl, &found->entry, found->ifindex);
for (i = 0; i < OFDPA_N_INTERNAL_VLANS; i++) {
if (test_and_set_bit(i, ofdpa->internal_vlan_bitmap))
continue;
found->vlan_id = htons(OFDPA_INTERNAL_VLAN_ID_BASE + i);
goto found;
}
netdev_err(ofdpa_port->dev, "Out of internal VLAN IDs\n");
found:
found->ref_count++;
spin_unlock_irqrestore(&ofdpa->internal_vlan_tbl_lock, lock_flags);
return found->vlan_id;
}
/* Install (or otherwise update, according to @flags) the IPv4 unicast
 * routing flow for @dst/@dst_len.
 *
 * Only the first nexthop of @fi is considered. If that nexthop has an IPv4
 * gateway and egresses through this port's netdev, the route points at an
 * L3 unicast group for the gateway; otherwise it points at the L2 interface
 * group built from the port's internal VLAN and pport 0 so the traffic is
 * sent to the CPU.
 *
 * @tb_id is unused by this function.
 *
 * Returns 0 on success or a negative error from ofdpa_port_ipv4_nh() or
 * ofdpa_flow_tbl_ucast4_routing().
 */
static int ofdpa_port_fib_ipv4(struct ofdpa_port *ofdpa_port, __be32 dst,
			       int dst_len, struct fib_info *fi, u32 tb_id,
			       int flags)
{
	const __be32 dst_mask = inet_make_mask(dst_len);
	const __be16 internal_vlan_id = ofdpa_port->internal_vlan_id;
	const u32 priority = fi->fib_priority;
	const struct fib_nh *nh;
	u32 group_id;
	u32 nh_index;
	int err;

	/* XXX support ECMP */
	nh = fib_info_nh(fi, 0);

	if (nh->fib_nh_gw4 && nh->fib_nh_dev == ofdpa_port->dev) {
		err = ofdpa_port_ipv4_nh(ofdpa_port, flags,
					 nh->fib_nh_gw4, &nh_index);
		if (err)
			return err;

		group_id = ROCKER_GROUP_L3_UNICAST(nh_index);
	} else {
		/* Send to CPU for processing */
		group_id = ROCKER_GROUP_L2_INTERFACE(internal_vlan_id, 0);
	}

	err = ofdpa_flow_tbl_ucast4_routing(ofdpa_port, htons(ETH_P_IP), dst,
					    dst_mask, priority,
					    ROCKER_OF_DPA_TABLE_ID_ACL_POLICY,
					    group_id, fi, flags);
	if (err)
		netdev_err(ofdpa_port->dev, "Error (%d) IPv4 route %pI4\n",
			   err, &dst);

	return err;
}
/* Drop one reference on the internal VLAN entry for @ifindex.
 *
 * When the reference count reaches zero (or below) the entry's ID is
 * returned to the internal VLAN bitmap and the entry is unhashed and freed.
 * If no entry exists for @ifindex an error is logged and nothing else
 * happens.
 */
static void
ofdpa_port_internal_vlan_id_put(const struct ofdpa_port *ofdpa_port,
				int ifindex)
{
	struct ofdpa *ofdpa = ofdpa_port->ofdpa;
	struct ofdpa_internal_vlan_tbl_entry *found;
	unsigned long lock_flags;
	unsigned long bit;

	spin_lock_irqsave(&ofdpa->internal_vlan_tbl_lock, lock_flags);

	found = ofdpa_internal_vlan_tbl_find(ofdpa, ifindex);
	if (!found) {
		netdev_err(ofdpa_port->dev,
			   "ifindex (%d) not found in internal VLAN tbl\n",
			   ifindex);
		goto not_found;
	}

	if (--found->ref_count <= 0) {
		/* An entry can sit in the table with vlan_id == 0 when
		 * ofdpa_port_internal_vlan_id_get() ran out of IDs. Such an
		 * entry owns no bitmap bit, and subtracting the base from 0
		 * would wrap to an index far outside the bitmap, so only
		 * release a bit for entries that were assigned an ID.
		 */
		if (found->vlan_id) {
			bit = ntohs(found->vlan_id) -
			      OFDPA_INTERNAL_VLAN_ID_BASE;
			clear_bit(bit, ofdpa->internal_vlan_bitmap);
		}
		hash_del(&found->entry);
		kfree(found);
	}

not_found:
	spin_unlock_irqrestore(&ofdpa->internal_vlan_tbl_lock, lock_flags);
}
/**********************************
* Rocker world ops implementation
**********************************/
static int ofdpa_init(struct rocker *rocker)
{
struct ofdpa *ofdpa = rocker->wpriv;
ofdpa->rocker = rocker;
hash_init(ofdpa->flow_tbl);
spin_lock_init(&ofdpa->flow_tbl_lock);
hash_init(ofdpa->group_tbl);
spin_lock_init(&ofdpa->group_tbl_lock);
hash_init(ofdpa->fdb_tbl);
spin_lock_init(&ofdpa->fdb_tbl_lock);
hash_init(ofdpa->internal_vlan_tbl);
spin_lock_init(&ofdpa->internal_vlan_tbl_lock);
hash_init(ofdpa->neigh_tbl);
spin_lock_init(&ofdpa->neigh_tbl_lock);
treewide: setup_timer() -> timer_setup() This converts all remaining cases of the old setup_timer() API into using timer_setup(), where the callback argument is the structure already holding the struct timer_list. These should have no behavioral changes, since they just change which pointer is passed into the callback with the same available pointers after conversion. It handles the following examples, in addition to some other variations. Casting from unsigned long: void my_callback(unsigned long data) { struct something *ptr = (struct something *)data; ... } ... setup_timer(&ptr->my_timer, my_callback, ptr); and forced object casts: void my_callback(struct something *ptr) { ... } ... setup_timer(&ptr->my_timer, my_callback, (unsigned long)ptr); become: void my_callback(struct timer_list *t) { struct something *ptr = from_timer(ptr, t, my_timer); ... } ... timer_setup(&ptr->my_timer, my_callback, 0); Direct function assignments: void my_callback(unsigned long data) { struct something *ptr = (struct something *)data; ... } ... ptr->my_timer.function = my_callback; have a temporary cast added, along with converting the args: void my_callback(struct timer_list *t) { struct something *ptr = from_timer(ptr, t, my_timer); ... } ... ptr->my_timer.function = (TIMER_FUNC_TYPE)my_callback; And finally, callbacks without a data assignment: void my_callback(unsigned long data) { ... } ... setup_timer(&ptr->my_timer, my_callback, 0); have their argument renamed to verify they're unused during conversion: void my_callback(struct timer_list *unused) { ... } ... timer_setup(&ptr->my_timer, my_callback, 0); The conversion is done with the following Coccinelle script: spatch --very-quiet --all-includes --include-headers \ -I ./arch/x86/include -I ./arch/x86/include/generated \ -I ./include -I ./arch/x86/include/uapi \ -I ./arch/x86/include/generated/uapi -I ./include/uapi \ -I ./include/generated/uapi --include ./include/linux/kconfig.h \ --dir . 
\ --cocci-file ~/src/data/timer_setup.cocci @fix_address_of@ expression e; @@ setup_timer( -&(e) +&e , ...) // Update any raw setup_timer() usages that have a NULL callback, but // would otherwise match change_timer_function_usage, since the latter // will update all function assignments done in the face of a NULL // function initialization in setup_timer(). @change_timer_function_usage_NULL@ expression _E; identifier _timer; type _cast_data; @@ ( -setup_timer(&_E->_timer, NULL, _E); +timer_setup(&_E->_timer, NULL, 0); | -setup_timer(&_E->_timer, NULL, (_cast_data)_E); +timer_setup(&_E->_timer, NULL, 0); | -setup_timer(&_E._timer, NULL, &_E); +timer_setup(&_E._timer, NULL, 0); | -setup_timer(&_E._timer, NULL, (_cast_data)&_E); +timer_setup(&_E._timer, NULL, 0); ) @change_timer_function_usage@ expression _E; identifier _timer; struct timer_list _stl; identifier _callback; type _cast_func, _cast_data; @@ ( -setup_timer(&_E->_timer, _callback, _E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, &_callback, _E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, _callback, (_cast_data)_E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, &_callback, (_cast_data)_E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, (_cast_func)_callback, _E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, (_cast_func)&_callback, _E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, (_cast_func)_callback, (_cast_data)_E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, (_cast_func)&_callback, (_cast_data)_E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E._timer, _callback, (_cast_data)_E); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, _callback, (_cast_data)&_E); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, &_callback, (_cast_data)_E); +timer_setup(&_E._timer, _callback, 0); | 
-setup_timer(&_E._timer, &_callback, (_cast_data)&_E); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, (_cast_func)_callback, (_cast_data)_E); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, (_cast_func)_callback, (_cast_data)&_E); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, (_cast_func)&_callback, (_cast_data)_E); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, (_cast_func)&_callback, (_cast_data)&_E); +timer_setup(&_E._timer, _callback, 0); | _E->_timer@_stl.function = _callback; | _E->_timer@_stl.function = &_callback; | _E->_timer@_stl.function = (_cast_func)_callback; | _E->_timer@_stl.function = (_cast_func)&_callback; | _E._timer@_stl.function = _callback; | _E._timer@_stl.function = &_callback; | _E._timer@_stl.function = (_cast_func)_callback; | _E._timer@_stl.function = (_cast_func)&_callback; ) // callback(unsigned long arg) @change_callback_handle_cast depends on change_timer_function_usage@ identifier change_timer_function_usage._callback; identifier change_timer_function_usage._timer; type _origtype; identifier _origarg; type _handletype; identifier _handle; @@ void _callback( -_origtype _origarg +struct timer_list *t ) { ( ... when != _origarg _handletype *_handle = -(_handletype *)_origarg; +from_timer(_handle, t, _timer); ... when != _origarg | ... when != _origarg _handletype *_handle = -(void *)_origarg; +from_timer(_handle, t, _timer); ... when != _origarg | ... when != _origarg _handletype *_handle; ... when != _handle _handle = -(_handletype *)_origarg; +from_timer(_handle, t, _timer); ... when != _origarg | ... when != _origarg _handletype *_handle; ... when != _handle _handle = -(void *)_origarg; +from_timer(_handle, t, _timer); ... 
when != _origarg ) } // callback(unsigned long arg) without existing variable @change_callback_handle_cast_no_arg depends on change_timer_function_usage && !change_callback_handle_cast@ identifier change_timer_function_usage._callback; identifier change_timer_function_usage._timer; type _origtype; identifier _origarg; type _handletype; @@ void _callback( -_origtype _origarg +struct timer_list *t ) { + _handletype *_origarg = from_timer(_origarg, t, _timer); + ... when != _origarg - (_handletype *)_origarg + _origarg ... when != _origarg } // Avoid already converted callbacks. @match_callback_converted depends on change_timer_function_usage && !change_callback_handle_cast && !change_callback_handle_cast_no_arg@ identifier change_timer_function_usage._callback; identifier t; @@ void _callback(struct timer_list *t) { ... } // callback(struct something *handle) @change_callback_handle_arg depends on change_timer_function_usage && !match_callback_converted && !change_callback_handle_cast && !change_callback_handle_cast_no_arg@ identifier change_timer_function_usage._callback; identifier change_timer_function_usage._timer; type _handletype; identifier _handle; @@ void _callback( -_handletype *_handle +struct timer_list *t ) { + _handletype *_handle = from_timer(_handle, t, _timer); ... } // If change_callback_handle_arg ran on an empty function, remove // the added handler. @unchange_callback_handle_arg depends on change_timer_function_usage && change_callback_handle_arg@ identifier change_timer_function_usage._callback; identifier change_timer_function_usage._timer; type _handletype; identifier _handle; identifier t; @@ void _callback(struct timer_list *t) { - _handletype *_handle = from_timer(_handle, t, _timer); } // We only want to refactor the setup_timer() data argument if we've found // the matching callback. This undoes changes in change_timer_function_usage. 
@unchange_timer_function_usage depends on change_timer_function_usage && !change_callback_handle_cast && !change_callback_handle_cast_no_arg && !change_callback_handle_arg@ expression change_timer_function_usage._E; identifier change_timer_function_usage._timer; identifier change_timer_function_usage._callback; type change_timer_function_usage._cast_data; @@ ( -timer_setup(&_E->_timer, _callback, 0); +setup_timer(&_E->_timer, _callback, (_cast_data)_E); | -timer_setup(&_E._timer, _callback, 0); +setup_timer(&_E._timer, _callback, (_cast_data)&_E); ) // If we fixed a callback from a .function assignment, fix the // assignment cast now. @change_timer_function_assignment depends on change_timer_function_usage && (change_callback_handle_cast || change_callback_handle_cast_no_arg || change_callback_handle_arg)@ expression change_timer_function_usage._E; identifier change_timer_function_usage._timer; identifier change_timer_function_usage._callback; type _cast_func; typedef TIMER_FUNC_TYPE; @@ ( _E->_timer.function = -_callback +(TIMER_FUNC_TYPE)_callback ; | _E->_timer.function = -&_callback +(TIMER_FUNC_TYPE)_callback ; | _E->_timer.function = -(_cast_func)_callback; +(TIMER_FUNC_TYPE)_callback ; | _E->_timer.function = -(_cast_func)&_callback +(TIMER_FUNC_TYPE)_callback ; | _E._timer.function = -_callback +(TIMER_FUNC_TYPE)_callback ; | _E._timer.function = -&_callback; +(TIMER_FUNC_TYPE)_callback ; | _E._timer.function = -(_cast_func)_callback +(TIMER_FUNC_TYPE)_callback ; | _E._timer.function = -(_cast_func)&_callback +(TIMER_FUNC_TYPE)_callback ; ) // Sometimes timer functions are called directly. Replace matched args. 
@change_timer_function_calls depends on change_timer_function_usage && (change_callback_handle_cast || change_callback_handle_cast_no_arg || change_callback_handle_arg)@ expression _E; identifier change_timer_function_usage._timer; identifier change_timer_function_usage._callback; type _cast_data; @@ _callback( ( -(_cast_data)_E +&_E->_timer | -(_cast_data)&_E +&_E._timer | -_E +&_E->_timer ) ) // If a timer has been configured without a data argument, it can be // converted without regard to the callback argument, since it is unused. @match_timer_function_unused_data@ expression _E; identifier _timer; identifier _callback; @@ ( -setup_timer(&_E->_timer, _callback, 0); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, _callback, 0L); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, _callback, 0UL); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E._timer, _callback, 0); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, _callback, 0L); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, _callback, 0UL); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_timer, _callback, 0); +timer_setup(&_timer, _callback, 0); | -setup_timer(&_timer, _callback, 0L); +timer_setup(&_timer, _callback, 0); | -setup_timer(&_timer, _callback, 0UL); +timer_setup(&_timer, _callback, 0); | -setup_timer(_timer, _callback, 0); +timer_setup(_timer, _callback, 0); | -setup_timer(_timer, _callback, 0L); +timer_setup(_timer, _callback, 0); | -setup_timer(_timer, _callback, 0UL); +timer_setup(_timer, _callback, 0); ) @change_callback_unused_data depends on match_timer_function_unused_data@ identifier match_timer_function_unused_data._callback; type _origtype; identifier _origarg; @@ void _callback( -_origtype _origarg +struct timer_list *unused ) { ... when != _origarg } Signed-off-by: Kees Cook <keescook@chromium.org>
2017-10-16 14:43:17 -07:00
timer_setup(&ofdpa->fdb_cleanup_timer, ofdpa_fdb_cleanup, 0);
mod_timer(&ofdpa->fdb_cleanup_timer, jiffies);
ofdpa->ageing_time = BR_DEFAULT_AGEING_TIME;
return 0;
}
static void ofdpa_fini(struct rocker *rocker)
{
struct ofdpa *ofdpa = rocker->wpriv;
unsigned long flags;
struct ofdpa_flow_tbl_entry *flow_entry;
struct ofdpa_group_tbl_entry *group_entry;
struct ofdpa_fdb_tbl_entry *fdb_entry;
struct ofdpa_internal_vlan_tbl_entry *internal_vlan_entry;
struct ofdpa_neigh_tbl_entry *neigh_entry;
struct hlist_node *tmp;
int bkt;
del_timer_sync(&ofdpa->fdb_cleanup_timer);
flush_workqueue(rocker->rocker_owq);
spin_lock_irqsave(&ofdpa->flow_tbl_lock, flags);
hash_for_each_safe(ofdpa->flow_tbl, bkt, tmp, flow_entry, entry)
hash_del(&flow_entry->entry);
spin_unlock_irqrestore(&ofdpa->flow_tbl_lock, flags);
spin_lock_irqsave(&ofdpa->group_tbl_lock, flags);
hash_for_each_safe(ofdpa->group_tbl, bkt, tmp, group_entry, entry)
hash_del(&group_entry->entry);
spin_unlock_irqrestore(&ofdpa->group_tbl_lock, flags);
spin_lock_irqsave(&ofdpa->fdb_tbl_lock, flags);
hash_for_each_safe(ofdpa->fdb_tbl, bkt, tmp, fdb_entry, entry)
hash_del(&fdb_entry->entry);
spin_unlock_irqrestore(&ofdpa->fdb_tbl_lock, flags);
spin_lock_irqsave(&ofdpa->internal_vlan_tbl_lock, flags);
hash_for_each_safe(ofdpa->internal_vlan_tbl, bkt,
tmp, internal_vlan_entry, entry)
hash_del(&internal_vlan_entry->entry);
spin_unlock_irqrestore(&ofdpa->internal_vlan_tbl_lock, flags);
spin_lock_irqsave(&ofdpa->neigh_tbl_lock, flags);
hash_for_each_safe(ofdpa->neigh_tbl, bkt, tmp, neigh_entry, entry)
hash_del(&neigh_entry->entry);
spin_unlock_irqrestore(&ofdpa->neigh_tbl_lock, flags);
}
/* Fill in the per-port OF-DPA private data from @rocker_port: pointers back
 * to the world state and the rocker port, the netdev and pport number, and
 * the default bridge-port flags (BR_LEARNING) and ageing time.
 *
 * Always returns 0.
 */
static int ofdpa_port_pre_init(struct rocker_port *rocker_port)
{
	struct ofdpa_port *port = rocker_port->wpriv;

	/* Links to the owning objects */
	port->rocker_port = rocker_port;
	port->ofdpa = rocker_port->rocker->wpriv;

	/* Identity copied from the rocker port */
	port->dev = rocker_port->dev;
	port->pport = rocker_port->pport;

	/* Defaults */
	port->brport_flags = BR_LEARNING;
	port->ageing_time = BR_DEFAULT_AGEING_TIME;

	return 0;
}
/* Bring up the OF-DPA state for a port: push the learning setting derived
 * from brport_flags, install the ingress port table entry, obtain an
 * internal VLAN ID keyed by the netdev's ifindex and add the untagged VLAN.
 *
 * Returns 0 on success. If the untagged VLAN cannot be added, the ingress
 * port table entry is removed again and the error is returned.
 *
 * NOTE(review): the error path does not call
 * ofdpa_port_internal_vlan_id_put() for the ID obtained here - confirm
 * whether that is intentional.
 */
static int ofdpa_port_init(struct rocker_port *rocker_port)
{
	struct ofdpa_port *ofdpa_port = rocker_port->wpriv;
	int err;

	rocker_port_set_learning(rocker_port,
				 !!(ofdpa_port->brport_flags & BR_LEARNING));

	err = ofdpa_port_ig_tbl(ofdpa_port, 0);
	if (err) {
		netdev_err(ofdpa_port->dev, "install ig port table failed\n");
		return err;
	}

	ofdpa_port->internal_vlan_id =
		ofdpa_port_internal_vlan_id_get(ofdpa_port,
						ofdpa_port->dev->ifindex);

	err = ofdpa_port_vlan_add(ofdpa_port, OFDPA_UNTAGGED_VID, 0);
	if (!err)
		return 0;

	netdev_err(ofdpa_port->dev, "install untagged VLAN failed\n");

	/* Roll back the ingress port table entry installed above */
	ofdpa_port_ig_tbl(ofdpa_port, OFDPA_OP_FLAG_REMOVE);

	return err;
}
/* Port teardown: remove the port's ingress port table entry. The result of
 * the removal is not checked.
 */
static void ofdpa_port_fini(struct rocker_port *rocker_port)
{
	struct ofdpa_port *port = rocker_port->wpriv;

	ofdpa_port_ig_tbl(port, OFDPA_OP_FLAG_REMOVE);
}
/* Port open hook: enable forwarding on the port with no extra op flags.
 *
 * Returns the result of ofdpa_port_fwd_enable().
 */
static int ofdpa_port_open(struct rocker_port *rocker_port)
{
	struct ofdpa_port *port = rocker_port->wpriv;

	return ofdpa_port_fwd_enable(port, 0);
}
/* Port stop hook: disable forwarding on the port, passing
 * OFDPA_OP_FLAG_NOWAIT. Any error from ofdpa_port_fwd_disable() is dropped
 * because this hook returns void.
 */
static void ofdpa_port_stop(struct rocker_port *rocker_port)
{
	struct ofdpa_port *port = rocker_port->wpriv;

	ofdpa_port_fwd_disable(port, OFDPA_OP_FLAG_NOWAIT);
}
/* Attribute hook: set the port's STP state to @state with no extra op
 * flags.
 *
 * Returns the result of ofdpa_port_stp_update().
 */
static int ofdpa_port_attr_stp_state_set(struct rocker_port *rocker_port,
					 u8 state)
{
	struct ofdpa_port *port = rocker_port->wpriv;

	return ofdpa_port_stp_update(port, 0, state);
}
static int ofdpa_port_attr_bridge_flags_set(struct rocker_port *rocker_port,
net: switchdev: remove the transaction structure from port attributes Since the introduction of the switchdev API, port attributes were transmitted to drivers for offloading using a two-step transactional model, with a prepare phase that was supposed to catch all errors, and a commit phase that was supposed to never fail. Some classes of failures can never be avoided, like hardware access, or memory allocation. In the latter case, merely attempting to move the memory allocation to the preparation phase makes it impossible to avoid memory leaks, since commit 91cf8eceffc1 ("switchdev: Remove unused transaction item queue") which has removed the unused mechanism of passing on the allocated memory between one phase and another. It is time we admit that separating the preparation from the commit phase is something that is best left for the driver to decide, and not something that should be baked into the API, especially since there are no switchdev callers that depend on this. This patch removes the struct switchdev_trans member from switchdev port attribute notifier structures, and converts drivers to not look at this member. In part, this patch contains a revert of my previous commit 2e554a7a5d8a ("net: dsa: propagate switchdev vlan_filtering prepare phase to drivers"). For the most part, the conversion was trivial except for: - Rocker's world implementation based on Broadcom OF-DPA had an odd implementation of ofdpa_port_attr_bridge_flags_set. The conversion was done mechanically, by pasting the implementation twice, then only keeping the code that would get executed during prepare phase on top, then only keeping the code that gets executed during the commit phase on bottom, then simplifying the resulting code until this was obtained. - DSA's offloading of STP state, bridge flags, VLAN filtering and multicast router could be converted right away. But the ageing time could not, so a shim was introduced and this was left for a further commit. 
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Acked-by: Linus Walleij <linus.walleij@linaro.org> Acked-by: Jiri Pirko <jiri@nvidia.com> Reviewed-by: Kurt Kanzenbach <kurt@linutronix.de> # hellcreek Reviewed-by: Linus Walleij <linus.walleij@linaro.org> # RTL8366RB Reviewed-by: Ido Schimmel <idosch@nvidia.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-01-09 02:01:50 +02:00
unsigned long brport_flags)
{
struct ofdpa_port *ofdpa_port = rocker_port->wpriv;
unsigned long orig_flags;
int err = 0;
orig_flags = ofdpa_port->brport_flags;
ofdpa_port->brport_flags = brport_flags;
net: switchdev: remove the transaction structure from port attributes Since the introduction of the switchdev API, port attributes were transmitted to drivers for offloading using a two-step transactional model, with a prepare phase that was supposed to catch all errors, and a commit phase that was supposed to never fail. Some classes of failures can never be avoided, like hardware access, or memory allocation. In the latter case, merely attempting to move the memory allocation to the preparation phase makes it impossible to avoid memory leaks, since commit 91cf8eceffc1 ("switchdev: Remove unused transaction item queue") which has removed the unused mechanism of passing on the allocated memory between one phase and another. It is time we admit that separating the preparation from the commit phase is something that is best left for the driver to decide, and not something that should be baked into the API, especially since there are no switchdev callers that depend on this. This patch removes the struct switchdev_trans member from switchdev port attribute notifier structures, and converts drivers to not look at this member. In part, this patch contains a revert of my previous commit 2e554a7a5d8a ("net: dsa: propagate switchdev vlan_filtering prepare phase to drivers"). For the most part, the conversion was trivial except for: - Rocker's world implementation based on Broadcom OF-DPA had an odd implementation of ofdpa_port_attr_bridge_flags_set. The conversion was done mechanically, by pasting the implementation twice, then only keeping the code that would get executed during prepare phase on top, then only keeping the code that gets executed during the commit phase on bottom, then simplifying the resulting code until this was obtained. - DSA's offloading of STP state, bridge flags, VLAN filtering and multicast router could be converted right away. But the ageing time could not, so a shim was introduced and this was left for a further commit. 
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Acked-by: Linus Walleij <linus.walleij@linaro.org> Acked-by: Jiri Pirko <jiri@nvidia.com> Reviewed-by: Kurt Kanzenbach <kurt@linutronix.de> # hellcreek Reviewed-by: Linus Walleij <linus.walleij@linaro.org> # RTL8366RB Reviewed-by: Ido Schimmel <idosch@nvidia.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-01-09 02:01:50 +02:00
if ((orig_flags ^ ofdpa_port->brport_flags) & BR_LEARNING)
err = rocker_port_set_learning(ofdpa_port->rocker_port,
!!(ofdpa_port->brport_flags & BR_LEARNING));
return err;
}
/* Report which bridge port flags can be set on a port: only BR_LEARNING is
 * written to @p_brport_flags_support.  @rocker_port is not used.
 * Always returns 0.
 */
static int
ofdpa_port_attr_bridge_flags_support_get(const struct rocker_port *rocker_port,
					 unsigned long *p_brport_flags_support)
{
	*p_brport_flags_support = BR_LEARNING;

	return 0;
}
static int
ofdpa_port_attr_bridge_ageing_time_set(struct rocker_port *rocker_port,
net: switchdev: remove the transaction structure from port attributes Since the introduction of the switchdev API, port attributes were transmitted to drivers for offloading using a two-step transactional model, with a prepare phase that was supposed to catch all errors, and a commit phase that was supposed to never fail. Some classes of failures can never be avoided, like hardware access, or memory allocation. In the latter case, merely attempting to move the memory allocation to the preparation phase makes it impossible to avoid memory leaks, since commit 91cf8eceffc1 ("switchdev: Remove unused transaction item queue") which has removed the unused mechanism of passing on the allocated memory between one phase and another. It is time we admit that separating the preparation from the commit phase is something that is best left for the driver to decide, and not something that should be baked into the API, especially since there are no switchdev callers that depend on this. This patch removes the struct switchdev_trans member from switchdev port attribute notifier structures, and converts drivers to not look at this member. In part, this patch contains a revert of my previous commit 2e554a7a5d8a ("net: dsa: propagate switchdev vlan_filtering prepare phase to drivers"). For the most part, the conversion was trivial except for: - Rocker's world implementation based on Broadcom OF-DPA had an odd implementation of ofdpa_port_attr_bridge_flags_set. The conversion was done mechanically, by pasting the implementation twice, then only keeping the code that would get executed during prepare phase on top, then only keeping the code that gets executed during the commit phase on bottom, then simplifying the resulting code until this was obtained. - DSA's offloading of STP state, bridge flags, VLAN filtering and multicast router could be converted right away. But the ageing time could not, so a shim was introduced and this was left for a further commit. 
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Acked-by: Linus Walleij <linus.walleij@linaro.org> Acked-by: Jiri Pirko <jiri@nvidia.com> Reviewed-by: Kurt Kanzenbach <kurt@linutronix.de> # hellcreek Reviewed-by: Linus Walleij <linus.walleij@linaro.org> # RTL8366RB Reviewed-by: Ido Schimmel <idosch@nvidia.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-01-09 02:01:50 +02:00
u32 ageing_time)
{
struct ofdpa_port *ofdpa_port = rocker_port->wpriv;
struct ofdpa *ofdpa = ofdpa_port->ofdpa;
net: switchdev: remove the transaction structure from port attributes Since the introduction of the switchdev API, port attributes were transmitted to drivers for offloading using a two-step transactional model, with a prepare phase that was supposed to catch all errors, and a commit phase that was supposed to never fail. Some classes of failures can never be avoided, like hardware access, or memory allocation. In the latter case, merely attempting to move the memory allocation to the preparation phase makes it impossible to avoid memory leaks, since commit 91cf8eceffc1 ("switchdev: Remove unused transaction item queue") which has removed the unused mechanism of passing on the allocated memory between one phase and another. It is time we admit that separating the preparation from the commit phase is something that is best left for the driver to decide, and not something that should be baked into the API, especially since there are no switchdev callers that depend on this. This patch removes the struct switchdev_trans member from switchdev port attribute notifier structures, and converts drivers to not look at this member. In part, this patch contains a revert of my previous commit 2e554a7a5d8a ("net: dsa: propagate switchdev vlan_filtering prepare phase to drivers"). For the most part, the conversion was trivial except for: - Rocker's world implementation based on Broadcom OF-DPA had an odd implementation of ofdpa_port_attr_bridge_flags_set. The conversion was done mechanically, by pasting the implementation twice, then only keeping the code that would get executed during prepare phase on top, then only keeping the code that gets executed during the commit phase on bottom, then simplifying the resulting code until this was obtained. - DSA's offloading of STP state, bridge flags, VLAN filtering and multicast router could be converted right away. But the ageing time could not, so a shim was introduced and this was left for a further commit. 
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Acked-by: Linus Walleij <linus.walleij@linaro.org> Acked-by: Jiri Pirko <jiri@nvidia.com> Reviewed-by: Kurt Kanzenbach <kurt@linutronix.de> # hellcreek Reviewed-by: Linus Walleij <linus.walleij@linaro.org> # RTL8366RB Reviewed-by: Ido Schimmel <idosch@nvidia.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-01-09 02:01:50 +02:00
ofdpa_port->ageing_time = clock_t_to_jiffies(ageing_time);
if (ofdpa_port->ageing_time < ofdpa->ageing_time)
ofdpa->ageing_time = ofdpa_port->ageing_time;
mod_timer(&ofdpa_port->ofdpa->fdb_cleanup_timer, jiffies);
return 0;
}
static int ofdpa_port_obj_vlan_add(struct rocker_port *rocker_port,
const struct switchdev_obj_port_vlan *vlan)
{
struct ofdpa_port *ofdpa_port = rocker_port->wpriv;
net: switchdev: remove vid_begin -> vid_end range from VLAN objects The call path of a switchdev VLAN addition to the bridge looks something like this today: nbp_vlan_init | __br_vlan_set_default_pvid | | | | | br_afspec | | | | | | | v | | | br_process_vlan_info | | | | | | | v | | | br_vlan_info | | | / \ / | | / \ / | | / \ / | | / \ / v v v v v nbp_vlan_add br_vlan_add ------+ | ^ ^ | | | / | | | | / / / | \ br_vlan_get_master/ / v \ ^ / / br_vlan_add_existing \ | / / | \ | / / / \ | / / / \ | / / / \ | / / / v | | v / __vlan_add / / | / / | / v | / __vlan_vid_add | / \ | / v v v br_switchdev_port_vlan_add The ranges UAPI was introduced to the bridge in commit bdced7ef7838 ("bridge: support for multiple vlans and vlan ranges in setlink and dellink requests") (Jan 10 2015). But the VLAN ranges (parsed in br_afspec) have always been passed one by one, through struct bridge_vlan_info tmp_vinfo, to br_vlan_info. So the range never went too far in depth. Then Scott Feldman introduced the switchdev_port_bridge_setlink function in commit 47f8328bb1a4 ("switchdev: add new switchdev bridge setlink"). That marked the introduction of the SWITCHDEV_OBJ_PORT_VLAN, which made full use of the range. But switchdev_port_bridge_setlink was called like this: br_setlink -> br_afspec -> switchdev_port_bridge_setlink Basically, the switchdev and the bridge code were not tightly integrated. Then commit 41c498b9359e ("bridge: restore br_setlink back to original") came, and switchdev drivers were required to implement .ndo_bridge_setlink = switchdev_port_bridge_setlink for a while. In the meantime, commits such as 0944d6b5a2fa ("bridge: try switchdev op first in __vlan_vid_add/del") finally made switchdev penetrate the br_vlan_info() barrier and start to develop the call path we have today. But remember, br_vlan_info() still receives VLANs one by one. 
Then Arkadi Sharshevsky refactored the switchdev API in 2017 in commit 29ab586c3d83 ("net: switchdev: Remove bridge bypass support from switchdev") so that drivers would not implement .ndo_bridge_setlink any longer. The switchdev_port_bridge_setlink also got deleted. This refactoring removed the parallel bridge_setlink implementation from switchdev, and left the only switchdev VLAN objects to be the ones offloaded from __vlan_vid_add (basically RX filtering) and __vlan_add (the latter coming from commit 9c86ce2c1ae3 ("net: bridge: Notify about bridge VLANs")). That is to say, today the switchdev VLAN object ranges are not used in the kernel. Refactoring the above call path is a bit complicated, when the bridge VLAN call path is already a bit complicated. Let's go off and finish the job of commit 29ab586c3d83 by deleting the bogus iteration through the VLAN ranges from the drivers. Some aspects of this feature never made too much sense in the first place. For example, what is a range of VLANs all having the BRIDGE_VLAN_INFO_PVID flag supposed to mean, when a port can obviously have a single pvid? This particular configuration _is_ denied as of commit 6623c60dc28e ("bridge: vlan: enforce no pvid flag in vlan ranges"), but from an API perspective, the driver still has to play pretend, and only offload the vlan->vid_end as pvid. And the addition of a switchdev VLAN object can modify the flags of another, completely unrelated, switchdev VLAN object! (a VLAN that is PVID will invalidate the PVID flag from whatever other VLAN had previously been offloaded with switchdev and had that flag. Yet switchdev never notifies about that change, drivers are supposed to guess). Nonetheless, having a VLAN range in the API makes error handling look scarier than it really is - unwinding on errors and all of that. When in reality, no one really calls this API with more than one VLAN. It is all unnecessary complexity. 
And despite appearing pretentious (two-phase transactional model and all), the switchdev API is really sloppy because the VLAN addition and removal operations are not paired with one another (you can add a VLAN 100 times and delete it just once). The bridge notifies through switchdev of a VLAN addition not only when the flags of an existing VLAN change, but also when nothing changes. There are switchdev drivers out there who don't like adding a VLAN that has already been added, and those checks don't really belong at driver level. But the fact that the API contains ranges is yet another factor that prevents this from being addressed in the future. Of the existing switchdev pieces of hardware, it appears that only Mellanox Spectrum supports offloading more than one VLAN at a time, through mlxsw_sp_port_vlan_set. I have kept that code internal to the driver, because there is some more bookkeeping that makes use of it, but I deleted it from the switchdev API. But since the switchdev support for ranges has already been de facto deleted by a Mellanox employee and nobody noticed for 4 years, I'm going to assume it's not a biggie. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Ido Schimmel <idosch@nvidia.com> # switchdev and mlxsw Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Reviewed-by: Kurt Kanzenbach <kurt@linutronix.de> # hellcreek Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-01-09 02:01:46 +02:00
return ofdpa_port_vlan_add(ofdpa_port, vlan->vid, vlan->flags);
}
static int ofdpa_port_obj_vlan_del(struct rocker_port *rocker_port,
const struct switchdev_obj_port_vlan *vlan)
{
struct ofdpa_port *ofdpa_port = rocker_port->wpriv;
net: switchdev: remove vid_begin -> vid_end range from VLAN objects The call path of a switchdev VLAN addition to the bridge looks something like this today: nbp_vlan_init | __br_vlan_set_default_pvid | | | | | br_afspec | | | | | | | v | | | br_process_vlan_info | | | | | | | v | | | br_vlan_info | | | / \ / | | / \ / | | / \ / | | / \ / v v v v v nbp_vlan_add br_vlan_add ------+ | ^ ^ | | | / | | | | / / / | \ br_vlan_get_master/ / v \ ^ / / br_vlan_add_existing \ | / / | \ | / / / \ | / / / \ | / / / \ | / / / v | | v / __vlan_add / / | / / | / v | / __vlan_vid_add | / \ | / v v v br_switchdev_port_vlan_add The ranges UAPI was introduced to the bridge in commit bdced7ef7838 ("bridge: support for multiple vlans and vlan ranges in setlink and dellink requests") (Jan 10 2015). But the VLAN ranges (parsed in br_afspec) have always been passed one by one, through struct bridge_vlan_info tmp_vinfo, to br_vlan_info. So the range never went too far in depth. Then Scott Feldman introduced the switchdev_port_bridge_setlink function in commit 47f8328bb1a4 ("switchdev: add new switchdev bridge setlink"). That marked the introduction of the SWITCHDEV_OBJ_PORT_VLAN, which made full use of the range. But switchdev_port_bridge_setlink was called like this: br_setlink -> br_afspec -> switchdev_port_bridge_setlink Basically, the switchdev and the bridge code were not tightly integrated. Then commit 41c498b9359e ("bridge: restore br_setlink back to original") came, and switchdev drivers were required to implement .ndo_bridge_setlink = switchdev_port_bridge_setlink for a while. In the meantime, commits such as 0944d6b5a2fa ("bridge: try switchdev op first in __vlan_vid_add/del") finally made switchdev penetrate the br_vlan_info() barrier and start to develop the call path we have today. But remember, br_vlan_info() still receives VLANs one by one. 
Then Arkadi Sharshevsky refactored the switchdev API in 2017 in commit 29ab586c3d83 ("net: switchdev: Remove bridge bypass support from switchdev") so that drivers would not implement .ndo_bridge_setlink any longer. The switchdev_port_bridge_setlink also got deleted. This refactoring removed the parallel bridge_setlink implementation from switchdev, and left the only switchdev VLAN objects to be the ones offloaded from __vlan_vid_add (basically RX filtering) and __vlan_add (the latter coming from commit 9c86ce2c1ae3 ("net: bridge: Notify about bridge VLANs")). That is to say, today the switchdev VLAN object ranges are not used in the kernel. Refactoring the above call path is a bit complicated, when the bridge VLAN call path is already a bit complicated. Let's go off and finish the job of commit 29ab586c3d83 by deleting the bogus iteration through the VLAN ranges from the drivers. Some aspects of this feature never made too much sense in the first place. For example, what is a range of VLANs all having the BRIDGE_VLAN_INFO_PVID flag supposed to mean, when a port can obviously have a single pvid? This particular configuration _is_ denied as of commit 6623c60dc28e ("bridge: vlan: enforce no pvid flag in vlan ranges"), but from an API perspective, the driver still has to play pretend, and only offload the vlan->vid_end as pvid. And the addition of a switchdev VLAN object can modify the flags of another, completely unrelated, switchdev VLAN object! (a VLAN that is PVID will invalidate the PVID flag from whatever other VLAN had previously been offloaded with switchdev and had that flag. Yet switchdev never notifies about that change, drivers are supposed to guess). Nonetheless, having a VLAN range in the API makes error handling look scarier than it really is - unwinding on errors and all of that. When in reality, no one really calls this API with more than one VLAN. It is all unnecessary complexity. 
And despite appearing pretentious (two-phase transactional model and all), the switchdev API is really sloppy because the VLAN addition and removal operations are not paired with one another (you can add a VLAN 100 times and delete it just once). The bridge notifies through switchdev of a VLAN addition not only when the flags of an existing VLAN change, but also when nothing changes. There are switchdev drivers out there who don't like adding a VLAN that has already been added, and those checks don't really belong at driver level. But the fact that the API contains ranges is yet another factor that prevents this from being addressed in the future. Of the existing switchdev pieces of hardware, it appears that only Mellanox Spectrum supports offloading more than one VLAN at a time, through mlxsw_sp_port_vlan_set. I have kept that code internal to the driver, because there is some more bookkeeping that makes use of it, but I deleted it from the switchdev API. But since the switchdev support for ranges has already been de facto deleted by a Mellanox employee and nobody noticed for 4 years, I'm going to assume it's not a biggie. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Ido Schimmel <idosch@nvidia.com> # switchdev and mlxsw Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Reviewed-by: Kurt Kanzenbach <kurt@linutronix.de> # hellcreek Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-01-09 02:01:46 +02:00
return ofdpa_port_vlan_del(ofdpa_port, vlan->vid, vlan->flags);
}
/* Add an FDB entry for @addr on @vid to the port.
 *
 * The vid is converted with ofdpa_port_vid_to_vlan() before the entry is
 * handed to ofdpa_port_fdb() with no operation flags.
 *
 * Returns -EINVAL when the port is not bridged, otherwise the result of
 * ofdpa_port_fdb().
 */
static int ofdpa_port_obj_fdb_add(struct rocker_port *rocker_port,
				  u16 vid, const unsigned char *addr)
{
	struct ofdpa_port *port = rocker_port->wpriv;
	__be16 vlan_id;

	vlan_id = ofdpa_port_vid_to_vlan(port, vid, NULL);
	if (ofdpa_port_is_bridged(port))
		return ofdpa_port_fdb(port, addr, vlan_id, 0);

	return -EINVAL;
}
/* Remove the FDB entry for @addr on @vid from the port.
 *
 * The vid is converted with ofdpa_port_vid_to_vlan() before the entry is
 * handed to ofdpa_port_fdb() with OFDPA_OP_FLAG_REMOVE.
 *
 * Returns -EINVAL when the port is not bridged, otherwise the result of
 * ofdpa_port_fdb().
 */
static int ofdpa_port_obj_fdb_del(struct rocker_port *rocker_port,
				  u16 vid, const unsigned char *addr)
{
	struct ofdpa_port *port = rocker_port->wpriv;
	__be16 vlan_id;

	vlan_id = ofdpa_port_vid_to_vlan(port, vid, NULL);
	if (ofdpa_port_is_bridged(port))
		return ofdpa_port_fdb(port, addr, vlan_id,
				      OFDPA_OP_FLAG_REMOVE);

	return -EINVAL;
}
static int ofdpa_port_bridge_join(struct ofdpa_port *ofdpa_port,
net: bridge: switchdev: let drivers inform which bridge ports are offloaded On reception of an skb, the bridge checks if it was marked as 'already forwarded in hardware' (checks if skb->offload_fwd_mark == 1), and if it is, it assigns the source hardware domain of that skb based on the hardware domain of the ingress port. Then during forwarding, it enforces that the egress port must have a different hardware domain than the ingress one (this is done in nbp_switchdev_allowed_egress). Non-switchdev drivers don't report any physical switch id (neither through devlink nor .ndo_get_port_parent_id), therefore the bridge assigns them a hardware domain of 0, and packets coming from them will always have skb->offload_fwd_mark = 0. So there aren't any restrictions. Problems appear due to the fact that DSA would like to perform software fallback for bonding and team interfaces that the physical switch cannot offload. +-- br0 ---+ / / | \ / / | \ / | | bond0 / | | / \ swp0 swp1 swp2 swp3 swp4 There, it is desirable that the presence of swp3 and swp4 under a non-offloaded LAG does not preclude us from doing hardware bridging beteen swp0, swp1 and swp2. The bandwidth of the CPU is often times high enough that software bridging between {swp0,swp1,swp2} and bond0 is not impractical. But this creates an impossible paradox given the current way in which port hardware domains are assigned. When the driver receives a packet from swp0 (say, due to flooding), it must set skb->offload_fwd_mark to something. - If we set it to 0, then the bridge will forward it towards swp1, swp2 and bond0. But the switch has already forwarded it towards swp1 and swp2 (not to bond0, remember, that isn't offloaded, so as far as the switch is concerned, ports swp3 and swp4 are not looking up the FDB, and the entire bond0 is a destination that is strictly behind the CPU). But we don't want duplicated traffic towards swp1 and swp2, so it's not ok to set skb->offload_fwd_mark = 0. 
- If we set it to 1, then the bridge will not forward the skb towards the ports with the same switchdev mark, i.e. not to swp1, swp2 and bond0. Towards swp1 and swp2 that's ok, but towards bond0? It should have forwarded the skb there. So the real issue is that bond0 will be assigned the same hardware domain as {swp0,swp1,swp2}, because the function that assigns hardware domains to bridge ports, nbp_switchdev_add(), recurses through bond0's lower interfaces until it finds something that implements devlink (calls dev_get_port_parent_id with bool recurse = true). This is a problem because the fact that bond0 can be offloaded by swp3 and swp4 in our example is merely an assumption. A solution is to give the bridge explicit hints as to what hardware domain it should use for each port. Currently, the bridging offload is very 'silent': a driver registers a netdevice notifier, which is put on the netns's notifier chain, and which sniffs around for NETDEV_CHANGEUPPER events where the upper is a bridge, and the lower is an interface it knows about (one registered by this driver, normally). Then, from within that notifier, it does a bunch of stuff behind the bridge's back, without the bridge necessarily knowing that there's somebody offloading that port. It looks like this: ip link set swp0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v call_netdevice_notifiers | v dsa_slave_netdevice_event | v oh, hey! it's for me! | v .port_bridge_join What we do to solve the conundrum is to be less silent, and change the switchdev drivers to present themselves to the bridge. Something like this: ip link set swp0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge: Aye! I'll use this call_netdevice_notifiers ^ ppid as the | | hardware domain for v | this port, and zero dsa_slave_netdevice_event | if I got nothing. | | v | oh, hey! it's for me! 
| | | v | .port_bridge_join | | | +------------------------+ switchdev_bridge_port_offload(swp0, swp0) Then stacked interfaces (like bond0 on top of swp3/swp4) would be treated differently in DSA, depending on whether we can or cannot offload them. The offload case: ip link set bond0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge: Aye! I'll use this call_netdevice_notifiers ^ ppid as the | | switchdev mark for v | bond0. dsa_slave_netdevice_event | Coincidentally (or not), | | bond0 and swp0, swp1, swp2 v | all have the same switchdev hmm, it's not quite for me, | mark now, since the ASIC but my driver has already | is able to forward towards called .port_lag_join | all these ports in hw. for it, because I have | a port with dp->lag_dev == bond0. | | | v | .port_bridge_join | for swp3 and swp4 | | | +------------------------+ switchdev_bridge_port_offload(bond0, swp3) switchdev_bridge_port_offload(bond0, swp4) And the non-offload case: ip link set bond0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge waiting: call_netdevice_notifiers ^ huh, switchdev_bridge_port_offload | | wasn't called, okay, I'll use a v | hwdom of zero for this one. dsa_slave_netdevice_event : Then packets received on swp0 will | : not be software-forwarded towards v : swp1, but they will towards bond0. it's not for me, but bond0 is an upper of swp3 and swp4, but their dp->lag_dev is NULL because they couldn't offload it. Basically we can draw the conclusion that the lowers of a bridge port can come and go, so depending on the configuration of lowers for a bridge port, it can dynamically toggle between offloaded and unoffloaded. Therefore, we need an equivalent switchdev_bridge_port_unoffload too. This patch changes the way any switchdev driver interacts with the bridge. 
From now on, everybody needs to call switchdev_bridge_port_offload and switchdev_bridge_port_unoffload, otherwise the bridge will treat the port as non-offloaded and allow software flooding to other ports from the same ASIC. Note that these functions lay the ground for a more complex handshake between switchdev drivers and the bridge in the future. For drivers that will request a replay of the switchdev objects when they offload and unoffload a bridge port (DSA, dpaa2-switch, ocelot), we place the call to switchdev_bridge_port_unoffload() strategically inside the NETDEV_PRECHANGEUPPER notifier's code path, and not inside NETDEV_CHANGEUPPER. This is because the switchdev object replay helpers need the netdev adjacency lists to be valid, and that is only true in NETDEV_PRECHANGEUPPER. Cc: Vadym Kochan <vkochan@marvell.com> Cc: Taras Chornyi <tchornyi@marvell.com> Cc: Ioana Ciornei <ioana.ciornei@nxp.com> Cc: Lars Povlsen <lars.povlsen@microchip.com> Cc: Steen Hegelund <Steen.Hegelund@microchip.com> Cc: UNGLinuxDriver@microchip.com Cc: Claudiu Manoil <claudiu.manoil@nxp.com> Cc: Alexandre Belloni <alexandre.belloni@bootlin.com> Cc: Grygorii Strashko <grygorii.strashko@ti.com> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Tested-by: Ioana Ciornei <ioana.ciornei@nxp.com> # dpaa2-switch: regression Acked-by: Ioana Ciornei <ioana.ciornei@nxp.com> # dpaa2-switch Tested-by: Horatiu Vultur <horatiu.vultur@microchip.com> # ocelot-switch Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-21 19:24:01 +03:00
struct net_device *bridge,
struct netlink_ext_ack *extack)
{
net: bridge: switchdev: let drivers inform which bridge ports are offloaded On reception of an skb, the bridge checks if it was marked as 'already forwarded in hardware' (checks if skb->offload_fwd_mark == 1), and if it is, it assigns the source hardware domain of that skb based on the hardware domain of the ingress port. Then during forwarding, it enforces that the egress port must have a different hardware domain than the ingress one (this is done in nbp_switchdev_allowed_egress). Non-switchdev drivers don't report any physical switch id (neither through devlink nor .ndo_get_port_parent_id), therefore the bridge assigns them a hardware domain of 0, and packets coming from them will always have skb->offload_fwd_mark = 0. So there aren't any restrictions. Problems appear due to the fact that DSA would like to perform software fallback for bonding and team interfaces that the physical switch cannot offload. +-- br0 ---+ / / | \ / / | \ / | | bond0 / | | / \ swp0 swp1 swp2 swp3 swp4 There, it is desirable that the presence of swp3 and swp4 under a non-offloaded LAG does not preclude us from doing hardware bridging beteen swp0, swp1 and swp2. The bandwidth of the CPU is often times high enough that software bridging between {swp0,swp1,swp2} and bond0 is not impractical. But this creates an impossible paradox given the current way in which port hardware domains are assigned. When the driver receives a packet from swp0 (say, due to flooding), it must set skb->offload_fwd_mark to something. - If we set it to 0, then the bridge will forward it towards swp1, swp2 and bond0. But the switch has already forwarded it towards swp1 and swp2 (not to bond0, remember, that isn't offloaded, so as far as the switch is concerned, ports swp3 and swp4 are not looking up the FDB, and the entire bond0 is a destination that is strictly behind the CPU). But we don't want duplicated traffic towards swp1 and swp2, so it's not ok to set skb->offload_fwd_mark = 0. 
- If we set it to 1, then the bridge will not forward the skb towards the ports with the same switchdev mark, i.e. not to swp1, swp2 and bond0. Towards swp1 and swp2 that's ok, but towards bond0? It should have forwarded the skb there. So the real issue is that bond0 will be assigned the same hardware domain as {swp0,swp1,swp2}, because the function that assigns hardware domains to bridge ports, nbp_switchdev_add(), recurses through bond0's lower interfaces until it finds something that implements devlink (calls dev_get_port_parent_id with bool recurse = true). This is a problem because the fact that bond0 can be offloaded by swp3 and swp4 in our example is merely an assumption. A solution is to give the bridge explicit hints as to what hardware domain it should use for each port. Currently, the bridging offload is very 'silent': a driver registers a netdevice notifier, which is put on the netns's notifier chain, and which sniffs around for NETDEV_CHANGEUPPER events where the upper is a bridge, and the lower is an interface it knows about (one registered by this driver, normally). Then, from within that notifier, it does a bunch of stuff behind the bridge's back, without the bridge necessarily knowing that there's somebody offloading that port. It looks like this: ip link set swp0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v call_netdevice_notifiers | v dsa_slave_netdevice_event | v oh, hey! it's for me! | v .port_bridge_join What we do to solve the conundrum is to be less silent, and change the switchdev drivers to present themselves to the bridge. Something like this: ip link set swp0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge: Aye! I'll use this call_netdevice_notifiers ^ ppid as the | | hardware domain for v | this port, and zero dsa_slave_netdevice_event | if I got nothing. | | v | oh, hey! it's for me! 
| | | v | .port_bridge_join | | | +------------------------+ switchdev_bridge_port_offload(swp0, swp0) Then stacked interfaces (like bond0 on top of swp3/swp4) would be treated differently in DSA, depending on whether we can or cannot offload them. The offload case: ip link set bond0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge: Aye! I'll use this call_netdevice_notifiers ^ ppid as the | | switchdev mark for v | bond0. dsa_slave_netdevice_event | Coincidentally (or not), | | bond0 and swp0, swp1, swp2 v | all have the same switchdev hmm, it's not quite for me, | mark now, since the ASIC but my driver has already | is able to forward towards called .port_lag_join | all these ports in hw. for it, because I have | a port with dp->lag_dev == bond0. | | | v | .port_bridge_join | for swp3 and swp4 | | | +------------------------+ switchdev_bridge_port_offload(bond0, swp3) switchdev_bridge_port_offload(bond0, swp4) And the non-offload case: ip link set bond0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge waiting: call_netdevice_notifiers ^ huh, switchdev_bridge_port_offload | | wasn't called, okay, I'll use a v | hwdom of zero for this one. dsa_slave_netdevice_event : Then packets received on swp0 will | : not be software-forwarded towards v : swp1, but they will towards bond0. it's not for me, but bond0 is an upper of swp3 and swp4, but their dp->lag_dev is NULL because they couldn't offload it. Basically we can draw the conclusion that the lowers of a bridge port can come and go, so depending on the configuration of lowers for a bridge port, it can dynamically toggle between offloaded and unoffloaded. Therefore, we need an equivalent switchdev_bridge_port_unoffload too. This patch changes the way any switchdev driver interacts with the bridge. 
From now on, everybody needs to call switchdev_bridge_port_offload and switchdev_bridge_port_unoffload, otherwise the bridge will treat the port as non-offloaded and allow software flooding to other ports from the same ASIC. Note that these functions lay the ground for a more complex handshake between switchdev drivers and the bridge in the future. For drivers that will request a replay of the switchdev objects when they offload and unoffload a bridge port (DSA, dpaa2-switch, ocelot), we place the call to switchdev_bridge_port_unoffload() strategically inside the NETDEV_PRECHANGEUPPER notifier's code path, and not inside NETDEV_CHANGEUPPER. This is because the switchdev object replay helpers need the netdev adjacency lists to be valid, and that is only true in NETDEV_PRECHANGEUPPER. Cc: Vadym Kochan <vkochan@marvell.com> Cc: Taras Chornyi <tchornyi@marvell.com> Cc: Ioana Ciornei <ioana.ciornei@nxp.com> Cc: Lars Povlsen <lars.povlsen@microchip.com> Cc: Steen Hegelund <Steen.Hegelund@microchip.com> Cc: UNGLinuxDriver@microchip.com Cc: Claudiu Manoil <claudiu.manoil@nxp.com> Cc: Alexandre Belloni <alexandre.belloni@bootlin.com> Cc: Grygorii Strashko <grygorii.strashko@ti.com> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Tested-by: Ioana Ciornei <ioana.ciornei@nxp.com> # dpaa2-switch: regression Acked-by: Ioana Ciornei <ioana.ciornei@nxp.com> # dpaa2-switch Tested-by: Horatiu Vultur <horatiu.vultur@microchip.com> # ocelot-switch Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-21 19:24:01 +03:00
struct net_device *dev = ofdpa_port->dev;
int err;
/* Port is joining bridge, so the internal VLAN for the
* port is going to change to the bridge internal VLAN.
* Let's remove untagged VLAN (vid=0) from port and
* re-add once internal VLAN has changed.
*/
err = ofdpa_port_vlan_del(ofdpa_port, OFDPA_UNTAGGED_VID, 0);
if (err)
return err;
ofdpa_port_internal_vlan_id_put(ofdpa_port,
ofdpa_port->dev->ifindex);
ofdpa_port->internal_vlan_id =
ofdpa_port_internal_vlan_id_get(ofdpa_port, bridge->ifindex);
ofdpa_port->bridge_dev = bridge;
net: bridge: switchdev: let drivers inform which bridge ports are offloaded On reception of an skb, the bridge checks if it was marked as 'already forwarded in hardware' (checks if skb->offload_fwd_mark == 1), and if it is, it assigns the source hardware domain of that skb based on the hardware domain of the ingress port. Then during forwarding, it enforces that the egress port must have a different hardware domain than the ingress one (this is done in nbp_switchdev_allowed_egress). Non-switchdev drivers don't report any physical switch id (neither through devlink nor .ndo_get_port_parent_id), therefore the bridge assigns them a hardware domain of 0, and packets coming from them will always have skb->offload_fwd_mark = 0. So there aren't any restrictions. Problems appear due to the fact that DSA would like to perform software fallback for bonding and team interfaces that the physical switch cannot offload. +-- br0 ---+ / / | \ / / | \ / | | bond0 / | | / \ swp0 swp1 swp2 swp3 swp4 There, it is desirable that the presence of swp3 and swp4 under a non-offloaded LAG does not preclude us from doing hardware bridging between swp0, swp1 and swp2. The bandwidth of the CPU is often times high enough that software bridging between {swp0,swp1,swp2} and bond0 is not impractical. But this creates an impossible paradox given the current way in which port hardware domains are assigned. When the driver receives a packet from swp0 (say, due to flooding), it must set skb->offload_fwd_mark to something. - If we set it to 0, then the bridge will forward it towards swp1, swp2 and bond0. But the switch has already forwarded it towards swp1 and swp2 (not to bond0, remember, that isn't offloaded, so as far as the switch is concerned, ports swp3 and swp4 are not looking up the FDB, and the entire bond0 is a destination that is strictly behind the CPU). But we don't want duplicated traffic towards swp1 and swp2, so it's not ok to set skb->offload_fwd_mark = 0.
- If we set it to 1, then the bridge will not forward the skb towards the ports with the same switchdev mark, i.e. not to swp1, swp2 and bond0. Towards swp1 and swp2 that's ok, but towards bond0? It should have forwarded the skb there. So the real issue is that bond0 will be assigned the same hardware domain as {swp0,swp1,swp2}, because the function that assigns hardware domains to bridge ports, nbp_switchdev_add(), recurses through bond0's lower interfaces until it finds something that implements devlink (calls dev_get_port_parent_id with bool recurse = true). This is a problem because the fact that bond0 can be offloaded by swp3 and swp4 in our example is merely an assumption. A solution is to give the bridge explicit hints as to what hardware domain it should use for each port. Currently, the bridging offload is very 'silent': a driver registers a netdevice notifier, which is put on the netns's notifier chain, and which sniffs around for NETDEV_CHANGEUPPER events where the upper is a bridge, and the lower is an interface it knows about (one registered by this driver, normally). Then, from within that notifier, it does a bunch of stuff behind the bridge's back, without the bridge necessarily knowing that there's somebody offloading that port. It looks like this: ip link set swp0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v call_netdevice_notifiers | v dsa_slave_netdevice_event | v oh, hey! it's for me! | v .port_bridge_join What we do to solve the conundrum is to be less silent, and change the switchdev drivers to present themselves to the bridge. Something like this: ip link set swp0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge: Aye! I'll use this call_netdevice_notifiers ^ ppid as the | | hardware domain for v | this port, and zero dsa_slave_netdevice_event | if I got nothing. | | v | oh, hey! it's for me! 
| | | v | .port_bridge_join | | | +------------------------+ switchdev_bridge_port_offload(swp0, swp0) Then stacked interfaces (like bond0 on top of swp3/swp4) would be treated differently in DSA, depending on whether we can or cannot offload them. The offload case: ip link set bond0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge: Aye! I'll use this call_netdevice_notifiers ^ ppid as the | | switchdev mark for v | bond0. dsa_slave_netdevice_event | Coincidentally (or not), | | bond0 and swp0, swp1, swp2 v | all have the same switchdev hmm, it's not quite for me, | mark now, since the ASIC but my driver has already | is able to forward towards called .port_lag_join | all these ports in hw. for it, because I have | a port with dp->lag_dev == bond0. | | | v | .port_bridge_join | for swp3 and swp4 | | | +------------------------+ switchdev_bridge_port_offload(bond0, swp3) switchdev_bridge_port_offload(bond0, swp4) And the non-offload case: ip link set bond0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge waiting: call_netdevice_notifiers ^ huh, switchdev_bridge_port_offload | | wasn't called, okay, I'll use a v | hwdom of zero for this one. dsa_slave_netdevice_event : Then packets received on swp0 will | : not be software-forwarded towards v : swp1, but they will towards bond0. it's not for me, but bond0 is an upper of swp3 and swp4, but their dp->lag_dev is NULL because they couldn't offload it. Basically we can draw the conclusion that the lowers of a bridge port can come and go, so depending on the configuration of lowers for a bridge port, it can dynamically toggle between offloaded and unoffloaded. Therefore, we need an equivalent switchdev_bridge_port_unoffload too. This patch changes the way any switchdev driver interacts with the bridge. 
From now on, everybody needs to call switchdev_bridge_port_offload and switchdev_bridge_port_unoffload, otherwise the bridge will treat the port as non-offloaded and allow software flooding to other ports from the same ASIC. Note that these functions lay the ground for a more complex handshake between switchdev drivers and the bridge in the future. For drivers that will request a replay of the switchdev objects when they offload and unoffload a bridge port (DSA, dpaa2-switch, ocelot), we place the call to switchdev_bridge_port_unoffload() strategically inside the NETDEV_PRECHANGEUPPER notifier's code path, and not inside NETDEV_CHANGEUPPER. This is because the switchdev object replay helpers need the netdev adjacency lists to be valid, and that is only true in NETDEV_PRECHANGEUPPER. Cc: Vadym Kochan <vkochan@marvell.com> Cc: Taras Chornyi <tchornyi@marvell.com> Cc: Ioana Ciornei <ioana.ciornei@nxp.com> Cc: Lars Povlsen <lars.povlsen@microchip.com> Cc: Steen Hegelund <Steen.Hegelund@microchip.com> Cc: UNGLinuxDriver@microchip.com Cc: Claudiu Manoil <claudiu.manoil@nxp.com> Cc: Alexandre Belloni <alexandre.belloni@bootlin.com> Cc: Grygorii Strashko <grygorii.strashko@ti.com> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Tested-by: Ioana Ciornei <ioana.ciornei@nxp.com> # dpaa2-switch: regression Acked-by: Ioana Ciornei <ioana.ciornei@nxp.com> # dpaa2-switch Tested-by: Horatiu Vultur <horatiu.vultur@microchip.com> # ocelot-switch Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-21 19:24:01 +03:00
err = ofdpa_port_vlan_add(ofdpa_port, OFDPA_UNTAGGED_VID, 0);
if (err)
return err;
net: bridge: move the switchdev object replay helpers to "push" mode Starting with commit 4f2673b3a2b6 ("net: bridge: add helper to replay port and host-joined mdb entries"), DSA has introduced some bridge helpers that replay switchdev events (FDB/MDB/VLAN additions and deletions) that can be lost by the switchdev drivers in a variety of circumstances: - an IP multicast group was host-joined on the bridge itself before any switchdev port joined the bridge, leading to the host MDB entries missing in the hardware database. - during the bridge creation process, the MAC address of the bridge was added to the FDB as an entry pointing towards the bridge device itself, but with no switchdev ports being part of the bridge yet, this local FDB entry would remain unknown to the switchdev hardware database. - a VLAN/FDB/MDB was added to a bridge port that is a LAG interface, before any switchdev port joined that LAG, leading to the hardware database missing those entries. - a switchdev port left a LAG that is a bridge port, while the LAG remained part of the bridge, and all FDB/MDB/VLAN entries remained installed in the hardware database of the switchdev port. Also, since commit 0d2cfbd41c4a ("net: bridge: ignore switchdev events for LAG ports which didn't request replay"), DSA introduced a method, based on a const void *ctx, to ensure that two switchdev ports under the same LAG that is a bridge port do not see the same MDB/VLAN entry being replayed twice by the bridge, once for every bridge port that joins the LAG. With so many ordering corner cases being possible, it seems unreasonable to expect a switchdev driver writer to get it right from the first try. Therefore, now that DSA has experimented with the bridge replay helpers for a little bit, we can move the code to the bridge driver where it is more readily available to all switchdev drivers. 
To convert the switchdev object replay helpers from "pull mode" (where the driver asks for them) to a "push mode" (where the bridge offers them automatically), the biggest problem is that the bridge needs to be aware when a switchdev port joins and leaves, even when the switchdev is only indirectly a bridge port (for example when the bridge port is a LAG upper of the switchdev). Luckily, we already have a hook for that, in the form of the newly introduced switchdev_bridge_port_offload() and switchdev_bridge_port_unoffload() calls. These offer a natural place for hooking the object addition and deletion replays. Extend the above 2 functions with: - pointers to the switchdev atomic notifier (for FDB replays) and the blocking notifier (for MDB and VLAN replays). - the "const void *ctx" argument required for drivers to be able to disambiguate between which port is targeted, when multiple ports are lowers of the same LAG that is a bridge port. Most of the drivers pass NULL to this argument, except the ones that support LAG offload and have the proper context check already in place in the switchdev blocking notifier handler. Also unexport the replay helpers, since nobody except the bridge calls them directly now. Note that: (a) we abuse the terminology slightly, because FDB entries are not "switchdev objects", but we count them as objects nonetheless. With no direct way to prove it, I think they are not modeled as switchdev objects because those can only be installed by the bridge to the hardware (as opposed to FDB entries which can be propagated in the other direction too). This is merely an abuse of terms, FDB entries are replayed too, despite not being objects. (b) the bridge does not attempt to sync port attributes to newly joined ports, just the countable stuff (the objects). The reason for this is simple: no universal and symmetric way to sync and unsync them is known. For example, VLAN filtering: what to do on unsync, disable or leave it enabled? 
Similarly, STP state, ageing timer, etc etc. What a switchdev port does when it becomes standalone again is not really up to the bridge's competence, and the driver should deal with it. On the other hand, replaying deletions of switchdev objects can be seen as a matter of cleanup and therefore be treated by the bridge, hence this patch. We make the replay helpers opt-in for drivers, because they might not bring immediate benefits for them: - nbp_vlan_init() is called _after_ netdev_master_upper_dev_link(), so br_vlan_replay() should not do anything for the new drivers on which we call it. The existing drivers where there was even a slight possibility for there to exist a VLAN on a bridge port before they join it are already guarded against this: mlxsw and prestera deny joining LAG interfaces that are members of a bridge. - br_fdb_replay() should now notify of local FDB entries, but I patched all drivers except DSA to ignore these new entries in commit 2c4eca3ef716 ("net: bridge: switchdev: include local flag in FDB notifications"). Driver authors can lift this restriction as they wish, and when they do, they can also opt into the FDB replay functionality. - br_mdb_replay() should fix a real issue which is described in commit 4f2673b3a2b6 ("net: bridge: add helper to replay port and host-joined mdb entries"). However most drivers do not offload the SWITCHDEV_OBJ_ID_HOST_MDB to see this issue: only cpsw and am65_cpsw offload this switchdev object, and I don't completely understand the way in which they offload this switchdev object anyway. So I'll leave it up to these drivers' respective maintainers to opt into br_mdb_replay().
So most of the drivers pass NULL notifier blocks for the replay helpers, except: - dpaa2-switch which was already acked/regression-tested with the helpers enabled (and there isn't much of a downside in having them) - ocelot which already had replay logic in "pull" mode - DSA which already had replay logic in "pull" mode An important observation is that the drivers which don't currently request bridge event replays don't even have the switchdev_bridge_port_{offload,unoffload} calls placed in proper places right now. This was done to avoid unnecessary rework for drivers which might never even add support for this. For driver writers who wish to add replay support, this can be used as a tentative placement guide: https://patchwork.kernel.org/project/netdevbpf/patch/20210720134655.892334-11-vladimir.oltean@nxp.com/ Cc: Vadym Kochan <vkochan@marvell.com> Cc: Taras Chornyi <tchornyi@marvell.com> Cc: Ioana Ciornei <ioana.ciornei@nxp.com> Cc: Lars Povlsen <lars.povlsen@microchip.com> Cc: Steen Hegelund <Steen.Hegelund@microchip.com> Cc: UNGLinuxDriver@microchip.com Cc: Claudiu Manoil <claudiu.manoil@nxp.com> Cc: Alexandre Belloni <alexandre.belloni@bootlin.com> Cc: Grygorii Strashko <grygorii.strashko@ti.com> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Acked-by: Ioana Ciornei <ioana.ciornei@nxp.com> # dpaa2-switch Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-21 19:24:03 +03:00
return switchdev_bridge_port_offload(dev, dev, NULL, NULL, NULL,
net: bridge: switchdev: allow the TX data plane forwarding to be offloaded Allow switchdevs to forward frames from the CPU in accordance with the bridge configuration in the same way as is done between bridge ports. This means that the bridge will only send a single skb towards one of the ports under the switchdev's control, and expects the driver to deliver the packet to all eligible ports in its domain. Primarily this improves the performance of multicast flows with multiple subscribers, as it allows the hardware to perform the frame replication. The basic flow between the driver and the bridge is as follows: - When joining a bridge port, the switchdev driver calls switchdev_bridge_port_offload() with tx_fwd_offload = true. - The bridge sends offloadable skbs to one of the ports under the switchdev's control using skb->offload_fwd_mark = true. - The switchdev driver checks the skb->offload_fwd_mark field and lets its FDB lookup select the destination port mask for this packet. v1->v2: - convert br_input_skb_cb::fwd_hwdoms to a plain unsigned long - introduce a static key "br_switchdev_fwd_offload_used" to minimize the impact of the newly introduced feature on all the setups which don't have hardware that can make use of it - introduce a check for nbp->flags & BR_FWD_OFFLOAD to optimize cache line access - reorder nbp_switchdev_frame_mark_accel() and br_handle_vlan() in __br_forward() - do not strip VLAN on egress if forwarding offload on VLAN-aware bridge is being used - propagate errors from .ndo_dfwd_add_station() if not EOPNOTSUPP v2->v3: - replace the solution based on .ndo_dfwd_add_station with a solution based on switchdev_bridge_port_offload - rename BR_FWD_OFFLOAD to BR_TX_FWD_OFFLOAD v3->v4: rebase v4->v5: - make sure the static key is decremented on bridge port unoffload - more function and variable renaming and comments for them: br_switchdev_fwd_offload_used to br_switchdev_tx_fwd_offload br_switchdev_accels_skb to 
br_switchdev_frame_uses_tx_fwd_offload nbp_switchdev_frame_mark_tx_fwd to nbp_switchdev_frame_mark_tx_fwd_to_hwdom nbp_switchdev_frame_mark_accel to nbp_switchdev_frame_mark_tx_fwd_offload fwd_accel to tx_fwd_offload Signed-off-by: Tobias Waldekranz <tobias@waldekranz.com> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-22 18:55:38 +03:00
false, extack);
}
static int ofdpa_port_bridge_leave(struct ofdpa_port *ofdpa_port)
{
net: bridge: switchdev: let drivers inform which bridge ports are offloaded On reception of an skb, the bridge checks if it was marked as 'already forwarded in hardware' (checks if skb->offload_fwd_mark == 1), and if it is, it assigns the source hardware domain of that skb based on the hardware domain of the ingress port. Then during forwarding, it enforces that the egress port must have a different hardware domain than the ingress one (this is done in nbp_switchdev_allowed_egress). Non-switchdev drivers don't report any physical switch id (neither through devlink nor .ndo_get_port_parent_id), therefore the bridge assigns them a hardware domain of 0, and packets coming from them will always have skb->offload_fwd_mark = 0. So there aren't any restrictions. Problems appear due to the fact that DSA would like to perform software fallback for bonding and team interfaces that the physical switch cannot offload. +-- br0 ---+ / / | \ / / | \ / | | bond0 / | | / \ swp0 swp1 swp2 swp3 swp4 There, it is desirable that the presence of swp3 and swp4 under a non-offloaded LAG does not preclude us from doing hardware bridging between swp0, swp1 and swp2. The bandwidth of the CPU is often times high enough that software bridging between {swp0,swp1,swp2} and bond0 is not impractical. But this creates an impossible paradox given the current way in which port hardware domains are assigned. When the driver receives a packet from swp0 (say, due to flooding), it must set skb->offload_fwd_mark to something. - If we set it to 0, then the bridge will forward it towards swp1, swp2 and bond0. But the switch has already forwarded it towards swp1 and swp2 (not to bond0, remember, that isn't offloaded, so as far as the switch is concerned, ports swp3 and swp4 are not looking up the FDB, and the entire bond0 is a destination that is strictly behind the CPU). But we don't want duplicated traffic towards swp1 and swp2, so it's not ok to set skb->offload_fwd_mark = 0.
- If we set it to 1, then the bridge will not forward the skb towards the ports with the same switchdev mark, i.e. not to swp1, swp2 and bond0. Towards swp1 and swp2 that's ok, but towards bond0? It should have forwarded the skb there. So the real issue is that bond0 will be assigned the same hardware domain as {swp0,swp1,swp2}, because the function that assigns hardware domains to bridge ports, nbp_switchdev_add(), recurses through bond0's lower interfaces until it finds something that implements devlink (calls dev_get_port_parent_id with bool recurse = true). This is a problem because the fact that bond0 can be offloaded by swp3 and swp4 in our example is merely an assumption. A solution is to give the bridge explicit hints as to what hardware domain it should use for each port. Currently, the bridging offload is very 'silent': a driver registers a netdevice notifier, which is put on the netns's notifier chain, and which sniffs around for NETDEV_CHANGEUPPER events where the upper is a bridge, and the lower is an interface it knows about (one registered by this driver, normally). Then, from within that notifier, it does a bunch of stuff behind the bridge's back, without the bridge necessarily knowing that there's somebody offloading that port. It looks like this: ip link set swp0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v call_netdevice_notifiers | v dsa_slave_netdevice_event | v oh, hey! it's for me! | v .port_bridge_join What we do to solve the conundrum is to be less silent, and change the switchdev drivers to present themselves to the bridge. Something like this: ip link set swp0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge: Aye! I'll use this call_netdevice_notifiers ^ ppid as the | | hardware domain for v | this port, and zero dsa_slave_netdevice_event | if I got nothing. | | v | oh, hey! it's for me! 
| | | v | .port_bridge_join | | | +------------------------+ switchdev_bridge_port_offload(swp0, swp0) Then stacked interfaces (like bond0 on top of swp3/swp4) would be treated differently in DSA, depending on whether we can or cannot offload them. The offload case: ip link set bond0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge: Aye! I'll use this call_netdevice_notifiers ^ ppid as the | | switchdev mark for v | bond0. dsa_slave_netdevice_event | Coincidentally (or not), | | bond0 and swp0, swp1, swp2 v | all have the same switchdev hmm, it's not quite for me, | mark now, since the ASIC but my driver has already | is able to forward towards called .port_lag_join | all these ports in hw. for it, because I have | a port with dp->lag_dev == bond0. | | | v | .port_bridge_join | for swp3 and swp4 | | | +------------------------+ switchdev_bridge_port_offload(bond0, swp3) switchdev_bridge_port_offload(bond0, swp4) And the non-offload case: ip link set bond0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge waiting: call_netdevice_notifiers ^ huh, switchdev_bridge_port_offload | | wasn't called, okay, I'll use a v | hwdom of zero for this one. dsa_slave_netdevice_event : Then packets received on swp0 will | : not be software-forwarded towards v : swp1, but they will towards bond0. it's not for me, but bond0 is an upper of swp3 and swp4, but their dp->lag_dev is NULL because they couldn't offload it. Basically we can draw the conclusion that the lowers of a bridge port can come and go, so depending on the configuration of lowers for a bridge port, it can dynamically toggle between offloaded and unoffloaded. Therefore, we need an equivalent switchdev_bridge_port_unoffload too. This patch changes the way any switchdev driver interacts with the bridge. 
From now on, everybody needs to call switchdev_bridge_port_offload and switchdev_bridge_port_unoffload, otherwise the bridge will treat the port as non-offloaded and allow software flooding to other ports from the same ASIC. Note that these functions lay the ground for a more complex handshake between switchdev drivers and the bridge in the future. For drivers that will request a replay of the switchdev objects when they offload and unoffload a bridge port (DSA, dpaa2-switch, ocelot), we place the call to switchdev_bridge_port_unoffload() strategically inside the NETDEV_PRECHANGEUPPER notifier's code path, and not inside NETDEV_CHANGEUPPER. This is because the switchdev object replay helpers need the netdev adjacency lists to be valid, and that is only true in NETDEV_PRECHANGEUPPER. Cc: Vadym Kochan <vkochan@marvell.com> Cc: Taras Chornyi <tchornyi@marvell.com> Cc: Ioana Ciornei <ioana.ciornei@nxp.com> Cc: Lars Povlsen <lars.povlsen@microchip.com> Cc: Steen Hegelund <Steen.Hegelund@microchip.com> Cc: UNGLinuxDriver@microchip.com Cc: Claudiu Manoil <claudiu.manoil@nxp.com> Cc: Alexandre Belloni <alexandre.belloni@bootlin.com> Cc: Grygorii Strashko <grygorii.strashko@ti.com> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Tested-by: Ioana Ciornei <ioana.ciornei@nxp.com> # dpaa2-switch: regression Acked-by: Ioana Ciornei <ioana.ciornei@nxp.com> # dpaa2-switch Tested-by: Horatiu Vultur <horatiu.vultur@microchip.com> # ocelot-switch Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-21 19:24:01 +03:00
struct net_device *dev = ofdpa_port->dev;
int err;
net: bridge: move the switchdev object replay helpers to "push" mode Starting with commit 4f2673b3a2b6 ("net: bridge: add helper to replay port and host-joined mdb entries"), DSA has introduced some bridge helpers that replay switchdev events (FDB/MDB/VLAN additions and deletions) that can be lost by the switchdev drivers in a variety of circumstances: - an IP multicast group was host-joined on the bridge itself before any switchdev port joined the bridge, leading to the host MDB entries missing in the hardware database. - during the bridge creation process, the MAC address of the bridge was added to the FDB as an entry pointing towards the bridge device itself, but with no switchdev ports being part of the bridge yet, this local FDB entry would remain unknown to the switchdev hardware database. - a VLAN/FDB/MDB was added to a bridge port that is a LAG interface, before any switchdev port joined that LAG, leading to the hardware database missing those entries. - a switchdev port left a LAG that is a bridge port, while the LAG remained part of the bridge, and all FDB/MDB/VLAN entries remained installed in the hardware database of the switchdev port. Also, since commit 0d2cfbd41c4a ("net: bridge: ignore switchdev events for LAG ports which didn't request replay"), DSA introduced a method, based on a const void *ctx, to ensure that two switchdev ports under the same LAG that is a bridge port do not see the same MDB/VLAN entry being replayed twice by the bridge, once for every bridge port that joins the LAG. With so many ordering corner cases being possible, it seems unreasonable to expect a switchdev driver writer to get it right from the first try. Therefore, now that DSA has experimented with the bridge replay helpers for a little bit, we can move the code to the bridge driver where it is more readily available to all switchdev drivers. 
To convert the switchdev object replay helpers from "pull mode" (where the driver asks for them) to a "push mode" (where the bridge offers them automatically), the biggest problem is that the bridge needs to be aware when a switchdev port joins and leaves, even when the switchdev is only indirectly a bridge port (for example when the bridge port is a LAG upper of the switchdev). Luckily, we already have a hook for that, in the form of the newly introduced switchdev_bridge_port_offload() and switchdev_bridge_port_unoffload() calls. These offer a natural place for hooking the object addition and deletion replays. Extend the above 2 functions with: - pointers to the switchdev atomic notifier (for FDB replays) and the blocking notifier (for MDB and VLAN replays). - the "const void *ctx" argument required for drivers to be able to disambiguate between which port is targeted, when multiple ports are lowers of the same LAG that is a bridge port. Most of the drivers pass NULL to this argument, except the ones that support LAG offload and have the proper context check already in place in the switchdev blocking notifier handler. Also unexport the replay helpers, since nobody except the bridge calls them directly now. Note that: (a) we abuse the terminology slightly, because FDB entries are not "switchdev objects", but we count them as objects nonetheless. With no direct way to prove it, I think they are not modeled as switchdev objects because those can only be installed by the bridge to the hardware (as opposed to FDB entries which can be propagated in the other direction too). This is merely an abuse of terms, FDB entries are replayed too, despite not being objects. (b) the bridge does not attempt to sync port attributes to newly joined ports, just the countable stuff (the objects). The reason for this is simple: no universal and symmetric way to sync and unsync them is known. For example, VLAN filtering: what to do on unsync, disable or leave it enabled? 
Similarly, STP state, ageing timer, etc etc. What a switchdev port does when it becomes standalone again is not really up to the bridge's competence, and the driver should deal with it. On the other hand, replaying deletions of switchdev objects can be seen a matter of cleanup and therefore be treated by the bridge, hence this patch. We make the replay helpers opt-in for drivers, because they might not bring immediate benefits for them: - nbp_vlan_init() is called _after_ netdev_master_upper_dev_link(), so br_vlan_replay() should not do anything for the new drivers on which we call it. The existing drivers where there was even a slight possibility for there to exist a VLAN on a bridge port before they join it are already guarded against this: mlxsw and prestera deny joining LAG interfaces that are members of a bridge. - br_fdb_replay() should now notify of local FDB entries, but I patched all drivers except DSA to ignore these new entries in commit 2c4eca3ef716 ("net: bridge: switchdev: include local flag in FDB notifications"). Driver authors can lift this restriction as they wish, and when they do, they can also opt into the FDB replay functionality. - br_mdb_replay() should fix a real issue which is described in commit 4f2673b3a2b6 ("net: bridge: add helper to replay port and host-joined mdb entries"). However most drivers do not offload the SWITCHDEV_OBJ_ID_HOST_MDB to see this issue: only cpsw and am65_cpsw offload this switchdev object, and I don't completely understand the way in which they offload this switchdev object anyway. So I'll leave it up to these drivers' respective maintainers to opt into br_mdb_replay(). 
So most of the drivers pass NULL notifier blocks for the replay helpers, except: - dpaa2-switch which was already acked/regression-tested with the helpers enabled (and there isn't much of a downside in having them) - ocelot which already had replay logic in "pull" mode - DSA which already had replay logic in "pull" mode An important observation is that the drivers which don't currently request bridge event replays don't even have the switchdev_bridge_port_{offload,unoffload} calls placed in proper places right now. This was done to avoid unnecessary rework for drivers which might never even add support for this. For driver writers who wish to add replay support, this can be used as a tentative placement guide: https://patchwork.kernel.org/project/netdevbpf/patch/20210720134655.892334-11-vladimir.oltean@nxp.com/ Cc: Vadym Kochan <vkochan@marvell.com> Cc: Taras Chornyi <tchornyi@marvell.com> Cc: Ioana Ciornei <ioana.ciornei@nxp.com> Cc: Lars Povlsen <lars.povlsen@microchip.com> Cc: Steen Hegelund <Steen.Hegelund@microchip.com> Cc: UNGLinuxDriver@microchip.com Cc: Claudiu Manoil <claudiu.manoil@nxp.com> Cc: Alexandre Belloni <alexandre.belloni@bootlin.com> Cc: Grygorii Strashko <grygorii.strashko@ti.com> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Acked-by: Ioana Ciornei <ioana.ciornei@nxp.com> # dpaa2-switch Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-21 19:24:03 +03:00
switchdev_bridge_port_unoffload(dev, NULL, NULL, NULL);
net: bridge: switchdev: let drivers inform which bridge ports are offloaded On reception of an skb, the bridge checks if it was marked as 'already forwarded in hardware' (checks if skb->offload_fwd_mark == 1), and if it is, it assigns the source hardware domain of that skb based on the hardware domain of the ingress port. Then during forwarding, it enforces that the egress port must have a different hardware domain than the ingress one (this is done in nbp_switchdev_allowed_egress). Non-switchdev drivers don't report any physical switch id (neither through devlink nor .ndo_get_port_parent_id), therefore the bridge assigns them a hardware domain of 0, and packets coming from them will always have skb->offload_fwd_mark = 0. So there aren't any restrictions. Problems appear due to the fact that DSA would like to perform software fallback for bonding and team interfaces that the physical switch cannot offload. +-- br0 ---+ / / | \ / / | \ / | | bond0 / | | / \ swp0 swp1 swp2 swp3 swp4 There, it is desirable that the presence of swp3 and swp4 under a non-offloaded LAG does not preclude us from doing hardware bridging between swp0, swp1 and swp2. The bandwidth of the CPU is often times high enough that software bridging between {swp0,swp1,swp2} and bond0 is not impractical. But this creates an impossible paradox given the current way in which port hardware domains are assigned. When the driver receives a packet from swp0 (say, due to flooding), it must set skb->offload_fwd_mark to something. - If we set it to 0, then the bridge will forward it towards swp1, swp2 and bond0. But the switch has already forwarded it towards swp1 and swp2 (not to bond0, remember, that isn't offloaded, so as far as the switch is concerned, ports swp3 and swp4 are not looking up the FDB, and the entire bond0 is a destination that is strictly behind the CPU). But we don't want duplicated traffic towards swp1 and swp2, so it's not ok to set skb->offload_fwd_mark = 0. 
- If we set it to 1, then the bridge will not forward the skb towards the ports with the same switchdev mark, i.e. not to swp1, swp2 and bond0. Towards swp1 and swp2 that's ok, but towards bond0? It should have forwarded the skb there. So the real issue is that bond0 will be assigned the same hardware domain as {swp0,swp1,swp2}, because the function that assigns hardware domains to bridge ports, nbp_switchdev_add(), recurses through bond0's lower interfaces until it finds something that implements devlink (calls dev_get_port_parent_id with bool recurse = true). This is a problem because the fact that bond0 can be offloaded by swp3 and swp4 in our example is merely an assumption. A solution is to give the bridge explicit hints as to what hardware domain it should use for each port. Currently, the bridging offload is very 'silent': a driver registers a netdevice notifier, which is put on the netns's notifier chain, and which sniffs around for NETDEV_CHANGEUPPER events where the upper is a bridge, and the lower is an interface it knows about (one registered by this driver, normally). Then, from within that notifier, it does a bunch of stuff behind the bridge's back, without the bridge necessarily knowing that there's somebody offloading that port. It looks like this: ip link set swp0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v call_netdevice_notifiers | v dsa_slave_netdevice_event | v oh, hey! it's for me! | v .port_bridge_join What we do to solve the conundrum is to be less silent, and change the switchdev drivers to present themselves to the bridge. Something like this: ip link set swp0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge: Aye! I'll use this call_netdevice_notifiers ^ ppid as the | | hardware domain for v | this port, and zero dsa_slave_netdevice_event | if I got nothing. | | v | oh, hey! it's for me! 
| | | v | .port_bridge_join | | | +------------------------+ switchdev_bridge_port_offload(swp0, swp0) Then stacked interfaces (like bond0 on top of swp3/swp4) would be treated differently in DSA, depending on whether we can or cannot offload them. The offload case: ip link set bond0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge: Aye! I'll use this call_netdevice_notifiers ^ ppid as the | | switchdev mark for v | bond0. dsa_slave_netdevice_event | Coincidentally (or not), | | bond0 and swp0, swp1, swp2 v | all have the same switchdev hmm, it's not quite for me, | mark now, since the ASIC but my driver has already | is able to forward towards called .port_lag_join | all these ports in hw. for it, because I have | a port with dp->lag_dev == bond0. | | | v | .port_bridge_join | for swp3 and swp4 | | | +------------------------+ switchdev_bridge_port_offload(bond0, swp3) switchdev_bridge_port_offload(bond0, swp4) And the non-offload case: ip link set bond0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge waiting: call_netdevice_notifiers ^ huh, switchdev_bridge_port_offload | | wasn't called, okay, I'll use a v | hwdom of zero for this one. dsa_slave_netdevice_event : Then packets received on swp0 will | : not be software-forwarded towards v : swp1, but they will towards bond0. it's not for me, but bond0 is an upper of swp3 and swp4, but their dp->lag_dev is NULL because they couldn't offload it. Basically we can draw the conclusion that the lowers of a bridge port can come and go, so depending on the configuration of lowers for a bridge port, it can dynamically toggle between offloaded and unoffloaded. Therefore, we need an equivalent switchdev_bridge_port_unoffload too. This patch changes the way any switchdev driver interacts with the bridge. 
From now on, everybody needs to call switchdev_bridge_port_offload and switchdev_bridge_port_unoffload, otherwise the bridge will treat the port as non-offloaded and allow software flooding to other ports from the same ASIC. Note that these functions lay the ground for a more complex handshake between switchdev drivers and the bridge in the future. For drivers that will request a replay of the switchdev objects when they offload and unoffload a bridge port (DSA, dpaa2-switch, ocelot), we place the call to switchdev_bridge_port_unoffload() strategically inside the NETDEV_PRECHANGEUPPER notifier's code path, and not inside NETDEV_CHANGEUPPER. This is because the switchdev object replay helpers need the netdev adjacency lists to be valid, and that is only true in NETDEV_PRECHANGEUPPER. Cc: Vadym Kochan <vkochan@marvell.com> Cc: Taras Chornyi <tchornyi@marvell.com> Cc: Ioana Ciornei <ioana.ciornei@nxp.com> Cc: Lars Povlsen <lars.povlsen@microchip.com> Cc: Steen Hegelund <Steen.Hegelund@microchip.com> Cc: UNGLinuxDriver@microchip.com Cc: Claudiu Manoil <claudiu.manoil@nxp.com> Cc: Alexandre Belloni <alexandre.belloni@bootlin.com> Cc: Grygorii Strashko <grygorii.strashko@ti.com> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Tested-by: Ioana Ciornei <ioana.ciornei@nxp.com> # dpaa2-switch: regression Acked-by: Ioana Ciornei <ioana.ciornei@nxp.com> # dpaa2-switch Tested-by: Horatiu Vultur <horatiu.vultur@microchip.com> # ocelot-switch Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-21 19:24:01 +03:00
err = ofdpa_port_vlan_del(ofdpa_port, OFDPA_UNTAGGED_VID, 0);
if (err)
return err;
ofdpa_port_internal_vlan_id_put(ofdpa_port,
ofdpa_port->bridge_dev->ifindex);
ofdpa_port->internal_vlan_id =
ofdpa_port_internal_vlan_id_get(ofdpa_port,
ofdpa_port->dev->ifindex);
ofdpa_port->bridge_dev = NULL;
err = ofdpa_port_vlan_add(ofdpa_port, OFDPA_UNTAGGED_VID, 0);
if (err)
return err;
if (ofdpa_port->dev->flags & IFF_UP)
err = ofdpa_port_fwd_enable(ofdpa_port, 0);
return err;
}
/*
 * Record @master as the port's bridge_dev (callers in this file pass NULL
 * when the OVS master goes away) and then cycle forwarding: disable it
 * first, and only if that succeeded enable it again.
 *
 * Returns 0 on success or the first error reported by the disable/enable
 * helpers.
 */
static int ofdpa_port_ovs_changed(struct ofdpa_port *ofdpa_port,
				  struct net_device *master)
{
	int rc;

	ofdpa_port->bridge_dev = master;

	rc = ofdpa_port_fwd_disable(ofdpa_port, 0);
	if (rc)
		return rc;

	return ofdpa_port_fwd_enable(ofdpa_port, 0);
}
static int ofdpa_port_master_linked(struct rocker_port *rocker_port,
net: bridge: switchdev: let drivers inform which bridge ports are offloaded On reception of an skb, the bridge checks if it was marked as 'already forwarded in hardware' (checks if skb->offload_fwd_mark == 1), and if it is, it assigns the source hardware domain of that skb based on the hardware domain of the ingress port. Then during forwarding, it enforces that the egress port must have a different hardware domain than the ingress one (this is done in nbp_switchdev_allowed_egress). Non-switchdev drivers don't report any physical switch id (neither through devlink nor .ndo_get_port_parent_id), therefore the bridge assigns them a hardware domain of 0, and packets coming from them will always have skb->offload_fwd_mark = 0. So there aren't any restrictions. Problems appear due to the fact that DSA would like to perform software fallback for bonding and team interfaces that the physical switch cannot offload. +-- br0 ---+ / / | \ / / | \ / | | bond0 / | | / \ swp0 swp1 swp2 swp3 swp4 There, it is desirable that the presence of swp3 and swp4 under a non-offloaded LAG does not preclude us from doing hardware bridging beteen swp0, swp1 and swp2. The bandwidth of the CPU is often times high enough that software bridging between {swp0,swp1,swp2} and bond0 is not impractical. But this creates an impossible paradox given the current way in which port hardware domains are assigned. When the driver receives a packet from swp0 (say, due to flooding), it must set skb->offload_fwd_mark to something. - If we set it to 0, then the bridge will forward it towards swp1, swp2 and bond0. But the switch has already forwarded it towards swp1 and swp2 (not to bond0, remember, that isn't offloaded, so as far as the switch is concerned, ports swp3 and swp4 are not looking up the FDB, and the entire bond0 is a destination that is strictly behind the CPU). But we don't want duplicated traffic towards swp1 and swp2, so it's not ok to set skb->offload_fwd_mark = 0. 
- If we set it to 1, then the bridge will not forward the skb towards the ports with the same switchdev mark, i.e. not to swp1, swp2 and bond0. Towards swp1 and swp2 that's ok, but towards bond0? It should have forwarded the skb there. So the real issue is that bond0 will be assigned the same hardware domain as {swp0,swp1,swp2}, because the function that assigns hardware domains to bridge ports, nbp_switchdev_add(), recurses through bond0's lower interfaces until it finds something that implements devlink (calls dev_get_port_parent_id with bool recurse = true). This is a problem because the fact that bond0 can be offloaded by swp3 and swp4 in our example is merely an assumption. A solution is to give the bridge explicit hints as to what hardware domain it should use for each port. Currently, the bridging offload is very 'silent': a driver registers a netdevice notifier, which is put on the netns's notifier chain, and which sniffs around for NETDEV_CHANGEUPPER events where the upper is a bridge, and the lower is an interface it knows about (one registered by this driver, normally). Then, from within that notifier, it does a bunch of stuff behind the bridge's back, without the bridge necessarily knowing that there's somebody offloading that port. It looks like this: ip link set swp0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v call_netdevice_notifiers | v dsa_slave_netdevice_event | v oh, hey! it's for me! | v .port_bridge_join What we do to solve the conundrum is to be less silent, and change the switchdev drivers to present themselves to the bridge. Something like this: ip link set swp0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge: Aye! I'll use this call_netdevice_notifiers ^ ppid as the | | hardware domain for v | this port, and zero dsa_slave_netdevice_event | if I got nothing. | | v | oh, hey! it's for me! 
| | | v | .port_bridge_join | | | +------------------------+ switchdev_bridge_port_offload(swp0, swp0) Then stacked interfaces (like bond0 on top of swp3/swp4) would be treated differently in DSA, depending on whether we can or cannot offload them. The offload case: ip link set bond0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge: Aye! I'll use this call_netdevice_notifiers ^ ppid as the | | switchdev mark for v | bond0. dsa_slave_netdevice_event | Coincidentally (or not), | | bond0 and swp0, swp1, swp2 v | all have the same switchdev hmm, it's not quite for me, | mark now, since the ASIC but my driver has already | is able to forward towards called .port_lag_join | all these ports in hw. for it, because I have | a port with dp->lag_dev == bond0. | | | v | .port_bridge_join | for swp3 and swp4 | | | +------------------------+ switchdev_bridge_port_offload(bond0, swp3) switchdev_bridge_port_offload(bond0, swp4) And the non-offload case: ip link set bond0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge waiting: call_netdevice_notifiers ^ huh, switchdev_bridge_port_offload | | wasn't called, okay, I'll use a v | hwdom of zero for this one. dsa_slave_netdevice_event : Then packets received on swp0 will | : not be software-forwarded towards v : swp1, but they will towards bond0. it's not for me, but bond0 is an upper of swp3 and swp4, but their dp->lag_dev is NULL because they couldn't offload it. Basically we can draw the conclusion that the lowers of a bridge port can come and go, so depending on the configuration of lowers for a bridge port, it can dynamically toggle between offloaded and unoffloaded. Therefore, we need an equivalent switchdev_bridge_port_unoffload too. This patch changes the way any switchdev driver interacts with the bridge. 
From now on, everybody needs to call switchdev_bridge_port_offload and switchdev_bridge_port_unoffload, otherwise the bridge will treat the port as non-offloaded and allow software flooding to other ports from the same ASIC. Note that these functions lay the ground for a more complex handshake between switchdev drivers and the bridge in the future. For drivers that will request a replay of the switchdev objects when they offload and unoffload a bridge port (DSA, dpaa2-switch, ocelot), we place the call to switchdev_bridge_port_unoffload() strategically inside the NETDEV_PRECHANGEUPPER notifier's code path, and not inside NETDEV_CHANGEUPPER. This is because the switchdev object replay helpers need the netdev adjacency lists to be valid, and that is only true in NETDEV_PRECHANGEUPPER. Cc: Vadym Kochan <vkochan@marvell.com> Cc: Taras Chornyi <tchornyi@marvell.com> Cc: Ioana Ciornei <ioana.ciornei@nxp.com> Cc: Lars Povlsen <lars.povlsen@microchip.com> Cc: Steen Hegelund <Steen.Hegelund@microchip.com> Cc: UNGLinuxDriver@microchip.com Cc: Claudiu Manoil <claudiu.manoil@nxp.com> Cc: Alexandre Belloni <alexandre.belloni@bootlin.com> Cc: Grygorii Strashko <grygorii.strashko@ti.com> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Tested-by: Ioana Ciornei <ioana.ciornei@nxp.com> # dpaa2-switch: regression Acked-by: Ioana Ciornei <ioana.ciornei@nxp.com> # dpaa2-switch Tested-by: Horatiu Vultur <horatiu.vultur@microchip.com> # ocelot-switch Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-21 19:24:01 +03:00
struct net_device *master,
struct netlink_ext_ack *extack)
{
struct ofdpa_port *ofdpa_port = rocker_port->wpriv;
int err = 0;
if (netif_is_bridge_master(master))
net: bridge: switchdev: let drivers inform which bridge ports are offloaded On reception of an skb, the bridge checks if it was marked as 'already forwarded in hardware' (checks if skb->offload_fwd_mark == 1), and if it is, it assigns the source hardware domain of that skb based on the hardware domain of the ingress port. Then during forwarding, it enforces that the egress port must have a different hardware domain than the ingress one (this is done in nbp_switchdev_allowed_egress). Non-switchdev drivers don't report any physical switch id (neither through devlink nor .ndo_get_port_parent_id), therefore the bridge assigns them a hardware domain of 0, and packets coming from them will always have skb->offload_fwd_mark = 0. So there aren't any restrictions. Problems appear due to the fact that DSA would like to perform software fallback for bonding and team interfaces that the physical switch cannot offload. +-- br0 ---+ / / | \ / / | \ / | | bond0 / | | / \ swp0 swp1 swp2 swp3 swp4 There, it is desirable that the presence of swp3 and swp4 under a non-offloaded LAG does not preclude us from doing hardware bridging beteen swp0, swp1 and swp2. The bandwidth of the CPU is often times high enough that software bridging between {swp0,swp1,swp2} and bond0 is not impractical. But this creates an impossible paradox given the current way in which port hardware domains are assigned. When the driver receives a packet from swp0 (say, due to flooding), it must set skb->offload_fwd_mark to something. - If we set it to 0, then the bridge will forward it towards swp1, swp2 and bond0. But the switch has already forwarded it towards swp1 and swp2 (not to bond0, remember, that isn't offloaded, so as far as the switch is concerned, ports swp3 and swp4 are not looking up the FDB, and the entire bond0 is a destination that is strictly behind the CPU). But we don't want duplicated traffic towards swp1 and swp2, so it's not ok to set skb->offload_fwd_mark = 0. 
- If we set it to 1, then the bridge will not forward the skb towards the ports with the same switchdev mark, i.e. not to swp1, swp2 and bond0. Towards swp1 and swp2 that's ok, but towards bond0? It should have forwarded the skb there. So the real issue is that bond0 will be assigned the same hardware domain as {swp0,swp1,swp2}, because the function that assigns hardware domains to bridge ports, nbp_switchdev_add(), recurses through bond0's lower interfaces until it finds something that implements devlink (calls dev_get_port_parent_id with bool recurse = true). This is a problem because the fact that bond0 can be offloaded by swp3 and swp4 in our example is merely an assumption. A solution is to give the bridge explicit hints as to what hardware domain it should use for each port. Currently, the bridging offload is very 'silent': a driver registers a netdevice notifier, which is put on the netns's notifier chain, and which sniffs around for NETDEV_CHANGEUPPER events where the upper is a bridge, and the lower is an interface it knows about (one registered by this driver, normally). Then, from within that notifier, it does a bunch of stuff behind the bridge's back, without the bridge necessarily knowing that there's somebody offloading that port. It looks like this: ip link set swp0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v call_netdevice_notifiers | v dsa_slave_netdevice_event | v oh, hey! it's for me! | v .port_bridge_join What we do to solve the conundrum is to be less silent, and change the switchdev drivers to present themselves to the bridge. Something like this: ip link set swp0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge: Aye! I'll use this call_netdevice_notifiers ^ ppid as the | | hardware domain for v | this port, and zero dsa_slave_netdevice_event | if I got nothing. | | v | oh, hey! it's for me! 
| | | v | .port_bridge_join | | | +------------------------+ switchdev_bridge_port_offload(swp0, swp0) Then stacked interfaces (like bond0 on top of swp3/swp4) would be treated differently in DSA, depending on whether we can or cannot offload them. The offload case: ip link set bond0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge: Aye! I'll use this call_netdevice_notifiers ^ ppid as the | | switchdev mark for v | bond0. dsa_slave_netdevice_event | Coincidentally (or not), | | bond0 and swp0, swp1, swp2 v | all have the same switchdev hmm, it's not quite for me, | mark now, since the ASIC but my driver has already | is able to forward towards called .port_lag_join | all these ports in hw. for it, because I have | a port with dp->lag_dev == bond0. | | | v | .port_bridge_join | for swp3 and swp4 | | | +------------------------+ switchdev_bridge_port_offload(bond0, swp3) switchdev_bridge_port_offload(bond0, swp4) And the non-offload case: ip link set bond0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge waiting: call_netdevice_notifiers ^ huh, switchdev_bridge_port_offload | | wasn't called, okay, I'll use a v | hwdom of zero for this one. dsa_slave_netdevice_event : Then packets received on swp0 will | : not be software-forwarded towards v : swp1, but they will towards bond0. it's not for me, but bond0 is an upper of swp3 and swp4, but their dp->lag_dev is NULL because they couldn't offload it. Basically we can draw the conclusion that the lowers of a bridge port can come and go, so depending on the configuration of lowers for a bridge port, it can dynamically toggle between offloaded and unoffloaded. Therefore, we need an equivalent switchdev_bridge_port_unoffload too. This patch changes the way any switchdev driver interacts with the bridge. 
From now on, everybody needs to call switchdev_bridge_port_offload and switchdev_bridge_port_unoffload, otherwise the bridge will treat the port as non-offloaded and allow software flooding to other ports from the same ASIC. Note that these functions lay the ground for a more complex handshake between switchdev drivers and the bridge in the future. For drivers that will request a replay of the switchdev objects when they offload and unoffload a bridge port (DSA, dpaa2-switch, ocelot), we place the call to switchdev_bridge_port_unoffload() strategically inside the NETDEV_PRECHANGEUPPER notifier's code path, and not inside NETDEV_CHANGEUPPER. This is because the switchdev object replay helpers need the netdev adjacency lists to be valid, and that is only true in NETDEV_PRECHANGEUPPER. Cc: Vadym Kochan <vkochan@marvell.com> Cc: Taras Chornyi <tchornyi@marvell.com> Cc: Ioana Ciornei <ioana.ciornei@nxp.com> Cc: Lars Povlsen <lars.povlsen@microchip.com> Cc: Steen Hegelund <Steen.Hegelund@microchip.com> Cc: UNGLinuxDriver@microchip.com Cc: Claudiu Manoil <claudiu.manoil@nxp.com> Cc: Alexandre Belloni <alexandre.belloni@bootlin.com> Cc: Grygorii Strashko <grygorii.strashko@ti.com> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Tested-by: Ioana Ciornei <ioana.ciornei@nxp.com> # dpaa2-switch: regression Acked-by: Ioana Ciornei <ioana.ciornei@nxp.com> # dpaa2-switch Tested-by: Horatiu Vultur <horatiu.vultur@microchip.com> # ocelot-switch Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-21 19:24:01 +03:00
err = ofdpa_port_bridge_join(ofdpa_port, master, extack);
else if (netif_is_ovs_master(master))
err = ofdpa_port_ovs_changed(ofdpa_port, master);
return err;
}
/*
 * Dispatch a "master unlinked" event based on the port's current state
 * rather than on @master (which is not consulted here):
 *
 *  - a bridged port leaves its bridge;
 *  - otherwise an OVS-attached port has its master cleared (NULL);
 *  - otherwise there is nothing to undo.
 *
 * Returns 0 when nothing had to be done, otherwise the result of the
 * selected helper.
 */
static int ofdpa_port_master_unlinked(struct rocker_port *rocker_port,
				      struct net_device *master)
{
	struct ofdpa_port *ofdpa_port = rocker_port->wpriv;

	if (ofdpa_port_is_bridged(ofdpa_port))
		return ofdpa_port_bridge_leave(ofdpa_port);

	if (ofdpa_port_is_ovsed(ofdpa_port))
		return ofdpa_port_ovs_changed(ofdpa_port, NULL);

	return 0;
}
/*
 * Push a neighbour change to the port's IPv4 neighbour handling.
 *
 * The operation is always issued with OFDPA_OP_FLAG_NOWAIT. If none of the
 * NUD_VALID bits are set in the neighbour state, OFDPA_OP_FLAG_REMOVE is
 * added as well. The neighbour's primary key is read as a __be32 IPv4
 * address.
 *
 * Returns the result of ofdpa_port_ipv4_neigh().
 */
static int ofdpa_port_neigh_update(struct rocker_port *rocker_port,
				   struct neighbour *n)
{
	struct ofdpa_port *ofdpa_port = rocker_port->wpriv;
	__be32 ip_addr = *(__be32 *) n->primary_key;
	int flags = OFDPA_OP_FLAG_NOWAIT;

	if (!(n->nud_state & NUD_VALID))
		flags |= OFDPA_OP_FLAG_REMOVE;

	return ofdpa_port_ipv4_neigh(ofdpa_port, flags, ip_addr, n->ha);
}
/*
 * Issue a non-waiting removal (OFDPA_OP_FLAG_REMOVE | OFDPA_OP_FLAG_NOWAIT)
 * for neighbour @n. The neighbour's primary key is read as a __be32 IPv4
 * address.
 *
 * Returns the result of ofdpa_port_ipv4_neigh().
 */
static int ofdpa_port_neigh_destroy(struct rocker_port *rocker_port,
				    struct neighbour *n)
{
	struct ofdpa_port *ofdpa_port = rocker_port->wpriv;
	__be32 ip_addr = *(__be32 *) n->primary_key;

	return ofdpa_port_ipv4_neigh(ofdpa_port,
				     OFDPA_OP_FLAG_REMOVE |
				     OFDPA_OP_FLAG_NOWAIT,
				     ip_addr, n->ha);
}
/*
 * Handle a "MAC/VLAN seen" event for the port.
 *
 * The address is only passed to ofdpa_port_fdb() (flagged as learned and
 * non-waiting) while the port's STP state is LEARNING or FORWARDING; in any
 * other state the event is dropped and 0 is returned.
 */
static int ofdpa_port_ev_mac_vlan_seen(struct rocker_port *rocker_port,
				       const unsigned char *addr,
				       __be16 vlan_id)
{
	struct ofdpa_port *ofdpa_port = rocker_port->wpriv;
	bool may_learn;

	may_learn = ofdpa_port->stp_state == BR_STATE_LEARNING ||
		    ofdpa_port->stp_state == BR_STATE_FORWARDING;
	if (!may_learn)
		return 0;

	return ofdpa_port_fdb(ofdpa_port, addr, vlan_id,
			      OFDPA_OP_FLAG_NOWAIT | OFDPA_OP_FLAG_LEARNED);
}
/*
 * Look up a rocker port for @dev through rocker_port_dev_lower_find() and
 * return its OF-DPA private data (wpriv), or NULL when no rocker port was
 * found.
 */
static struct ofdpa_port *ofdpa_port_dev_lower_find(struct net_device *dev,
						    struct rocker *rocker)
{
	struct rocker_port *found = rocker_port_dev_lower_find(dev, rocker);

	if (!found)
		return NULL;

	return found->wpriv;
}
/*
 * Install an IPv4 FIB entry described by @fen_info.
 *
 * Nothing is done (and 0 is returned) once FIB offload has been aborted, or
 * when the device of the entry's first nexthop does not resolve to an OF-DPA
 * port. On successful installation the first nexthop is flagged
 * RTNH_F_OFFLOAD.
 *
 * Returns 0 on success or when the entry was skipped, otherwise the error
 * from ofdpa_port_fib_ipv4().
 */
static int ofdpa_fib4_add(struct rocker *rocker,
			  const struct fib_entry_notifier_info *fen_info)
{
	struct ofdpa *ofdpa = rocker->wpriv;
	struct ofdpa_port *port;
	struct fib_nh *nh;
	int rc;

	if (ofdpa->fib_aborted)
		return 0;

	/* Only the first nexthop of the fib_info is considered. */
	nh = fib_info_nh(fen_info->fi, 0);
	port = ofdpa_port_dev_lower_find(nh->fib_nh_dev, rocker);
	if (!port)
		return 0;

	rc = ofdpa_port_fib_ipv4(port, htonl(fen_info->dst),
				 fen_info->dst_len, fen_info->fi,
				 fen_info->tb_id, 0);
	if (!rc)
		nh->fib_nh_flags |= RTNH_F_OFFLOAD;

	return rc;
}
/*
 * Remove an IPv4 FIB entry described by @fen_info.
 *
 * Nothing is done (and 0 is returned) once FIB offload has been aborted, or
 * when the device of the entry's first nexthop does not resolve to an OF-DPA
 * port. Otherwise RTNH_F_OFFLOAD is cleared on that nexthop before the
 * removal is issued.
 *
 * Returns 0 when the entry was skipped, otherwise the result of
 * ofdpa_port_fib_ipv4() called with OFDPA_OP_FLAG_REMOVE.
 */
static int ofdpa_fib4_del(struct rocker *rocker,
			  const struct fib_entry_notifier_info *fen_info)
{
	struct ofdpa *ofdpa = rocker->wpriv;
	struct ofdpa_port *port;
	struct fib_nh *nh;

	if (ofdpa->fib_aborted)
		return 0;

	/* Only the first nexthop of the fib_info is considered. */
	nh = fib_info_nh(fen_info->fi, 0);
	port = ofdpa_port_dev_lower_find(nh->fib_nh_dev, rocker);
	if (port) {
		nh->fib_nh_flags &= ~RTNH_F_OFFLOAD;
		return ofdpa_port_fib_ipv4(port, htonl(fen_info->dst),
					   fen_info->dst_len, fen_info->fi,
					   fen_info->tb_id,
					   OFDPA_OP_FLAG_REMOVE);
	}

	return 0;
}
/*
 * Abort FIB handling for @rocker.
 *
 * Does nothing if ofdpa->fib_aborted is already set.  Otherwise walks
 * ofdpa->flow_tbl with flow_tbl_lock held (IRQ state saved) and, for every
 * entry whose key.tbl_id is ROCKER_OF_DPA_TABLE_ID_UNICAST_ROUTING and whose
 * first nexthop device resolves to an OF-DPA port:
 *   - clears RTNH_F_OFFLOAD on that nexthop, and
 *   - calls ofdpa_flow_tbl_del() with
 *     OFDPA_OP_FLAG_REMOVE | OFDPA_OP_FLAG_NOWAIT.
 *
 * fib_aborted is set to true once the walk is done and the lock released.
 *
 * NOTE(review): ofdpa_flow_tbl_del() is invoked while flow_tbl_lock is held
 * and is handed the entry currently being iterated; that helper is not
 * visible from here -- confirm it neither re-takes the lock nor conflicts
 * with the ongoing iteration.
 */
static void ofdpa_fib4_abort(struct rocker *rocker)
{
	struct ofdpa *ofdpa = rocker->wpriv;
	struct ofdpa_flow_tbl_entry *flow;
	struct hlist_node *next;
	unsigned long irq_flags;
	int bucket;

	if (ofdpa->fib_aborted)
		return;

	spin_lock_irqsave(&ofdpa->flow_tbl_lock, irq_flags);
	/* The _safe iterator is used because entries are deleted in the walk. */
	hash_for_each_safe(ofdpa->flow_tbl, bucket, next, flow, entry) {
		struct ofdpa_port *port;
		struct fib_nh *first_nh;

		if (flow->key.tbl_id != ROCKER_OF_DPA_TABLE_ID_UNICAST_ROUTING)
			continue;

		first_nh = fib_info_nh(flow->fi, 0);
		port = ofdpa_port_dev_lower_find(first_nh->fib_nh_dev, rocker);
		if (port) {
			first_nh->fib_nh_flags &= ~RTNH_F_OFFLOAD;
			ofdpa_flow_tbl_del(port,
					   OFDPA_OP_FLAG_REMOVE |
					   OFDPA_OP_FLAG_NOWAIT,
					   flow);
		}
	}
	spin_unlock_irqrestore(&ofdpa->flow_tbl_lock, irq_flags);

	ofdpa->fib_aborted = true;
}
/*
 * Operations table for the "ofdpa" world: binds each rocker_world_ops
 * callback to its ofdpa_* implementation in this file and declares the
 * sizes of the per-world (struct ofdpa) and per-port (struct ofdpa_port)
 * private areas.
 */
struct rocker_world_ops rocker_ofdpa_ops = {
	/* identification, private-data sizing and port mode */
	.kind					= "ofdpa",
	.priv_size				= sizeof(struct ofdpa),
	.port_priv_size				= sizeof(struct ofdpa_port),
	.mode					= ROCKER_PORT_MODE_OF_DPA,

	/* init / fini */
	.init					= ofdpa_init,
	.fini					= ofdpa_fini,

	/* port_* init, fini, open and stop */
	.port_pre_init				= ofdpa_port_pre_init,
	.port_init				= ofdpa_port_init,
	.port_fini				= ofdpa_port_fini,
	.port_open				= ofdpa_port_open,
	.port_stop				= ofdpa_port_stop,

	/* port_attr_* callbacks */
	.port_attr_stp_state_set		= ofdpa_port_attr_stp_state_set,
	.port_attr_bridge_flags_set		= ofdpa_port_attr_bridge_flags_set,
	.port_attr_bridge_flags_support_get	= ofdpa_port_attr_bridge_flags_support_get,
	.port_attr_bridge_ageing_time_set	= ofdpa_port_attr_bridge_ageing_time_set,

	/* port_obj_* callbacks */
	.port_obj_vlan_add			= ofdpa_port_obj_vlan_add,
	.port_obj_vlan_del			= ofdpa_port_obj_vlan_del,
	.port_obj_fdb_add			= ofdpa_port_obj_fdb_add,
	.port_obj_fdb_del			= ofdpa_port_obj_fdb_del,

	/* port_master_* callbacks */
	.port_master_linked			= ofdpa_port_master_linked,
	.port_master_unlinked			= ofdpa_port_master_unlinked,

	/* neighbour and MAC/VLAN event callbacks */
	.port_neigh_update			= ofdpa_port_neigh_update,
	.port_neigh_destroy			= ofdpa_port_neigh_destroy,
	.port_ev_mac_vlan_seen			= ofdpa_port_ev_mac_vlan_seen,

	/* fib4_* callbacks */
	.fib4_add				= ofdpa_fib4_add,
	.fib4_del				= ofdpa_fib4_del,
	.fib4_abort				= ofdpa_fib4_abort,
};