linux/net/bridge/br_fdb.c
John Fastabend 77162022ab net: add generic PF_BRIDGE:RTM_ FDB hooks
This adds two new flags NTF_MASTER and NTF_SELF that can
now be used to specify where PF_BRIDGE netlink commands should
be sent. NTF_MASTER sends the commands to the 'dev->master'
device for parsing. Typically this will be the linux net/bridge,
or open-vswitch devices. Also without any flags set the command
will be handled by the master device as well so that current user
space tools continue to work as expected.

The NTF_SELF flag will push the PF_BRIDGE commands to the
device. In the basic example below the commands are then parsed
and programmed in the embedded bridge.

Note if both NTF_SELF and NTF_MASTER bits are set then the
command will be sent to both 'dev->master' and 'dev' this allows
user space to easily keep the embedded bridge and software bridge
in sync.

There is a slight complication in the case with both flags set
when an error occurs. To resolve this the rtnl handler clears
the NTF_ flag in the netlink ack to indicate which sets completed
successfully. The add/del handlers will abort as soon as any
error occurs.

To support this new net device ops were added to call into
the device and the existing bridging code was refactored
to use these. There should be no required changes in user space
to support the current bridge behavior.

A basic setup with a SR-IOV enabled NIC looks like this,

          veth0  veth2
            |      |
          ------------
          |  bridge0 |   <---- software bridging
          ------------
               /
               /
  ethx.y      ethx
    VF         PF
     \         \          <---- propagate FDB entries to HW
     \         \
  --------------------
  |  Embedded Bridge |    <---- hardware offloaded switching
  --------------------

In this case the embedded bridge must be managed to allow 'veth0'
to communicate with 'ethx.y' correctly. At present drivers managing
the embedded bridge either send frames onto the network which
then get dropped by the switch OR the embedded bridge will flood
these frames. With this patch we have a mechanism to manage the
embedded bridge correctly from user space. This example is specific
to SR-IOV but replacing the VF with another PF or dropping this
into the DSA framework generates similar management issues.

Examples session using the 'br'[1] tool to add, dump and then
delete a mac address with a new "embedded" option and enabled
ixgbe driver:

# br fdb add 22:35:19:ac:60:59 dev eth3
# br fdb
port    mac addr                flags
veth0   22:35:19:ac:60:58       static
veth0   9a:5f:81:f7:f6:ec       local
eth3    00:1b:21:55:23:59       local
eth3    22:35:19:ac:60:59       static
veth0   22:35:19:ac:60:57       static
#br fdb add 22:35:19:ac:60:59 embedded dev eth3
#br fdb
port    mac addr                flags
veth0   22:35:19:ac:60:58       static
veth0   9a:5f:81:f7:f6:ec       local
eth3    00:1b:21:55:23:59       local
eth3    22:35:19:ac:60:59       static
veth0   22:35:19:ac:60:57       static
eth3    22:35:19:ac:60:59       local embedded
#br fdb del 22:35:19:ac:60:59 embedded dev eth3

I added a couple lines to 'br' to set the flags correctly is all. It
is my opinion that the merit of this patch is now embedded and SW
bridges can both be modeled correctly in user space using very nearly
the same message passing.

[1] 'br' tool was published as an RFC here and will be renamed 'bridge'
    http://patchwork.ozlabs.org/patch/117664/

Thanks to Jamal Hadi Salim, Stephen Hemminger and Ben Hutchings for
valuable feedback, suggestions, and review.

v2: fixed api descriptions and error case with both NTF_SELF and
    NTF_MASTER set plus updated patch description.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-04-15 13:06:04 -04:00

676 lines
16 KiB
C

/*
* Forwarding database
* Linux ethernet bridge
*
* Authors:
* Lennert Buytenhek <buytenh@gnu.org>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/rculist.h>
#include <linux/spinlock.h>
#include <linux/times.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <linux/slab.h>
#include <linux/atomic.h>
#include <asm/unaligned.h>
#include "br_private.h"
static struct kmem_cache *br_fdb_cache __read_mostly;
static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
const unsigned char *addr);
static void fdb_notify(struct net_bridge *br,
const struct net_bridge_fdb_entry *, int);
static u32 fdb_salt __read_mostly;
int __init br_fdb_init(void)
{
br_fdb_cache = kmem_cache_create("bridge_fdb_cache",
sizeof(struct net_bridge_fdb_entry),
0,
SLAB_HWCACHE_ALIGN, NULL);
if (!br_fdb_cache)
return -ENOMEM;
get_random_bytes(&fdb_salt, sizeof(fdb_salt));
return 0;
}
void br_fdb_fini(void)
{
kmem_cache_destroy(br_fdb_cache);
}
/* if topology_changing then use forward_delay (default 15 sec)
* otherwise keep longer (default 5 minutes)
*/
static inline unsigned long hold_time(const struct net_bridge *br)
{
return br->topology_change ? br->forward_delay : br->ageing_time;
}
static inline int has_expired(const struct net_bridge *br,
const struct net_bridge_fdb_entry *fdb)
{
return !fdb->is_static &&
time_before_eq(fdb->updated + hold_time(br), jiffies);
}
static inline int br_mac_hash(const unsigned char *mac)
{
/* use 1 byte of OUI cnd 3 bytes of NIC */
u32 key = get_unaligned((u32 *)(mac + 2));
return jhash_1word(key, fdb_salt) & (BR_HASH_SIZE - 1);
}
static void fdb_rcu_free(struct rcu_head *head)
{
struct net_bridge_fdb_entry *ent
= container_of(head, struct net_bridge_fdb_entry, rcu);
kmem_cache_free(br_fdb_cache, ent);
}
static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f)
{
hlist_del_rcu(&f->hlist);
fdb_notify(br, f, RTM_DELNEIGH);
call_rcu(&f->rcu, fdb_rcu_free);
}
void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
{
struct net_bridge *br = p->br;
int i;
spin_lock_bh(&br->hash_lock);
/* Search all chains since old address/hash is unknown */
for (i = 0; i < BR_HASH_SIZE; i++) {
struct hlist_node *h;
hlist_for_each(h, &br->hash[i]) {
struct net_bridge_fdb_entry *f;
f = hlist_entry(h, struct net_bridge_fdb_entry, hlist);
if (f->dst == p && f->is_local) {
/* maybe another port has same hw addr? */
struct net_bridge_port *op;
list_for_each_entry(op, &br->port_list, list) {
if (op != p &&
!compare_ether_addr(op->dev->dev_addr,
f->addr.addr)) {
f->dst = op;
goto insert;
}
}
/* delete old one */
fdb_delete(br, f);
goto insert;
}
}
}
insert:
/* insert new address, may fail if invalid address or dup. */
fdb_insert(br, p, newaddr);
spin_unlock_bh(&br->hash_lock);
}
void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr)
{
struct net_bridge_fdb_entry *f;
/* If old entry was unassociated with any port, then delete it. */
f = __br_fdb_get(br, br->dev->dev_addr);
if (f && f->is_local && !f->dst)
fdb_delete(br, f);
fdb_insert(br, NULL, newaddr);
}
void br_fdb_cleanup(unsigned long _data)
{
struct net_bridge *br = (struct net_bridge *)_data;
unsigned long delay = hold_time(br);
unsigned long next_timer = jiffies + br->ageing_time;
int i;
spin_lock(&br->hash_lock);
for (i = 0; i < BR_HASH_SIZE; i++) {
struct net_bridge_fdb_entry *f;
struct hlist_node *h, *n;
hlist_for_each_entry_safe(f, h, n, &br->hash[i], hlist) {
unsigned long this_timer;
if (f->is_static)
continue;
this_timer = f->updated + delay;
if (time_before_eq(this_timer, jiffies))
fdb_delete(br, f);
else if (time_before(this_timer, next_timer))
next_timer = this_timer;
}
}
spin_unlock(&br->hash_lock);
mod_timer(&br->gc_timer, round_jiffies_up(next_timer));
}
/* Completely flush all dynamic entries in forwarding database.*/
void br_fdb_flush(struct net_bridge *br)
{
int i;
spin_lock_bh(&br->hash_lock);
for (i = 0; i < BR_HASH_SIZE; i++) {
struct net_bridge_fdb_entry *f;
struct hlist_node *h, *n;
hlist_for_each_entry_safe(f, h, n, &br->hash[i], hlist) {
if (!f->is_static)
fdb_delete(br, f);
}
}
spin_unlock_bh(&br->hash_lock);
}
/* Flush all entries referring to a specific port.
* if do_all is set also flush static entries
*/
void br_fdb_delete_by_port(struct net_bridge *br,
const struct net_bridge_port *p,
int do_all)
{
int i;
spin_lock_bh(&br->hash_lock);
for (i = 0; i < BR_HASH_SIZE; i++) {
struct hlist_node *h, *g;
hlist_for_each_safe(h, g, &br->hash[i]) {
struct net_bridge_fdb_entry *f
= hlist_entry(h, struct net_bridge_fdb_entry, hlist);
if (f->dst != p)
continue;
if (f->is_static && !do_all)
continue;
/*
* if multiple ports all have the same device address
* then when one port is deleted, assign
* the local entry to other port
*/
if (f->is_local) {
struct net_bridge_port *op;
list_for_each_entry(op, &br->port_list, list) {
if (op != p &&
!compare_ether_addr(op->dev->dev_addr,
f->addr.addr)) {
f->dst = op;
goto skip_delete;
}
}
}
fdb_delete(br, f);
skip_delete: ;
}
}
spin_unlock_bh(&br->hash_lock);
}
/* No locking or refcounting, assumes caller has rcu_read_lock */
struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br,
const unsigned char *addr)
{
struct hlist_node *h;
struct net_bridge_fdb_entry *fdb;
hlist_for_each_entry_rcu(fdb, h, &br->hash[br_mac_hash(addr)], hlist) {
if (!compare_ether_addr(fdb->addr.addr, addr)) {
if (unlikely(has_expired(br, fdb)))
break;
return fdb;
}
}
return NULL;
}
#if IS_ENABLED(CONFIG_ATM_LANE)
/* Interface used by ATM LANE hook to test
* if an addr is on some other bridge port */
int br_fdb_test_addr(struct net_device *dev, unsigned char *addr)
{
struct net_bridge_fdb_entry *fdb;
struct net_bridge_port *port;
int ret;
rcu_read_lock();
port = br_port_get_rcu(dev);
if (!port)
ret = 0;
else {
fdb = __br_fdb_get(port->br, addr);
ret = fdb && fdb->dst && fdb->dst->dev != dev &&
fdb->dst->state == BR_STATE_FORWARDING;
}
rcu_read_unlock();
return ret;
}
#endif /* CONFIG_ATM_LANE */
/*
* Fill buffer with forwarding table records in
* the API format.
*/
int br_fdb_fillbuf(struct net_bridge *br, void *buf,
unsigned long maxnum, unsigned long skip)
{
struct __fdb_entry *fe = buf;
int i, num = 0;
struct hlist_node *h;
struct net_bridge_fdb_entry *f;
memset(buf, 0, maxnum*sizeof(struct __fdb_entry));
rcu_read_lock();
for (i = 0; i < BR_HASH_SIZE; i++) {
hlist_for_each_entry_rcu(f, h, &br->hash[i], hlist) {
if (num >= maxnum)
goto out;
if (has_expired(br, f))
continue;
/* ignore pseudo entry for local MAC address */
if (!f->dst)
continue;
if (skip) {
--skip;
continue;
}
/* convert from internal format to API */
memcpy(fe->mac_addr, f->addr.addr, ETH_ALEN);
/* due to ABI compat need to split into hi/lo */
fe->port_no = f->dst->port_no;
fe->port_hi = f->dst->port_no >> 8;
fe->is_local = f->is_local;
if (!f->is_static)
fe->ageing_timer_value = jiffies_to_clock_t(jiffies - f->updated);
++fe;
++num;
}
}
out:
rcu_read_unlock();
return num;
}
static struct net_bridge_fdb_entry *fdb_find(struct hlist_head *head,
const unsigned char *addr)
{
struct hlist_node *h;
struct net_bridge_fdb_entry *fdb;
hlist_for_each_entry(fdb, h, head, hlist) {
if (!compare_ether_addr(fdb->addr.addr, addr))
return fdb;
}
return NULL;
}
static struct net_bridge_fdb_entry *fdb_find_rcu(struct hlist_head *head,
const unsigned char *addr)
{
struct hlist_node *h;
struct net_bridge_fdb_entry *fdb;
hlist_for_each_entry_rcu(fdb, h, head, hlist) {
if (!compare_ether_addr(fdb->addr.addr, addr))
return fdb;
}
return NULL;
}
static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
struct net_bridge_port *source,
const unsigned char *addr)
{
struct net_bridge_fdb_entry *fdb;
fdb = kmem_cache_alloc(br_fdb_cache, GFP_ATOMIC);
if (fdb) {
memcpy(fdb->addr.addr, addr, ETH_ALEN);
fdb->dst = source;
fdb->is_local = 0;
fdb->is_static = 0;
fdb->updated = fdb->used = jiffies;
hlist_add_head_rcu(&fdb->hlist, head);
}
return fdb;
}
static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
const unsigned char *addr)
{
struct hlist_head *head = &br->hash[br_mac_hash(addr)];
struct net_bridge_fdb_entry *fdb;
if (!is_valid_ether_addr(addr))
return -EINVAL;
fdb = fdb_find(head, addr);
if (fdb) {
/* it is okay to have multiple ports with same
* address, just use the first one.
*/
if (fdb->is_local)
return 0;
br_warn(br, "adding interface %s with same address "
"as a received packet\n",
source->dev->name);
fdb_delete(br, fdb);
}
fdb = fdb_create(head, source, addr);
if (!fdb)
return -ENOMEM;
fdb->is_local = fdb->is_static = 1;
fdb_notify(br, fdb, RTM_NEWNEIGH);
return 0;
}
/* Add entry for local address of interface */
int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
const unsigned char *addr)
{
int ret;
spin_lock_bh(&br->hash_lock);
ret = fdb_insert(br, source, addr);
spin_unlock_bh(&br->hash_lock);
return ret;
}
void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
const unsigned char *addr)
{
struct hlist_head *head = &br->hash[br_mac_hash(addr)];
struct net_bridge_fdb_entry *fdb;
/* some users want to always flood. */
if (hold_time(br) == 0)
return;
/* ignore packets unless we are using this port */
if (!(source->state == BR_STATE_LEARNING ||
source->state == BR_STATE_FORWARDING))
return;
fdb = fdb_find_rcu(head, addr);
if (likely(fdb)) {
/* attempt to update an entry for a local interface */
if (unlikely(fdb->is_local)) {
if (net_ratelimit())
br_warn(br, "received packet on %s with "
"own address as source address\n",
source->dev->name);
} else {
/* fastpath: update of existing entry */
fdb->dst = source;
fdb->updated = jiffies;
}
} else {
spin_lock(&br->hash_lock);
if (likely(!fdb_find(head, addr))) {
fdb = fdb_create(head, source, addr);
if (fdb)
fdb_notify(br, fdb, RTM_NEWNEIGH);
}
/* else we lose race and someone else inserts
* it first, don't bother updating
*/
spin_unlock(&br->hash_lock);
}
}
static int fdb_to_nud(const struct net_bridge_fdb_entry *fdb)
{
if (fdb->is_local)
return NUD_PERMANENT;
else if (fdb->is_static)
return NUD_NOARP;
else if (has_expired(fdb->dst->br, fdb))
return NUD_STALE;
else
return NUD_REACHABLE;
}
static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br,
const struct net_bridge_fdb_entry *fdb,
u32 pid, u32 seq, int type, unsigned int flags)
{
unsigned long now = jiffies;
struct nda_cacheinfo ci;
struct nlmsghdr *nlh;
struct ndmsg *ndm;
nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags);
if (nlh == NULL)
return -EMSGSIZE;
ndm = nlmsg_data(nlh);
ndm->ndm_family = AF_BRIDGE;
ndm->ndm_pad1 = 0;
ndm->ndm_pad2 = 0;
ndm->ndm_flags = 0;
ndm->ndm_type = 0;
ndm->ndm_ifindex = fdb->dst ? fdb->dst->dev->ifindex : br->dev->ifindex;
ndm->ndm_state = fdb_to_nud(fdb);
if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->addr))
goto nla_put_failure;
ci.ndm_used = jiffies_to_clock_t(now - fdb->used);
ci.ndm_confirmed = 0;
ci.ndm_updated = jiffies_to_clock_t(now - fdb->updated);
ci.ndm_refcnt = 0;
if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
goto nla_put_failure;
return nlmsg_end(skb, nlh);
nla_put_failure:
nlmsg_cancel(skb, nlh);
return -EMSGSIZE;
}
static inline size_t fdb_nlmsg_size(void)
{
return NLMSG_ALIGN(sizeof(struct ndmsg))
+ nla_total_size(ETH_ALEN) /* NDA_LLADDR */
+ nla_total_size(sizeof(struct nda_cacheinfo));
}
static void fdb_notify(struct net_bridge *br,
const struct net_bridge_fdb_entry *fdb, int type)
{
struct net *net = dev_net(br->dev);
struct sk_buff *skb;
int err = -ENOBUFS;
skb = nlmsg_new(fdb_nlmsg_size(), GFP_ATOMIC);
if (skb == NULL)
goto errout;
err = fdb_fill_info(skb, br, fdb, 0, 0, type, 0);
if (err < 0) {
/* -EMSGSIZE implies BUG in fdb_nlmsg_size() */
WARN_ON(err == -EMSGSIZE);
kfree_skb(skb);
goto errout;
}
rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
return;
errout:
if (err < 0)
rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
}
/* Dump information about entries, in response to GETNEIGH */
int br_fdb_dump(struct sk_buff *skb,
struct netlink_callback *cb,
struct net_device *dev,
int idx)
{
struct net_bridge *br = netdev_priv(dev);
int i;
if (!(dev->priv_flags & IFF_EBRIDGE))
goto out;
for (i = 0; i < BR_HASH_SIZE; i++) {
struct hlist_node *h;
struct net_bridge_fdb_entry *f;
hlist_for_each_entry_rcu(f, h, &br->hash[i], hlist) {
if (idx < cb->args[0])
goto skip;
if (fdb_fill_info(skb, br, f,
NETLINK_CB(cb->skb).pid,
cb->nlh->nlmsg_seq,
RTM_NEWNEIGH,
NLM_F_MULTI) < 0)
break;
skip:
++idx;
}
}
out:
return idx;
}
/* Update (create or replace) forwarding database entry */
static int fdb_add_entry(struct net_bridge_port *source, const __u8 *addr,
__u16 state, __u16 flags)
{
struct net_bridge *br = source->br;
struct hlist_head *head = &br->hash[br_mac_hash(addr)];
struct net_bridge_fdb_entry *fdb;
fdb = fdb_find(head, addr);
if (fdb == NULL) {
if (!(flags & NLM_F_CREATE))
return -ENOENT;
fdb = fdb_create(head, source, addr);
if (!fdb)
return -ENOMEM;
fdb_notify(br, fdb, RTM_NEWNEIGH);
} else {
if (flags & NLM_F_EXCL)
return -EEXIST;
}
if (fdb_to_nud(fdb) != state) {
if (state & NUD_PERMANENT)
fdb->is_local = fdb->is_static = 1;
else if (state & NUD_NOARP) {
fdb->is_local = 0;
fdb->is_static = 1;
} else
fdb->is_local = fdb->is_static = 0;
fdb->updated = fdb->used = jiffies;
fdb_notify(br, fdb, RTM_NEWNEIGH);
}
return 0;
}
/* Add new permanent fdb entry with RTM_NEWNEIGH */
int br_fdb_add(struct ndmsg *ndm, struct net_device *dev,
unsigned char *addr, u16 nlh_flags)
{
struct net_bridge_port *p;
int err = 0;
if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE))) {
pr_info("bridge: RTM_NEWNEIGH with invalid state %#x\n", ndm->ndm_state);
return -EINVAL;
}
p = br_port_get_rtnl(dev);
if (p == NULL) {
pr_info("bridge: RTM_NEWNEIGH %s not a bridge port\n",
dev->name);
return -EINVAL;
}
if (ndm->ndm_flags & NTF_USE) {
rcu_read_lock();
br_fdb_update(p->br, p, addr);
rcu_read_unlock();
} else {
spin_lock_bh(&p->br->hash_lock);
err = fdb_add_entry(p, addr, ndm->ndm_state, nlh_flags);
spin_unlock_bh(&p->br->hash_lock);
}
return err;
}
static int fdb_delete_by_addr(struct net_bridge_port *p, u8 *addr)
{
struct net_bridge *br = p->br;
struct hlist_head *head = &br->hash[br_mac_hash(addr)];
struct net_bridge_fdb_entry *fdb;
fdb = fdb_find(head, addr);
if (!fdb)
return -ENOENT;
fdb_delete(p->br, fdb);
return 0;
}
/* Remove neighbor entry with RTM_DELNEIGH */
int br_fdb_delete(struct ndmsg *ndm, struct net_device *dev,
unsigned char *addr)
{
struct net_bridge_port *p;
int err;
p = br_port_get_rtnl(dev);
if (p == NULL) {
pr_info("bridge: RTM_DELNEIGH %s not a bridge port\n",
dev->name);
return -EINVAL;
}
spin_lock_bh(&p->br->hash_lock);
err = fdb_delete_by_addr(p, addr);
spin_unlock_bh(&p->br->hash_lock);
return err;
}