forked from Minki/linux
35c55c9877
TIPC-based clusters are by default set up with full-mesh link connectivity between all nodes. Those links are expected to provide a short failure detection time, by default set to 1500 ms. Because of this, the background load for neighbor monitoring in an N-node cluster increases by a factor of N on each node, while the overall monitoring traffic through the network infrastructure increases at a ~(N * (N - 1)) rate. Experience has shown that such clusters don't scale well beyond ~100 nodes unless we significantly increase failure discovery tolerance. This commit introduces a framework and an algorithm that drastically reduces this background load, while basically maintaining the original failure detection times across the whole cluster. Using this algorithm, background load will now grow at a rate of ~(2 * sqrt(N)) per node, and at ~(2 * N * sqrt(N)) in traffic overhead. As an example, each node will now have to actively monitor 38 neighbors in a 400-node cluster, instead of 399 as before. This "Overlapping Ring Supervision Algorithm" is completely distributed and employs no centralized or coordinated state. It goes as follows: - Each node makes up a linearly ascending, circular list of all its N known neighbors, based on their TIPC node identity. This algorithm must be the same on all nodes. - The node then selects the next M = sqrt(N) - 1 nodes downstream from itself in the list, and chooses to actively monitor those. This is called its "local monitoring domain". - It creates a domain record describing the monitoring domain, and piggy-backs this in the data area of all neighbor monitoring messages (LINK_PROTOCOL/STATE) leaving that node. This means that all nodes in the cluster eventually (default within 400 ms) will learn about its monitoring domain. - Whenever a node discovers a change in its local domain, e.g., a node has been added or has gone down, it creates and sends out a new version of its domain record to inform all neighbors about the change.
- A node receiving a domain record from anybody outside its local domain matches this against its own list (which may not look the same), and chooses not to actively monitor those members of the received domain record that are also present in its own list. Instead, it relies on indications from the direct monitoring nodes if an indirectly monitored node has gone up or down. If a node is indicated lost, the receiving node temporarily activates its own direct monitoring towards that node in order to confirm, or not, that it is actually gone. - Since each node is actively monitoring sqrt(N) downstream neighbors, each node is also actively monitored by the same number of upstream neighbors. This means that all non-direct monitoring nodes normally will receive sqrt(N) indications that a node is gone. - A major drawback with ring monitoring is how it handles failures that cause massive network partitioning. If both a lost node and all its direct monitoring neighbors are inside the lost partition, the nodes in the remaining partition will never receive indications about the loss. To overcome this, each node also chooses to actively monitor some nodes outside its local domain. Those nodes are called remote domain "heads", and are selected in such a way that no node in the cluster will be more than two direct monitoring hops away. Because of this, each node, apart from monitoring the members of its local domain, will also typically monitor sqrt(N) remote head nodes. - As an optimization, local list status, domain status and domain records are marked with a generation number. This saves senders from unnecessarily conveying unaltered domain records, and receivers from performing unneeded re-adaptations of their node monitoring list, such as re-assigning domain heads. - As a measure of caution we have added the possibility to disable the new algorithm through configuration.
We do this by keeping a threshold value for the cluster size; a cluster that grows beyond this value will switch from full-mesh to ring monitoring, and vice versa when it shrinks below the value. This means that if the threshold is set to a value larger than any anticipated cluster size (the default threshold is 32), the new algorithm is effectively disabled. A patch set for altering the threshold value and for listing the table contents will follow shortly. - This change is fully backwards compatible. Acked-by: Ying Xue <ying.xue@windriver.com> Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Signed-off-by: David S. Miller <davem@davemloft.net>
173 lines
4.6 KiB
C
173 lines
4.6 KiB
C
/*
 * net/tipc/core.h: Include file for TIPC global declarations
 *
 * Copyright (c) 2005-2006, 2013 Ericsson AB
 * Copyright (c) 2005-2007, 2010-2013, Wind River Systems
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the names of the copyright holders nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * Alternatively, this software may be distributed under the terms of the
 * GNU General Public License ("GPL") version 2 as published by the Free
 * Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
|
|
|
|
#ifndef _TIPC_CORE_H
|
|
#define _TIPC_CORE_H
|
|
|
|
#include <linux/tipc.h>
|
|
#include <linux/tipc_config.h>
|
|
#include <linux/tipc_netlink.h>
|
|
#include <linux/types.h>
|
|
#include <linux/kernel.h>
|
|
#include <linux/errno.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/timer.h>
|
|
#include <linux/string.h>
|
|
#include <linux/uaccess.h>
|
|
#include <linux/interrupt.h>
|
|
#include <linux/atomic.h>
|
|
#include <asm/hardirq.h>
|
|
#include <linux/netdevice.h>
|
|
#include <linux/in.h>
|
|
#include <linux/list.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/vmalloc.h>
|
|
#include <linux/rtnetlink.h>
|
|
#include <linux/etherdevice.h>
|
|
#include <net/netns/generic.h>
|
|
#include <linux/rhashtable.h>
|
|
|
|
/*
 * Forward declarations. These are incomplete types as far as this header
 * is concerned, so it can only refer to them through pointers.
 */
struct tipc_node;
struct tipc_bearer;
struct tipc_bc_base;
struct tipc_link;
struct tipc_name_table;
struct tipc_server;
struct tipc_monitor;
|
|
|
|
/* Version string of the TIPC module */
#define TIPC_MOD_VER "2.0.0"

/*
 * Number of buckets in tipc_net::node_htable.
 * Must remain a power of two: tipc_hashfn() reduces an address with
 * '& (NODE_HTABLE_SIZE - 1)', which only covers every bucket in that case.
 */
#define NODE_HTABLE_SIZE 512

/* Sizes tipc_net::monitors[] and, plus one, tipc_net::bearer_list[] */
#define MAX_BEARERS 3

/*
 * NOTE(review): presumably the initial value of tipc_net::mon_threshold;
 * the assignment is not visible in this header - confirm.
 */
#define TIPC_DEF_MON_THRESHOLD 32
|
|
|
|
/*
 * Variables declared here and defined outside this header.
 * tipc_net_id is the id that tipc_net() hands to net_generic().
 */
extern int tipc_net_id __read_mostly;
extern int sysctl_tipc_rmem[3] __read_mostly;
extern int sysctl_tipc_named_timeout __read_mostly;
|
|
|
|
struct tipc_net {
|
|
u32 own_addr;
|
|
int net_id;
|
|
int random;
|
|
|
|
/* Node table and node list */
|
|
spinlock_t node_list_lock;
|
|
struct hlist_head node_htable[NODE_HTABLE_SIZE];
|
|
struct list_head node_list;
|
|
u32 num_nodes;
|
|
u32 num_links;
|
|
|
|
/* Neighbor monitoring list */
|
|
struct tipc_monitor *monitors[MAX_BEARERS];
|
|
int mon_threshold;
|
|
|
|
/* Bearer list */
|
|
struct tipc_bearer __rcu *bearer_list[MAX_BEARERS + 1];
|
|
|
|
/* Broadcast link */
|
|
spinlock_t bclock;
|
|
struct tipc_bc_base *bcbase;
|
|
struct tipc_link *bcl;
|
|
|
|
/* Socket hash table */
|
|
struct rhashtable sk_rht;
|
|
|
|
/* Name table */
|
|
spinlock_t nametbl_lock;
|
|
struct name_table *nametbl;
|
|
|
|
/* Name dist queue */
|
|
struct list_head dist_queue;
|
|
|
|
/* Topology subscription server */
|
|
struct tipc_server *topsrv;
|
|
atomic_t subscription_count;
|
|
};
|
|
|
|
static inline struct tipc_net *tipc_net(struct net *net)
|
|
{
|
|
return net_generic(net, tipc_net_id);
|
|
}
|
|
|
|
static inline int tipc_netid(struct net *net)
|
|
{
|
|
return tipc_net(net)->net_id;
|
|
}
|
|
|
|
static inline struct list_head *tipc_nodes(struct net *net)
|
|
{
|
|
return &tipc_net(net)->node_list;
|
|
}
|
|
|
|
/**
 * tipc_hashfn - reduce an address to an index below NODE_HTABLE_SIZE
 * @addr: 32-bit address to reduce
 *
 * Keeps only the low-order bits of @addr. Masking with
 * (NODE_HTABLE_SIZE - 1) yields every index exactly because
 * NODE_HTABLE_SIZE is a power of two.
 *
 * Return: a value in the range 0 .. NODE_HTABLE_SIZE - 1.
 */
static inline unsigned int tipc_hashfn(u32 addr)
{
	const u32 mask = NODE_HTABLE_SIZE - 1;

	return addr & mask;
}
|
|
|
|
/**
 * mod - reduce a value modulo 65536
 * @x: value to reduce; the call itself already converts it to u16
 *
 * Lets callers pass the result of an integer-promoted expression such as
 * 'right - left' and get it back wrapped into the 16-bit range.
 *
 * Return: @x masked with 0xffff.
 */
static inline u16 mod(u16 x)
{
	return x & 0xffffu;
}
|
|
|
|
/**
 * less_eq - wrap-around "left <= right" for 16-bit numbers
 * @left: first number
 * @right: second number
 *
 * The distance from @left forward to @right is taken modulo 65536, so the
 * comparison keeps working when the numbers wrap past 0xffff.
 *
 * Return: non-zero if @right equals @left or lies fewer than 32768 steps
 * ahead of it, zero otherwise.
 */
static inline int less_eq(u16 left, u16 right)
{
	u16 distance = mod(right - left);

	return distance < 32768u;
}
|
|
|
|
/**
 * more - wrap-around "left > right" for 16-bit numbers
 * @left: first number
 * @right: second number
 *
 * Return: the logical negation of less_eq(@left, @right).
 */
static inline int more(u16 left, u16 right)
{
	return !less_eq(left, right);
}
|
|
|
|
/**
 * less - wrap-around "left < right" for 16-bit numbers
 * @left: first number
 * @right: second number
 *
 * Return: non-zero if less_eq(@left, @right) holds and the two numbers
 * differ, zero otherwise.
 */
static inline int less(u16 left, u16 right)
{
	/* Equal values satisfy less_eq() but are not strictly less */
	if (mod(left) == mod(right))
		return 0;

	return less_eq(left, right);
}
|
|
|
|
/**
 * in_range - wrap-around test for "min <= val <= max" on 16-bit numbers
 * @val: number to test
 * @min: lower bound, inclusive
 * @max: upper bound, inclusive
 *
 * Both bounds are checked with the wrap-around helpers less() and more().
 *
 * Return: non-zero if @val is neither less than @min nor more than @max,
 * zero otherwise.
 */
static inline int in_range(u16 val, u16 min, u16 max)
{
	if (less(val, min))
		return 0;

	return !more(val, max);
}
|
|
|
|
/*
 * Sysctl registration hooks.
 *
 * With CONFIG_SYSCTL set, both functions are implemented outside this
 * header. Without it, no-op stubs are provided so callers need no #ifdef
 * of their own: tipc_register_sysctl() returns 0 and
 * tipc_unregister_sysctl() does nothing.
 *
 * The stubs are static inline functions rather than macros so that calls
 * are type-checked the same way in both configurations.
 */
#ifdef CONFIG_SYSCTL
int tipc_register_sysctl(void);
void tipc_unregister_sysctl(void);
#else
static inline int tipc_register_sysctl(void)
{
	return 0;
}

static inline void tipc_unregister_sysctl(void)
{
}
#endif
|
|
#endif
|