linux/net/tipc/ib_media.c
Jon Maloy 16ad3f4022 tipc: introduce variable window congestion control
We introduce a simple variable window congestion control for links.
The algorithm is inspired by the Reno algorithm, covering both 'slow
start', 'congestion avoidance', and 'fast recovery' modes.

- We introduce hard lower and upper window limits per link, still
  different and configurable per bearer type.

- We introduce a 'slow start theshold' variable, initially set to
  the maximum window size.

- We let a link start at the minimum congestion window, i.e. in slow
  start mode, and then let is grow rapidly (+1 per rceived ACK) until
  it reaches the slow start threshold and enters congestion avoidance
  mode.

- In congestion avoidance mode we increment the congestion window for
  each window-size number of acked packets, up to a possible maximum
  equal to the configured maximum window.

- For each non-duplicate NACK received, we drop back to fast recovery
  mode, by setting the both the slow start threshold to and the
  congestion window to (current_congestion_window / 2).

- If the timeout handler finds that the transmit queue has not moved
  since the previous timeout, it drops the link back to slow start
  and forces a probe containing the last sent sequence number to the
  sent to the peer, so that this can discover the stale situation.

This change does in reality have effect only on unicast ethernet
transport, as we have seen that there is no room whatsoever for
increasing the window max size for the UDP bearer.
For now, we also choose to keep the limits for the broadcast link
unchanged and equal.

This algorithm seems to give a 50-100% throughput improvement for
messages larger than MTU.

Suggested-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-10 17:31:15 -08:00

105 lines
3.7 KiB
C

/*
* net/tipc/ib_media.c: Infiniband bearer support for TIPC
*
* Copyright (c) 2013 Patrick McHardy <kaber@trash.net>
*
* Based on eth_media.c, which carries the following copyright notice:
*
* Copyright (c) 2001-2007, Ericsson AB
* Copyright (c) 2005-2008, 2011, Wind River Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the names of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* Alternatively, this software may be distributed under the terms of the
* GNU General Public License ("GPL") version 2 as published by the Free
* Software Foundation.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <linux/if_infiniband.h>
#include "core.h"
#include "bearer.h"
#define TIPC_MAX_IB_LINK_WIN 500
/* convert InfiniBand address (media address format) media address to string */
static int tipc_ib_addr2str(struct tipc_media_addr *a, char *str_buf,
int str_size)
{
if (str_size < 60) /* 60 = 19 * strlen("xx:") + strlen("xx\0") */
return 1;
sprintf(str_buf, "%20phC", a->value);
return 0;
}
/* Convert from media address format to discovery message addr format */
static int tipc_ib_addr2msg(char *msg, struct tipc_media_addr *addr)
{
memset(msg, 0, TIPC_MEDIA_INFO_SIZE);
memcpy(msg, addr->value, INFINIBAND_ALEN);
return 0;
}
/* Convert raw InfiniBand address format to media addr format */
static int tipc_ib_raw2addr(struct tipc_bearer *b,
struct tipc_media_addr *addr,
char *msg)
{
memset(addr, 0, sizeof(*addr));
memcpy(addr->value, msg, INFINIBAND_ALEN);
addr->media_id = TIPC_MEDIA_TYPE_IB;
addr->broadcast = !memcmp(msg, b->bcast_addr.value,
INFINIBAND_ALEN);
return 0;
}
/* Convert discovery msg addr format to InfiniBand media addr format */
static int tipc_ib_msg2addr(struct tipc_bearer *b,
struct tipc_media_addr *addr,
char *msg)
{
return tipc_ib_raw2addr(b, addr, msg);
}
/* InfiniBand media registration info */
struct tipc_media ib_media_info = {
.send_msg = tipc_l2_send_msg,
.enable_media = tipc_enable_l2_media,
.disable_media = tipc_disable_l2_media,
.addr2str = tipc_ib_addr2str,
.addr2msg = tipc_ib_addr2msg,
.msg2addr = tipc_ib_msg2addr,
.raw2addr = tipc_ib_raw2addr,
.priority = TIPC_DEF_LINK_PRI,
.tolerance = TIPC_DEF_LINK_TOL,
.min_win = TIPC_DEF_LINK_WIN,
.max_win = TIPC_MAX_IB_LINK_WIN,
.type_id = TIPC_MEDIA_TYPE_IB,
.hwaddr_len = INFINIBAND_ALEN,
.name = "ib"
};