mirror of
https://github.com/torvalds/linux.git
synced 2024-11-10 14:11:52 +00:00
6025b9135f
softnet_data->time_squeeze is sometimes used as a proxy for host overload or indication of scheduling problems. In practice this statistic is very noisy and has hard to grasp units - e.g. is 10 squeezes a second to be expected, or high? Delaying network (NAPI) processing leads to drops on NIC queues but also RTT bloat, impacting pacing and CA decisions. Stalls are a little hard to detect on the Rx side, because there may simply have not been any packets received in given period of time. Packet timestamps help a little bit, but again we don't know if packets are stale because we're not keeping up or because someone (*cough* cgroups) disabled IRQs for a long time. We can, however, use Tx as a proxy for Rx stalls. Most drivers use combined Rx+Tx NAPIs so if Tx gets starved so will Rx. On the Tx side we know exactly when packets get queued, and completed, so there is no uncertainty. This patch adds stall checks to BQL. Why BQL? Because it's a convenient place to add such checks, already called by most drivers, and it has copious free space in its structures (this patch adds no extra cache references or dirtying to the fast path). The algorithm takes one parameter - max delay AKA stall threshold and increments a counter whenever NAPI got delayed for at least that amount of time. It also records the length of the longest stall. To be precise every time NAPI has not polled for at least stall thrs we check if there were any Tx packets queued between last NAPI run and now - stall_thrs/2. Unlike the classic Tx watchdog this mechanism does not ignore stalls caused by Tx being disabled, or loss of link. I don't think the check is worth the complexity, and stall is a stall, whether due to host overload, flow control, link down... doesn't matter much to the application. We have been running this detector in production at Meta for 2 years, with the threshold of 8ms. It's the lowest value where false positives become rare. There's still a constant stream of reported stalls (especially without the ksoftirqd deferral patches reverted), those who like their stall metrics to be 0 may prefer higher value. Signed-off-by: Jakub Kicinski <kuba@kernel.org> Signed-off-by: Breno Leitao <leitao@debian.org> Signed-off-by: David S. Miller <davem@davemloft.net>
213 lines
6.3 KiB
C
213 lines
6.3 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
/*
|
|
* Dynamic byte queue limits. See include/linux/dynamic_queue_limits.h
|
|
*
|
|
* Copyright (c) 2011, Tom Herbert <therbert@google.com>
|
|
*/
|
|
#include <linux/types.h>
|
|
#include <linux/kernel.h>
|
|
#include <linux/jiffies.h>
|
|
#include <linux/dynamic_queue_limits.h>
|
|
#include <linux/compiler.h>
|
|
#include <linux/export.h>
|
|
#include <trace/events/napi.h>
|
|
|
|
#define POSDIFF(A, B) ((int)((A) - (B)) > 0 ? (A) - (B) : 0)
|
|
#define AFTER_EQ(A, B) ((int)((A) - (B)) >= 0)
|
|
|
|
static void dql_check_stall(struct dql *dql)
|
|
{
|
|
unsigned short stall_thrs;
|
|
unsigned long now;
|
|
|
|
stall_thrs = READ_ONCE(dql->stall_thrs);
|
|
if (!stall_thrs)
|
|
return;
|
|
|
|
now = jiffies;
|
|
/* Check for a potential stall */
|
|
if (time_after_eq(now, dql->last_reap + stall_thrs)) {
|
|
unsigned long hist_head, t, start, end;
|
|
|
|
/* We are trying to detect a period of at least @stall_thrs
|
|
* jiffies without any Tx completions, but during first half
|
|
* of which some Tx was posted.
|
|
*/
|
|
dqs_again:
|
|
hist_head = READ_ONCE(dql->history_head);
|
|
/* pairs with smp_wmb() in dql_queued() */
|
|
smp_rmb();
|
|
|
|
/* Get the previous entry in the ring buffer, which is the
|
|
* oldest sample.
|
|
*/
|
|
start = (hist_head - DQL_HIST_LEN + 1) * BITS_PER_LONG;
|
|
|
|
/* Advance start to continue from the last reap time */
|
|
if (time_before(start, dql->last_reap + 1))
|
|
start = dql->last_reap + 1;
|
|
|
|
/* Newest sample we should have already seen a completion for */
|
|
end = hist_head * BITS_PER_LONG + (BITS_PER_LONG - 1);
|
|
|
|
/* Shrink the search space to [start, (now - start_thrs/2)] if
|
|
* `end` is beyond the stall zone
|
|
*/
|
|
if (time_before(now, end + stall_thrs / 2))
|
|
end = now - stall_thrs / 2;
|
|
|
|
/* Search for the queued time in [t, end] */
|
|
for (t = start; time_before_eq(t, end); t++)
|
|
if (test_bit(t % (DQL_HIST_LEN * BITS_PER_LONG),
|
|
dql->history))
|
|
break;
|
|
|
|
/* Variable t contains the time of the queue */
|
|
if (!time_before_eq(t, end))
|
|
goto no_stall;
|
|
|
|
/* The ring buffer was modified in the meantime, retry */
|
|
if (hist_head != READ_ONCE(dql->history_head))
|
|
goto dqs_again;
|
|
|
|
dql->stall_cnt++;
|
|
dql->stall_max = max_t(unsigned short, dql->stall_max, now - t);
|
|
|
|
trace_dql_stall_detected(dql->stall_thrs, now - t,
|
|
dql->last_reap, dql->history_head,
|
|
now, dql->history);
|
|
}
|
|
no_stall:
|
|
dql->last_reap = now;
|
|
}
|
|
|
|
/* Records completed count and recalculates the queue limit */
|
|
void dql_completed(struct dql *dql, unsigned int count)
|
|
{
|
|
unsigned int inprogress, prev_inprogress, limit;
|
|
unsigned int ovlimit, completed, num_queued;
|
|
bool all_prev_completed;
|
|
|
|
num_queued = READ_ONCE(dql->num_queued);
|
|
|
|
/* Can't complete more than what's in queue */
|
|
BUG_ON(count > num_queued - dql->num_completed);
|
|
|
|
completed = dql->num_completed + count;
|
|
limit = dql->limit;
|
|
ovlimit = POSDIFF(num_queued - dql->num_completed, limit);
|
|
inprogress = num_queued - completed;
|
|
prev_inprogress = dql->prev_num_queued - dql->num_completed;
|
|
all_prev_completed = AFTER_EQ(completed, dql->prev_num_queued);
|
|
|
|
if ((ovlimit && !inprogress) ||
|
|
(dql->prev_ovlimit && all_prev_completed)) {
|
|
/*
|
|
* Queue considered starved if:
|
|
* - The queue was over-limit in the last interval,
|
|
* and there is no more data in the queue.
|
|
* OR
|
|
* - The queue was over-limit in the previous interval and
|
|
* when enqueuing it was possible that all queued data
|
|
* had been consumed. This covers the case when queue
|
|
* may have becomes starved between completion processing
|
|
* running and next time enqueue was scheduled.
|
|
*
|
|
* When queue is starved increase the limit by the amount
|
|
* of bytes both sent and completed in the last interval,
|
|
* plus any previous over-limit.
|
|
*/
|
|
limit += POSDIFF(completed, dql->prev_num_queued) +
|
|
dql->prev_ovlimit;
|
|
dql->slack_start_time = jiffies;
|
|
dql->lowest_slack = UINT_MAX;
|
|
} else if (inprogress && prev_inprogress && !all_prev_completed) {
|
|
/*
|
|
* Queue was not starved, check if the limit can be decreased.
|
|
* A decrease is only considered if the queue has been busy in
|
|
* the whole interval (the check above).
|
|
*
|
|
* If there is slack, the amount of excess data queued above
|
|
* the amount needed to prevent starvation, the queue limit
|
|
* can be decreased. To avoid hysteresis we consider the
|
|
* minimum amount of slack found over several iterations of the
|
|
* completion routine.
|
|
*/
|
|
unsigned int slack, slack_last_objs;
|
|
|
|
/*
|
|
* Slack is the maximum of
|
|
* - The queue limit plus previous over-limit minus twice
|
|
* the number of objects completed. Note that two times
|
|
* number of completed bytes is a basis for an upper bound
|
|
* of the limit.
|
|
* - Portion of objects in the last queuing operation that
|
|
* was not part of non-zero previous over-limit. That is
|
|
* "round down" by non-overlimit portion of the last
|
|
* queueing operation.
|
|
*/
|
|
slack = POSDIFF(limit + dql->prev_ovlimit,
|
|
2 * (completed - dql->num_completed));
|
|
slack_last_objs = dql->prev_ovlimit ?
|
|
POSDIFF(dql->prev_last_obj_cnt, dql->prev_ovlimit) : 0;
|
|
|
|
slack = max(slack, slack_last_objs);
|
|
|
|
if (slack < dql->lowest_slack)
|
|
dql->lowest_slack = slack;
|
|
|
|
if (time_after(jiffies,
|
|
dql->slack_start_time + dql->slack_hold_time)) {
|
|
limit = POSDIFF(limit, dql->lowest_slack);
|
|
dql->slack_start_time = jiffies;
|
|
dql->lowest_slack = UINT_MAX;
|
|
}
|
|
}
|
|
|
|
/* Enforce bounds on limit */
|
|
limit = clamp(limit, dql->min_limit, dql->max_limit);
|
|
|
|
if (limit != dql->limit) {
|
|
dql->limit = limit;
|
|
ovlimit = 0;
|
|
}
|
|
|
|
dql->adj_limit = limit + completed;
|
|
dql->prev_ovlimit = ovlimit;
|
|
dql->prev_last_obj_cnt = dql->last_obj_cnt;
|
|
dql->num_completed = completed;
|
|
dql->prev_num_queued = num_queued;
|
|
|
|
dql_check_stall(dql);
|
|
}
|
|
EXPORT_SYMBOL(dql_completed);
|
|
|
|
void dql_reset(struct dql *dql)
|
|
{
|
|
/* Reset all dynamic values */
|
|
dql->limit = 0;
|
|
dql->num_queued = 0;
|
|
dql->num_completed = 0;
|
|
dql->last_obj_cnt = 0;
|
|
dql->prev_num_queued = 0;
|
|
dql->prev_last_obj_cnt = 0;
|
|
dql->prev_ovlimit = 0;
|
|
dql->lowest_slack = UINT_MAX;
|
|
dql->slack_start_time = jiffies;
|
|
|
|
dql->last_reap = jiffies;
|
|
dql->history_head = jiffies / BITS_PER_LONG;
|
|
memset(dql->history, 0, sizeof(dql->history));
|
|
}
|
|
EXPORT_SYMBOL(dql_reset);
|
|
|
|
void dql_init(struct dql *dql, unsigned int hold_time)
|
|
{
|
|
dql->max_limit = DQL_MAX_LIMIT;
|
|
dql->min_limit = 0;
|
|
dql->slack_hold_time = hold_time;
|
|
dql->stall_thrs = 0;
|
|
dql_reset(dql);
|
|
}
|
|
EXPORT_SYMBOL(dql_init);
|