forked from Minki/linux
a934d4d15f
Use lockdep to check that IRQs are enabled or disabled as expected. This way the sanity check only shows overhead when concurrency correctness debug code is enabled. Signed-off-by: Frederic Weisbecker <frederic@kernel.org> Acked-by: Thomas Gleixner <tglx@linutronix.de> Cc: David S . Miller <davem@davemloft.net> Cc: Lai Jiangshan <jiangshanlai@gmail.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Tejun Heo <tj@kernel.org> Link: http://lkml.kernel.org/r/1509980490-4285-10-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar <mingo@kernel.org>
370 lines
9.9 KiB
C
370 lines
9.9 KiB
C
/*
|
|
* linux/kernel/irq/timings.c
|
|
*
|
|
* Copyright (C) 2016, Linaro Ltd - Daniel Lezcano <daniel.lezcano@linaro.org>
|
|
*
|
|
* This program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License version 2 as
|
|
* published by the Free Software Foundation.
|
|
*
|
|
*/
|
|
#include <linux/kernel.h>
|
|
#include <linux/percpu.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/static_key.h>
|
|
#include <linux/interrupt.h>
|
|
#include <linux/idr.h>
|
|
#include <linux/irq.h>
|
|
#include <linux/math64.h>
|
|
|
|
#include <trace/events/irq.h>
|
|
|
|
#include "internals.h"
|
|
|
|
DEFINE_STATIC_KEY_FALSE(irq_timing_enabled);
|
|
|
|
DEFINE_PER_CPU(struct irq_timings, irq_timings);
|
|
|
|
struct irqt_stat {
|
|
u64 next_evt;
|
|
u64 last_ts;
|
|
u64 variance;
|
|
u32 avg;
|
|
u32 nr_samples;
|
|
int anomalies;
|
|
int valid;
|
|
};
|
|
|
|
static DEFINE_IDR(irqt_stats);
|
|
|
|
void irq_timings_enable(void)
|
|
{
|
|
static_branch_enable(&irq_timing_enabled);
|
|
}
|
|
|
|
void irq_timings_disable(void)
|
|
{
|
|
static_branch_disable(&irq_timing_enabled);
|
|
}
|
|
|
|
/**
|
|
* irqs_update - update the irq timing statistics with a new timestamp
|
|
*
|
|
* @irqs: an irqt_stat struct pointer
|
|
* @ts: the new timestamp
|
|
*
|
|
* The statistics are computed online, in other words, the code is
|
|
* designed to compute the statistics on a stream of values rather
|
|
* than doing multiple passes on the values to compute the average,
|
|
* then the variance. The integer division introduces a loss of
|
|
* precision but with an acceptable error margin regarding the results
|
|
* we would have with the double floating precision: we are dealing
|
|
* with nanosec, so big numbers, consequently the mantisse is
|
|
* negligeable, especially when converting the time in usec
|
|
* afterwards.
|
|
*
|
|
* The computation happens at idle time. When the CPU is not idle, the
|
|
* interrupts' timestamps are stored in the circular buffer, when the
|
|
* CPU goes idle and this routine is called, all the buffer's values
|
|
* are injected in the statistical model continuying to extend the
|
|
* statistics from the previous busy-idle cycle.
|
|
*
|
|
* The observations showed a device will trigger a burst of periodic
|
|
* interrupts followed by one or two peaks of longer time, for
|
|
* instance when a SD card device flushes its cache, then the periodic
|
|
* intervals occur again. A one second inactivity period resets the
|
|
* stats, that gives us the certitude the statistical values won't
|
|
* exceed 1x10^9, thus the computation won't overflow.
|
|
*
|
|
* Basically, the purpose of the algorithm is to watch the periodic
|
|
* interrupts and eliminate the peaks.
|
|
*
|
|
* An interrupt is considered periodically stable if the interval of
|
|
* its occurences follow the normal distribution, thus the values
|
|
* comply with:
|
|
*
|
|
* avg - 3 x stddev < value < avg + 3 x stddev
|
|
*
|
|
* Which can be simplified to:
|
|
*
|
|
* -3 x stddev < value - avg < 3 x stddev
|
|
*
|
|
* abs(value - avg) < 3 x stddev
|
|
*
|
|
* In order to save a costly square root computation, we use the
|
|
* variance. For the record, stddev = sqrt(variance). The equation
|
|
* above becomes:
|
|
*
|
|
* abs(value - avg) < 3 x sqrt(variance)
|
|
*
|
|
* And finally we square it:
|
|
*
|
|
* (value - avg) ^ 2 < (3 x sqrt(variance)) ^ 2
|
|
*
|
|
* (value - avg) x (value - avg) < 9 x variance
|
|
*
|
|
* Statistically speaking, any values out of this interval is
|
|
* considered as an anomaly and is discarded. However, a normal
|
|
* distribution appears when the number of samples is 30 (it is the
|
|
* rule of thumb in statistics, cf. "30 samples" on Internet). When
|
|
* there are three consecutive anomalies, the statistics are resetted.
|
|
*
|
|
*/
|
|
static void irqs_update(struct irqt_stat *irqs, u64 ts)
|
|
{
|
|
u64 old_ts = irqs->last_ts;
|
|
u64 variance = 0;
|
|
u64 interval;
|
|
s64 diff;
|
|
|
|
/*
|
|
* The timestamps are absolute time values, we need to compute
|
|
* the timing interval between two interrupts.
|
|
*/
|
|
irqs->last_ts = ts;
|
|
|
|
/*
|
|
* The interval type is u64 in order to deal with the same
|
|
* type in our computation, that prevent mindfuck issues with
|
|
* overflow, sign and division.
|
|
*/
|
|
interval = ts - old_ts;
|
|
|
|
/*
|
|
* The interrupt triggered more than one second apart, that
|
|
* ends the sequence as predictible for our purpose. In this
|
|
* case, assume we have the beginning of a sequence and the
|
|
* timestamp is the first value. As it is impossible to
|
|
* predict anything at this point, return.
|
|
*
|
|
* Note the first timestamp of the sequence will always fall
|
|
* in this test because the old_ts is zero. That is what we
|
|
* want as we need another timestamp to compute an interval.
|
|
*/
|
|
if (interval >= NSEC_PER_SEC) {
|
|
memset(irqs, 0, sizeof(*irqs));
|
|
irqs->last_ts = ts;
|
|
return;
|
|
}
|
|
|
|
/*
|
|
* Pre-compute the delta with the average as the result is
|
|
* used several times in this function.
|
|
*/
|
|
diff = interval - irqs->avg;
|
|
|
|
/*
|
|
* Increment the number of samples.
|
|
*/
|
|
irqs->nr_samples++;
|
|
|
|
/*
|
|
* Online variance divided by the number of elements if there
|
|
* is more than one sample. Normally the formula is division
|
|
* by nr_samples - 1 but we assume the number of element will be
|
|
* more than 32 and dividing by 32 instead of 31 is enough
|
|
* precise.
|
|
*/
|
|
if (likely(irqs->nr_samples > 1))
|
|
variance = irqs->variance >> IRQ_TIMINGS_SHIFT;
|
|
|
|
/*
|
|
* The rule of thumb in statistics for the normal distribution
|
|
* is having at least 30 samples in order to have the model to
|
|
* apply. Values outside the interval are considered as an
|
|
* anomaly.
|
|
*/
|
|
if ((irqs->nr_samples >= 30) && ((diff * diff) > (9 * variance))) {
|
|
/*
|
|
* After three consecutive anomalies, we reset the
|
|
* stats as it is no longer stable enough.
|
|
*/
|
|
if (irqs->anomalies++ >= 3) {
|
|
memset(irqs, 0, sizeof(*irqs));
|
|
irqs->last_ts = ts;
|
|
return;
|
|
}
|
|
} else {
|
|
/*
|
|
* The anomalies must be consecutives, so at this
|
|
* point, we reset the anomalies counter.
|
|
*/
|
|
irqs->anomalies = 0;
|
|
}
|
|
|
|
/*
|
|
* The interrupt is considered stable enough to try to predict
|
|
* the next event on it.
|
|
*/
|
|
irqs->valid = 1;
|
|
|
|
/*
|
|
* Online average algorithm:
|
|
*
|
|
* new_average = average + ((value - average) / count)
|
|
*
|
|
* The variance computation depends on the new average
|
|
* to be computed here first.
|
|
*
|
|
*/
|
|
irqs->avg = irqs->avg + (diff >> IRQ_TIMINGS_SHIFT);
|
|
|
|
/*
|
|
* Online variance algorithm:
|
|
*
|
|
* new_variance = variance + (value - average) x (value - new_average)
|
|
*
|
|
* Warning: irqs->avg is updated with the line above, hence
|
|
* 'interval - irqs->avg' is no longer equal to 'diff'
|
|
*/
|
|
irqs->variance = irqs->variance + (diff * (interval - irqs->avg));
|
|
|
|
/*
|
|
* Update the next event
|
|
*/
|
|
irqs->next_evt = ts + irqs->avg;
|
|
}
|
|
|
|
/**
|
|
* irq_timings_next_event - Return when the next event is supposed to arrive
|
|
*
|
|
* During the last busy cycle, the number of interrupts is incremented
|
|
* and stored in the irq_timings structure. This information is
|
|
* necessary to:
|
|
*
|
|
* - know if the index in the table wrapped up:
|
|
*
|
|
* If more than the array size interrupts happened during the
|
|
* last busy/idle cycle, the index wrapped up and we have to
|
|
* begin with the next element in the array which is the last one
|
|
* in the sequence, otherwise it is a the index 0.
|
|
*
|
|
* - have an indication of the interrupts activity on this CPU
|
|
* (eg. irq/sec)
|
|
*
|
|
* The values are 'consumed' after inserting in the statistical model,
|
|
* thus the count is reinitialized.
|
|
*
|
|
* The array of values **must** be browsed in the time direction, the
|
|
* timestamp must increase between an element and the next one.
|
|
*
|
|
* Returns a nanosec time based estimation of the earliest interrupt,
|
|
* U64_MAX otherwise.
|
|
*/
|
|
u64 irq_timings_next_event(u64 now)
|
|
{
|
|
struct irq_timings *irqts = this_cpu_ptr(&irq_timings);
|
|
struct irqt_stat *irqs;
|
|
struct irqt_stat __percpu *s;
|
|
u64 ts, next_evt = U64_MAX;
|
|
int i, irq = 0;
|
|
|
|
/*
|
|
* This function must be called with the local irq disabled in
|
|
* order to prevent the timings circular buffer to be updated
|
|
* while we are reading it.
|
|
*/
|
|
lockdep_assert_irqs_disabled();
|
|
|
|
/*
|
|
* Number of elements in the circular buffer: If it happens it
|
|
* was flushed before, then the number of elements could be
|
|
* smaller than IRQ_TIMINGS_SIZE, so the count is used,
|
|
* otherwise the array size is used as we wrapped. The index
|
|
* begins from zero when we did not wrap. That could be done
|
|
* in a nicer way with the proper circular array structure
|
|
* type but with the cost of extra computation in the
|
|
* interrupt handler hot path. We choose efficiency.
|
|
*
|
|
* Inject measured irq/timestamp to the statistical model
|
|
* while decrementing the counter because we consume the data
|
|
* from our circular buffer.
|
|
*/
|
|
for (i = irqts->count & IRQ_TIMINGS_MASK,
|
|
irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count);
|
|
irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) {
|
|
|
|
irq = irq_timing_decode(irqts->values[i], &ts);
|
|
|
|
s = idr_find(&irqt_stats, irq);
|
|
if (s) {
|
|
irqs = this_cpu_ptr(s);
|
|
irqs_update(irqs, ts);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Look in the list of interrupts' statistics, the earliest
|
|
* next event.
|
|
*/
|
|
idr_for_each_entry(&irqt_stats, s, i) {
|
|
|
|
irqs = this_cpu_ptr(s);
|
|
|
|
if (!irqs->valid)
|
|
continue;
|
|
|
|
if (irqs->next_evt <= now) {
|
|
irq = i;
|
|
next_evt = now;
|
|
|
|
/*
|
|
* This interrupt mustn't use in the future
|
|
* until new events occur and update the
|
|
* statistics.
|
|
*/
|
|
irqs->valid = 0;
|
|
break;
|
|
}
|
|
|
|
if (irqs->next_evt < next_evt) {
|
|
irq = i;
|
|
next_evt = irqs->next_evt;
|
|
}
|
|
}
|
|
|
|
return next_evt;
|
|
}
|
|
|
|
void irq_timings_free(int irq)
|
|
{
|
|
struct irqt_stat __percpu *s;
|
|
|
|
s = idr_find(&irqt_stats, irq);
|
|
if (s) {
|
|
free_percpu(s);
|
|
idr_remove(&irqt_stats, irq);
|
|
}
|
|
}
|
|
|
|
int irq_timings_alloc(int irq)
|
|
{
|
|
struct irqt_stat __percpu *s;
|
|
int id;
|
|
|
|
/*
|
|
* Some platforms can have the same private interrupt per cpu,
|
|
* so this function may be be called several times with the
|
|
* same interrupt number. Just bail out in case the per cpu
|
|
* stat structure is already allocated.
|
|
*/
|
|
s = idr_find(&irqt_stats, irq);
|
|
if (s)
|
|
return 0;
|
|
|
|
s = alloc_percpu(*s);
|
|
if (!s)
|
|
return -ENOMEM;
|
|
|
|
idr_preload(GFP_KERNEL);
|
|
id = idr_alloc(&irqt_stats, s, irq, irq + 1, GFP_NOWAIT);
|
|
idr_preload_end();
|
|
|
|
if (id < 0) {
|
|
free_percpu(s);
|
|
return id;
|
|
}
|
|
|
|
return 0;
|
|
}
|