watchdog/hardlockup: add a "cpu" param to watchdog_hardlockup_check()

In preparation for the buddy hardlockup detector where the CPU checking
for lockup might not be the currently running CPU, add a "cpu" parameter
to watchdog_hardlockup_check().

As part of this change, make hrtimer_interrupts an atomic_t since now the
CPU incrementing the value and the CPU reading the value might be
different.  Technically this could also be done with just READ_ONCE and
WRITE_ONCE, but atomic_t feels a little cleaner in this case.

While hrtimer_interrupts is made atomic_t, we change
hrtimer_interrupts_saved from "unsigned long" to "int".  The "int" is
needed to match the data type backing atomic_t for hrtimer_interrupts. 
Even if this changes us from 64-bits to 32-bits (which I don't think is
true for most compilers), it doesn't really matter.  All we ever do is
increment it every few seconds and compare it to an old value so 32-bits
is fine (even 16-bits would be).  The "signed" vs "unsigned" also doesn't
matter for simple equality comparisons.

hrtimer_interrupts_saved is _not_ switched to atomic_t nor even accessed
with READ_ONCE / WRITE_ONCE.  hrtimer_interrupts_saved is always
consistently accessed by the same CPU.  NOTE: with the upcoming "buddy"
detector there is one special case.  When a CPU goes offline/online then
we can change which CPU is the one to consistently access a given instance
of hrtimer_interrupts_saved.  We still can't end up with a partially
updated hrtimer_interrupts_saved, however, because we end up petting all
affected CPUs to make sure the new and old CPU can't somehow end up
reading/writing hrtimer_interrupts_saved at the same time.

Link: https://lkml.kernel.org/r/20230519101840.v5.10.I3a7d4dd8c23ac30ee0b607d77feb6646b64825c0@changeid
Signed-off-by: Douglas Anderson <dianders@chromium.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chen-Yu Tsai <wens@csie.org>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Colin Cross <ccross@android.com>
Cc: Daniel Thompson <daniel.thompson@linaro.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Guenter Roeck <groeck@chromium.org>
Cc: Ian Rogers <irogers@google.com>
Cc: Lecopzer Chen <lecopzer.chen@mediatek.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masayoshi Mizuma <msys.mizuma@gmail.com>
Cc: Matthias Kaehlcke <mka@chromium.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Pingfan Liu <kernelfans@gmail.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: "Ravi V. Shankar" <ravi.v.shankar@intel.com>
Cc: Ricardo Neri <ricardo.neri@intel.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Stephen Boyd <swboyd@chromium.org>
Cc: Sumit Garg <sumit.garg@linaro.org>
Cc: Tzung-Bi Shih <tzungbi@chromium.org>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
This commit is contained in:
Douglas Anderson 2023-05-19 10:18:34 -07:00 committed by Andrew Morton
parent 1610611aad
commit 77c12fc959
3 changed files with 34 additions and 22 deletions

View File

@ -88,7 +88,7 @@ static inline void hardlockup_detector_disable(void) {}
#endif #endif
#if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF) #if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF)
void watchdog_hardlockup_check(struct pt_regs *regs); void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs);
#endif #endif
#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) #if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)

View File

@ -87,29 +87,34 @@ __setup("nmi_watchdog=", hardlockup_panic_setup);
#if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF) #if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF)
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); static DEFINE_PER_CPU(atomic_t, hrtimer_interrupts);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static DEFINE_PER_CPU(int, hrtimer_interrupts_saved);
static DEFINE_PER_CPU(bool, watchdog_hardlockup_warned); static DEFINE_PER_CPU(bool, watchdog_hardlockup_warned);
static unsigned long watchdog_hardlockup_all_cpu_dumped; static unsigned long watchdog_hardlockup_all_cpu_dumped;
static bool is_hardlockup(void) static bool is_hardlockup(unsigned int cpu)
{ {
unsigned long hrint = __this_cpu_read(hrtimer_interrupts); int hrint = atomic_read(&per_cpu(hrtimer_interrupts, cpu));
if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint)
return true; return true;
__this_cpu_write(hrtimer_interrupts_saved, hrint); /*
* NOTE: we don't need any fancy atomic_t or READ_ONCE/WRITE_ONCE
* for hrtimer_interrupts_saved. hrtimer_interrupts_saved is
* written/read by a single CPU.
*/
per_cpu(hrtimer_interrupts_saved, cpu) = hrint;
return false; return false;
} }
static void watchdog_hardlockup_kick(void) static void watchdog_hardlockup_kick(void)
{ {
__this_cpu_inc(hrtimer_interrupts); atomic_inc(raw_cpu_ptr(&hrtimer_interrupts));
} }
void watchdog_hardlockup_check(struct pt_regs *regs) void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
{ {
/* /*
* Check for a hardlockup by making sure the CPU's timer * Check for a hardlockup by making sure the CPU's timer
@ -117,35 +122,42 @@ void watchdog_hardlockup_check(struct pt_regs *regs)
* fired multiple times before we overflow'd. If it hasn't * fired multiple times before we overflow'd. If it hasn't
* then this is a good indication the cpu is stuck * then this is a good indication the cpu is stuck
*/ */
if (is_hardlockup()) { if (is_hardlockup(cpu)) {
unsigned int this_cpu = smp_processor_id(); unsigned int this_cpu = smp_processor_id();
struct cpumask backtrace_mask = *cpu_online_mask;
/* Only print hardlockups once. */ /* Only print hardlockups once. */
if (__this_cpu_read(watchdog_hardlockup_warned)) if (per_cpu(watchdog_hardlockup_warned, cpu))
return; return;
pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", this_cpu); pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", cpu);
print_modules(); print_modules();
print_irqtrace_events(current); print_irqtrace_events(current);
if (regs) if (cpu == this_cpu) {
show_regs(regs); if (regs)
else show_regs(regs);
dump_stack(); else
dump_stack();
cpumask_clear_cpu(cpu, &backtrace_mask);
} else {
if (trigger_single_cpu_backtrace(cpu))
cpumask_clear_cpu(cpu, &backtrace_mask);
}
/* /*
* Perform all-CPU dump only once to avoid multiple hardlockups * Perform multi-CPU dump only once to avoid multiple
* generating interleaving traces * hardlockups generating interleaving traces
*/ */
if (sysctl_hardlockup_all_cpu_backtrace && if (sysctl_hardlockup_all_cpu_backtrace &&
!test_and_set_bit(0, &watchdog_hardlockup_all_cpu_dumped)) !test_and_set_bit(0, &watchdog_hardlockup_all_cpu_dumped))
trigger_allbutself_cpu_backtrace(); trigger_cpumask_backtrace(&backtrace_mask);
if (hardlockup_panic) if (hardlockup_panic)
nmi_panic(regs, "Hard LOCKUP"); nmi_panic(regs, "Hard LOCKUP");
__this_cpu_write(watchdog_hardlockup_warned, true); per_cpu(watchdog_hardlockup_warned, cpu) = true;
} else { } else {
__this_cpu_write(watchdog_hardlockup_warned, false); per_cpu(watchdog_hardlockup_warned, cpu) = false;
} }
} }

View File

@ -120,7 +120,7 @@ static void watchdog_overflow_callback(struct perf_event *event,
return; return;
} }
watchdog_hardlockup_check(regs); watchdog_hardlockup_check(smp_processor_id(), regs);
} }
static int hardlockup_detector_event_create(void) static int hardlockup_detector_event_create(void)