Skip to content

Commit

Permalink
kernel/watchdog.c: perform all-CPU backtrace in case of hard lockup
Browse files Browse the repository at this point in the history
In many cases of hardlockup reports, it's actually not possible to know
why it triggered, because the CPU that got stuck is usually waiting on a
resource (with IRQs disabled) in posession of some other CPU is holding.

IOW, we are often looking at the stacktrace of the victim and not the
actual offender.

Introduce sysctl / cmdline parameter that makes it possible to have
hardlockup detector perform all-CPU backtrace.

Signed-off-by: Jiri Kosina <[email protected]>
Reviewed-by: Aaron Tomlin <[email protected]>
Cc: Ulrich Obergfell <[email protected]>
Acked-by: Don Zickus <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
  • Loading branch information
Jiri Kosina authored and torvalds committed Nov 6, 2015
1 parent ee7fed5 commit 5553787
Show file tree
Hide file tree
Showing 5 changed files with 55 additions and 5 deletions.
5 changes: 5 additions & 0 deletions Documentation/kernel-parameters.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1269,6 +1269,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
Format: <unsigned int> such that (rxsize & ~0x1fffc0) == 0.
Default: 1024

hardlockup_all_cpu_backtrace=
[KNL] Should the hard-lockup detector generate
backtraces on all cpus.
Format: <integer>

hashdist= [KNL,NUMA] Large hashes allocated during boot
are distributed across NUMA nodes. Defaults on
for 64-bit NUMA, off otherwise.
Expand Down
12 changes: 12 additions & 0 deletions Documentation/sysctl/kernel.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ show up in /proc/sys/kernel:
- domainname
- hostname
- hotplug
- hardlockup_all_cpu_backtrace
- hung_task_panic
- hung_task_check_count
- hung_task_timeout_secs
Expand Down Expand Up @@ -292,6 +293,17 @@ Information Service) or YP (Yellow Pages) domainname. These two
domain names are in general different. For a detailed discussion
see the hostname(1) man page.

==============================================================
hardlockup_all_cpu_backtrace:

This value controls the hard lockup detector behavior when a hard
lockup condition is detected as to whether or not to gather further
debug information. If enabled, arch-specific all-CPU stack dumping
will be initiated.

0: do nothing. This is the default behavior.

1: on detection capture more debug information.
==============================================================

hotplug:
Expand Down
1 change: 1 addition & 0 deletions include/linux/nmi.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ extern int watchdog_user_enabled;
extern int watchdog_thresh;
extern unsigned long *watchdog_cpumask_bits;
extern int sysctl_softlockup_all_cpu_backtrace;
extern int sysctl_hardlockup_all_cpu_backtrace;
struct ctl_table;
extern int proc_watchdog(struct ctl_table *, int ,
void __user *, size_t *, loff_t *);
Expand Down
9 changes: 9 additions & 0 deletions kernel/sysctl.c
Original file line number Diff line number Diff line change
Expand Up @@ -898,6 +898,15 @@ static struct ctl_table kern_table[] = {
.extra1 = &zero,
.extra2 = &one,
},
{
.procname = "hardlockup_all_cpu_backtrace",
.data = &sysctl_hardlockup_all_cpu_backtrace,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
.extra2 = &one,
},
#endif /* CONFIG_SMP */
#endif
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
Expand Down
33 changes: 28 additions & 5 deletions kernel/watchdog.c
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,10 @@ int __read_mostly watchdog_thresh = 10;

#ifdef CONFIG_SMP
int __read_mostly sysctl_softlockup_all_cpu_backtrace;
int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
#else
#define sysctl_softlockup_all_cpu_backtrace 0
#define sysctl_hardlockup_all_cpu_backtrace 0
#endif
static struct cpumask watchdog_cpumask __read_mostly;
unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
Expand Down Expand Up @@ -112,6 +114,7 @@ static unsigned long soft_lockup_nmi_warn;
#ifdef CONFIG_HARDLOCKUP_DETECTOR
static int hardlockup_panic =
CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
static unsigned long hardlockup_allcpu_dumped;
/*
* We may not want to enable hard lockup detection by default in all cases,
* for example when running the kernel as a guest on a hypervisor. In these
Expand Down Expand Up @@ -173,6 +176,13 @@ static int __init softlockup_all_cpu_backtrace_setup(char *str)
return 1;
}
__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
static int __init hardlockup_all_cpu_backtrace_setup(char *str)
{
sysctl_hardlockup_all_cpu_backtrace =
!!simple_strtol(str, NULL, 0);
return 1;
}
__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
#endif

/*
Expand Down Expand Up @@ -318,17 +328,30 @@ static void watchdog_overflow_callback(struct perf_event *event,
*/
if (is_hardlockup()) {
int this_cpu = smp_processor_id();
struct pt_regs *regs = get_irq_regs();

/* only print hardlockups once */
if (__this_cpu_read(hard_watchdog_warn) == true)
return;

if (hardlockup_panic)
panic("Watchdog detected hard LOCKUP on cpu %d",
this_cpu);
pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
print_modules();
print_irqtrace_events(current);
if (regs)
show_regs(regs);
else
WARN(1, "Watchdog detected hard LOCKUP on cpu %d",
this_cpu);
dump_stack();

/*
* Perform all-CPU dump only once to avoid multiple hardlockups
* generating interleaving traces
*/
if (sysctl_hardlockup_all_cpu_backtrace &&
!test_and_set_bit(0, &hardlockup_allcpu_dumped))
trigger_allbutself_cpu_backtrace();

if (hardlockup_panic)
panic("Hard LOCKUP");

__this_cpu_write(hard_watchdog_warn, true);
return;
Expand Down

0 comments on commit 5553787

Please sign in to comment.