From 90ada0f71364c0c858098713f68927e35a2c005c Mon Sep 17 00:00:00 2001 From: Kornilios Kourtis Date: Fri, 12 Aug 2022 08:46:16 +0200 Subject: [PATCH] cmd/tetragon-vmtests-run: buffer qemu output We 've been seeing RCU stalls such as, when running qemu in GH: Running test pkg.sensors.test.TestSensorLseekLoad .[ 116.892213] rcu: INFO: rcu_sched self-detected stall on CPU [ 116.892213] rcu: 0-...!: (20987 ticks this GP) idle=d3e/1/0x4000000000000002 softirq=23120/23120 fqs=0 [ 116.892213] (t=21004 jiffies g=49257 q=8) [ 116.892213] rcu: rcu_sched kthread starved for 21004 jiffies! g49257 f0x0 RCU_GP_WAIT_FQS(5) ->state=0x0 ->cpu=0 [ 116.892213] rcu: RCU grace-period kthread stack dump: [ 116.892213] rcu_sched R running task 14920 11 2 0x90004000 [ 116.892213] Call Trace: [ 116.892213] __schedule+0x288/0x600 [ 116.892213] ? __mod_timer+0x1a6/0x3c0 [ 116.892213] schedule+0x34/0xa0 [ 116.892213] schedule_timeout+0x84/0x140 [ 116.892213] ? __next_timer_interrupt+0xc0/0xc0 [ 116.892213] rcu_gp_kthread+0x4f6/0xd40 [ 116.892213] ? kfree_call_rcu+0x10/0x10 [ 116.892213] kthread+0x107/0x120 [ 116.892213] ? __kthread_bind_mask+0x60/0x60 [ 116.892213] ret_from_fork+0x35/0x40 [ 116.892213] NMI backtrace for cpu 0 [ 116.892213] CPU: 0 PID: 413 Comm: pkg.sensors.tes Not tainted 5.4.209 #1 [ 116.892213] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 [ 116.892213] Call Trace: [ 116.892213] [ 116.892213] dump_stack+0x50/0x63 [ 116.892213] nmi_cpu_backtrace.cold+0x13/0x50 [ 116.892213] ? lapic_can_unplug_cpu+0x60/0x60 [ 116.892213] nmi_trigger_cpumask_backtrace+0x7c/0x90 [ 116.892213] rcu_dump_cpu_stacks+0x7c/0xaa [ 116.892213] rcu_sched_clock_irq.cold+0x1b3/0x39e [ 116.892213] ? can_stop_idle_tick+0x70/0x70 [ 116.892213] update_process_times+0x56/0x90 [ 116.892213] tick_sched_handle+0x2f/0x40 [ 116.892213] tick_sched_timer+0x4b/0xb0 [ 116.892213] __hrtimer_run_queues+0x127/0x2a0 [ 116.892213] hrtimer_interrupt+0xf0/0x280 [ 116.892213] smp_apic_timer_interrupt+0x5d/0x120 [ 116.892213] apic_timer_interrupt+0xf/0x20 [ 116.892213] ... repeted until timout ... From reading https://www.kernel.org/doc/Documentation/RCU/stallwarn.txt, one of my theories is that writes to the console get delayed and the kernel enters some weird livelock state. This patch buffers qemu output aiming to avoid hitting RCU stalls such as the one above. Signed-off-by: Kornilios Kourtis --- cmd/tetragon-vmtests-run/run_tests.go | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/cmd/tetragon-vmtests-run/run_tests.go b/cmd/tetragon-vmtests-run/run_tests.go index dbcc3f0feaa..c65502f57a1 100644 --- a/cmd/tetragon-vmtests-run/run_tests.go +++ b/cmd/tetragon-vmtests-run/run_tests.go @@ -1,6 +1,7 @@ package main import ( + "bufio" "context" "encoding/json" "fmt" @@ -28,11 +29,17 @@ func runTests( ctx, cancel := signal.NotifyContext(ctx, unix.SIGINT, unix.SIGTERM) defer cancel() qemuCmd := exec.CommandContext(ctx, qemuBin, qemuArgs...) - qemuCmd.Stdout = os.Stdout - qemuCmd.Stderr = os.Stderr + + // buffer output from qemu's stdout/stderr to avoid delays + bout := bufio.NewWriter(os.Stdout) + berr := bufio.NewWriter(os.Stderr) + qemuCmd.Stdout = bout + qemuCmd.Stderr = berr if err := qemuCmd.Run(); err != nil { return nil, err } + bout.Flush() + berr.Flush() fmt.Printf("results directory: %s\n", rcnf.testerConf.ResultsDir) resFile := filepath.Join(rcnf.testerConf.ResultsDir, "results.json")