
Commit 1f9c6a8
Scale frequency to suppress RCU CPU stall warning
Since the emulator currently operates using sequential emulation, the execution time for the boot process is relatively long, which can result in RCU CPU stall warnings. To address this issue, there are several potential solutions:

1. Scale the frequency to slow down emulator time during the boot process, thereby eliminating RCU CPU stall warnings.
2. During the boot process, avoid using 'clock_gettime' to update ticks and instead manage the tick increment relationship manually.
3. Implement multi-threaded emulation to accelerate the emulator's execution speed.

Regarding the third option, while multi-threaded emulation can significantly accelerate execution, it cannot guarantee that this issue will not reappear as the number of cores increases in the future. A better approach is therefore to use methods 1 and 2, which let the emulator set an expected time for completing the boot process.

The advantages and disadvantages of the scale method are as follows:

Advantages:
- Simple implementation
- Effectively sets the expected boot process completion time
- Results have strong interpretability
- Emulator time can be easily mapped back to real time

Disadvantages:
- Slower execution speed

The advantages and disadvantages of the increment-ticks method are as follows:

Advantages:
- Faster execution speed
- Effectively sets the expected boot process completion time

Disadvantages:
- More complex implementation
- Some results are difficult to interpret
- Emulator time is difficult to map back to real time

Based on practical tests, the second method provides limited acceleration but introduces significant drawbacks, such as difficulty in interpreting results and the complexity of managing the increment relationship. Therefore, this commit opts for the scale-frequency method.

This commit divides time into emulator time and real time. During the boot process, the timer uses a scale factor to slow down the growth of emulator time, eliminating RCU CPU stall warnings. After the boot process is complete, the growth of emulator time aligns with real time.

Configuring the scale-factor parameter requires three pieces of information:

1. The expected completion time of the boot process
2. A reference point for estimating the boot process completion time
3. The relationship between the reference point and the number of harts (SMP)

According to "Using RCU's CPU Stall Detector" [1], the grace period for RCU CPU stalls is typically set to 21 seconds. Dividing this value by two and using the result as the expected completion time provides a sufficient buffer to absorb estimation error and avoid RCU CPU stall warnings.

Using 'gprof' for basic statistical analysis, it was found that 'semu_timer_clocksource' accounts for approximately 10% of the boot process execution time. Since the logic within 'semu_timer_clocksource' is relatively simple, its execution time can be assumed to be nearly equal to that of 'clock_gettime'. Furthermore, by adding a counter to 'semu_timer_clocksource', it was observed that each additional hart increases the number of 'semu_timer_clocksource' calls by approximately 2 * 10^8.

With this information, we can estimate the boot completion time as 'sec_per_call * SMP * 2 * 10^8 * (100% / 10%)' seconds and derive the scale factor from it. For instance, if the estimated time is 200 seconds and the target time is 10 seconds, the scale factor is '10 / 200' (see the sketch below).
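As a rough illustration of the estimate above, here is a minimal standalone sketch (not part of the commit; the 50 ns call overhead and the 4-hart configuration are assumed values chosen only for the arithmetic):

#include <stdio.h>

int main(void)
{
    /* Assumed inputs: measured HRT-call overhead and the SMP count. */
    const double ns_per_call = 50.0; /* assumed ~50 ns per clock_gettime() */
    const int n_harts = 4;           /* assumed SMP=4 */
    const double target_sec = 10.0;  /* SEMU_BOOT_TARGET_TIME */

    /* predict_sec = ns_per_call * SMP * 2e8 calls * (100% / 10%) / 1e9
     *             = ns_per_call * SMP * 2.0
     */
    const double predict_sec = ns_per_call * n_harts * 2.0;
    const double scale_factor = target_sec / predict_sec;

    /* With these assumptions: predict_sec = 400.0, scale_factor = 0.025. */
    printf("predict_sec = %.1f s, scale_factor = %.4f\n", predict_sec,
           scale_factor);
    return 0;
}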
Close #51

[1] https://docs.kernel.org/RCU/stallwarn.html#config-rcu-cpu-stall-timeout
Parent: 36fc1b2

File tree: 5 files changed, +195 -19 lines

Diff for: Makefile (+1)

@@ -78,6 +78,7 @@ E :=
 S := $E $E

 SMP ?= 1
+CFLAGS += -D SEMU_BOOT_TARGET_TIME=10
 .PHONY: riscv-harts.dtsi
 riscv-harts.dtsi:
 	$(Q)python3 scripts/gen-hart-dts.py $@ $(SMP) $(CLOCK_FREQ)

Diff for: main.c (+1 -1)

@@ -619,7 +619,7 @@ static int semu_start(int argc, char **argv)
     emu.disk = virtio_blk_init(&(emu.vblk), disk_file);
 #endif
     /* Set up ACLINT */
-    semu_timer_init(&emu.mtimer.mtime, CLOCK_FREQ);
+    semu_timer_init(&emu.mtimer.mtime, CLOCK_FREQ, hart_count);
     emu.mtimer.mtimecmp = calloc(vm.n_hart, sizeof(uint64_t));
     emu.mswi.msip = calloc(vm.n_hart, sizeof(uint32_t));
     emu.sswi.ssip = calloc(vm.n_hart, sizeof(uint32_t));

Diff for: riscv.c (+8)

@@ -382,6 +382,14 @@ static void op_sret(hart_t *vm)
     vm->s_mode = vm->sstatus_spp;
     vm->sstatus_sie = vm->sstatus_spie;

+    /* After the booting process is complete, initrd will be loaded. At this
+     * point, the system will switch to U mode for the first time. Therefore,
+     * by checking whether the switch to U mode has already occurred, we can
+     * determine if the boot process has been completed.
+     */
+    if (!boot_complete && !vm->s_mode)
+        boot_complete = true;
+
     /* Reset stack */
     vm->sstatus_spp = false;
     vm->sstatus_spie = true;

Diff for: utils.c (+173 -17)

@@ -19,6 +19,9 @@
 #endif
 #endif

+bool boot_complete = false;
+static double scale_factor;
+
 /* Calculate "x * n / d" without unnecessary overflow or loss of precision.
  *
  * Reference:
@@ -32,35 +35,188 @@ static inline uint64_t mult_frac(uint64_t x, uint64_t n, uint64_t d)
     return q * n + r * n / d;
 }

-void semu_timer_init(semu_timer_t *timer, uint64_t freq)
+/* High-precision time measurement:
+ * - POSIX systems: clock_gettime() for nanosecond precision
+ * - macOS: mach_absolute_time() with timebase conversion
+ * - Other platforms: time(0) with conversion to nanoseconds as a fallback
+ *
+ * The platform-specific timing logic is clearly separated: the POSIX and
+ * macOS implementations provide high-precision measurements, while the
+ * fallback path uses time(0) for a coarser but portable approach.
+ */
+static inline uint64_t host_time_ns()
 {
-    timer->freq = freq;
-    semu_timer_rebase(timer, 0);
+#if defined(HAVE_POSIX_TIMER)
+    struct timespec ts;
+    clock_gettime(CLOCKID, &ts);
+    return (uint64_t) ts.tv_sec * 1e9 + (uint64_t) ts.tv_nsec;
+
+#elif defined(HAVE_MACH_TIMER)
+    static mach_timebase_info_data_t ts = {0};
+    if (ts.denom == 0)
+        (void) mach_timebase_info(&ts);
+
+    uint64_t now = mach_absolute_time();
+    /* Convert to nanoseconds: (now * ts.numer / ts.denom) */
+    return mult_frac(now, ts.numer, (uint64_t) ts.denom);
+
+#else
+    /* Fallback: the non-HRT path uses time(0) in seconds => convert to ns. */
+    time_t now_sec = time(0);
+    return (uint64_t) now_sec * 1e9;
+#endif
 }

-static uint64_t semu_timer_clocksource(uint64_t freq)
+/* BogoMips is a rough measurement of CPU speed, typically calculated by
+ * executing a counting loop to estimate the CPU's performance.
+ *
+ * This function applies the BogoMips method to measure the overhead of a
+ * high-resolution timer call, typically 'clock_gettime()' on POSIX or
+ * 'mach_absolute_time()' on macOS.
+ *
+ * 1) Times how long it takes to call 'host_time_ns()' repeatedly for a given
+ *    number of iterations. This is used to derive an average overhead per
+ *    call ('ns_per_call').
+ * 2) Eliminates the loop overhead by performing two measurements:
+ *    - In the first measurement, 'host_time_ns()' is called once per
+ *      iteration.
+ *    - In the second measurement, 'host_time_ns()' is called twice per
+ *      iteration.
+ *    By subtracting the two results, the loop overhead is effectively
+ *    canceled.
+ * 3) Predicts the total time spent in 'semu_timer_clocksource' during the
+ *    boot process based on the measured overhead per call and the number of
+ *    calls made (~2e8 times * SMP). This allows scaling the emulator clock
+ *    to meet the target boot time ('SEMU_BOOT_TARGET_TIME').
+ */
+static void measure_bogomips_ns(uint64_t iterations, int n_harts)
 {
-#if defined(HAVE_POSIX_TIMER)
-    struct timespec t;
-    clock_gettime(CLOCKID, &t);
-    return t.tv_sec * freq + mult_frac(t.tv_nsec, freq, 1e9);
+    /* Call the host HRT 'iterations' times.
+     *
+     * Assuming the cost of the loop overhead is 'e' and the cost of
+     * 'host_time_ns' is 't', we perform a two-stage measurement to eliminate
+     * the loop overhead. In the first loop, 'host_time_ns' is called only
+     * once per iteration, while in the second loop, it is called twice per
+     * iteration.
+     *
+     * In this way, the cost of the first loop is 'e + t', and the cost of
+     * the second loop is 'e + 2t'. By subtracting the two, we can
+     * effectively eliminate the loop overhead.
+     *
+     * Reference:
+     * https://ates.dev/posts/2025-01-12-accurate-benchmarking/
+     */
+    const uint64_t start1_ns = host_time_ns();
+    for (uint64_t loops = 0; loops < iterations; loops++)
+        (void) host_time_ns();
+
+    const uint64_t end1_ns = host_time_ns();
+    const uint64_t elapsed1_ns = end1_ns - start1_ns;
+
+    /* Second measurement */
+    const uint64_t start2_ns = host_time_ns();
+    for (uint64_t loops = 0; loops < iterations; loops++) {
+        (void) host_time_ns();
+        (void) host_time_ns();
+    }
+
+    const uint64_t end2_ns = host_time_ns();
+    const uint64_t elapsed2_ns = end2_ns - start2_ns;
+
+    /* Calculate the average overhead per call. */
+    const double ns_per_call =
+        (double) (elapsed2_ns - elapsed1_ns) / (double) iterations;
+
+    /* 'semu_timer_clocksource' is called ~2e8 times per SMP, and each call
+     * costs ~ns_per_call, so the total overhead is ~ns_per_call * SMP * 2e8.
+     * That overhead is about 10~40% of the entire boot; we take the minimum
+     * (10%) here for more fault tolerance. Thus, effectively:
+     *   predict_sec = ns_per_call * SMP * 2e8 * (100% / 10%) / 1e9
+     *               = ns_per_call * SMP * 2.0
+     * Then scale_factor = desired_time / predict_sec.
+     */
+    const double predict_sec = ns_per_call * n_harts * 2.0;
+    scale_factor = SEMU_BOOT_TARGET_TIME / predict_sec;
+}
+
+/* The function that returns the "emulated time" in ticks.
+ *
+ * Before the boot completes, we scale time by 'scale_factor' for a
+ * "fake increments" approach. After the boot completes, we switch to real
+ * time with offset bridging so that there is no big jump.
+ */
+static uint64_t semu_timer_clocksource(semu_timer_t *timer)
+{
+    /* After the boot process completes, the timer switches to real time, so
+     * there is an offset between real time and emulator time.
+     *
+     * After switching to real time, the correct way to update time is to
+     * calculate the increment of time and then add it to the emulator time.
+     */
+    static int64_t offset = 0;
+    static bool first_switch = true;
+
+#if defined(HAVE_POSIX_TIMER) || defined(HAVE_MACH_TIMER)
+    uint64_t now_ns = host_time_ns();
+
+    /* real_ticks = (now_ns * freq) / 1e9 */
+    uint64_t real_ticks = mult_frac(now_ns, timer->freq, 1e9);
+
+    /* scaled_ticks = (now_ns * (freq * scale_factor)) / 1e9
+     *              = ((now_ns * freq) / 1e9) * scale_factor
+     */
+    uint64_t scaled_ticks = real_ticks * scale_factor;
+
+    if (!boot_complete)
+        return scaled_ticks; /* Return scaled ticks in the boot phase. */
+
+    /* The boot is done => switch to the real frequency with offset
+     * bridging. */
+    if (first_switch) {
+        first_switch = false;
+        offset = (int64_t) (real_ticks - scaled_ticks);
+    }
+    return (uint64_t) ((int64_t) real_ticks - offset);
+
 #elif defined(HAVE_MACH_TIMER)
-    static mach_timebase_info_data_t t;
-    if (t.denom == 0)
-        (void) mach_timebase_info(&t);
-    return mult_frac(mult_frac(mach_absolute_time(), t.numer, t.denom), freq,
-                     1e9);
-#else
-    return time(0) * freq;
+    /* Because we don't rely on sub-second calls to 'host_time_ns()' here, we
+     * directly use time(0). This means the time resolution is coarse (1
+     * second), but the logic is the same: we use the scaled approach
+     * pre-boot, then the real frequency with an offset post-boot.
+     */
+    time_t now_sec = time(0);
+
+    /* Before the boot is done, scale time. */
+    if (!boot_complete)
+        return (uint64_t) now_sec * (uint64_t) (timer->freq * scale_factor);
+
+    if (first_switch) {
+        first_switch = false;
+        uint64_t real_val = (uint64_t) now_sec * (uint64_t) timer->freq;
+        uint64_t scaled_val =
+            (uint64_t) now_sec * (uint64_t) (timer->freq * scale_factor);
+        offset = (int64_t) (real_val - scaled_val);
+    }
+
+    /* Return the real-frequency value minus the offset. */
+    uint64_t real_freq_val = (uint64_t) now_sec * (uint64_t) timer->freq;
+    return real_freq_val - offset;
 #endif
 }

+void semu_timer_init(semu_timer_t *timer, uint64_t freq, int n_harts)
+{
+    /* Measure how long each call to 'host_time_ns()' roughly takes, then use
+     * that to pick 'scale_factor'. For example, pass 'freq' as the loop
+     * count or some other large number to get a stable measurement.
+     */
+    measure_bogomips_ns(freq, n_harts);
+    timer->freq = freq;
+    semu_timer_rebase(timer, 0);
+}
+
 uint64_t semu_timer_get(semu_timer_t *timer)
 {
-    return semu_timer_clocksource(timer->freq) - timer->begin;
+    return semu_timer_clocksource(timer) - timer->begin;
 }

 void semu_timer_rebase(semu_timer_t *timer, uint64_t time)
 {
-    timer->begin = semu_timer_clocksource(timer->freq) - time;
+    timer->begin = semu_timer_clocksource(timer) - time;
 }
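For orientation (not part of the commit), here is a minimal usage sketch of the revised timer API, mirroring the main.c call site above; the CLOCK_FREQ fallback value and the 'timer_usage_sketch'/'hart_count' names are assumptions for illustration:

#include "utils.h"

#ifndef CLOCK_FREQ
#define CLOCK_FREQ 65000000 /* assumed default; the real value comes from the build */
#endif

void timer_usage_sketch(int hart_count)
{
    semu_timer_t mtime;

    /* Calibrates 'scale_factor' via measure_bogomips_ns(), then rebases the
     * timer so reads start from zero. */
    semu_timer_init(&mtime, CLOCK_FREQ, hart_count);

    /* Pre-boot this returns scaled ticks; once boot_complete is set, it
     * returns real ticks minus a one-time offset. If the scaled clock reads
     * S and the real clock reads R at the switch, later reads yield
     * S + (R' - R): time continues from S and advances at the real rate. */
    uint64_t ticks = semu_timer_get(&mtime);
    (void) ticks;
}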

Diff for: utils.h (+12 -1)

@@ -1,13 +1,24 @@
 #pragma once

+#include <stdbool.h>
 #include <stdint.h>

+/* To suppress RCU CPU stall warnings, the emulator provides a scaled time to
+ * the Guest OS during the boot process. After the boot process is complete,
+ * the scaling is disabled to achieve a real-time timer.
+ *
+ * Since the Guest OS transitions to U mode for the first time when it loads
+ * the initial user-mode process, we use this transition to determine whether
+ * the boot process has completed.
+ */
+extern bool boot_complete;
+
 /* TIMER */
 typedef struct {
     uint64_t begin;
     uint64_t freq;
 } semu_timer_t;

-void semu_timer_init(semu_timer_t *timer, uint64_t freq);
+void semu_timer_init(semu_timer_t *timer, uint64_t freq, int n_harts);
 uint64_t semu_timer_get(semu_timer_t *timer);
 void semu_timer_rebase(semu_timer_t *timer, uint64_t time);
