diff --git a/Makefile b/Makefile
index 24980d4..00972bd 100644
--- a/Makefile
+++ b/Makefile
@@ -78,6 +78,8 @@ E :=
 S := $E $E
 
 SMP ?= 1
+CFLAGS += -D SEMU_SMP=$(SMP)
+CFLAGS += -D SEMU_BOOT_TARGET_TIME=10
 .PHONY: riscv-harts.dtsi
 riscv-harts.dtsi:
 	$(Q)python3 scripts/gen-hart-dts.py $@ $(SMP) $(CLOCK_FREQ)
diff --git a/riscv.c b/riscv.c
index c3fd394..bd92f1f 100644
--- a/riscv.c
+++ b/riscv.c
@@ -382,6 +382,14 @@ static void op_sret(hart_t *vm)
     vm->s_mode = vm->sstatus_spp;
     vm->sstatus_sie = vm->sstatus_spie;
 
+    /* After the booting process is complete, initrd will be loaded. At this
+     * point, the system will switch to U mode for the first time. Therefore,
+     * by checking whether the switch to U mode has already occurred, we can
+     * determine if the boot process has been completed.
+     */
+    if (!boot_complete && !vm->s_mode)
+        boot_complete = true;
+
     /* Reset stack */
     vm->sstatus_spp = false;
     vm->sstatus_spie = true;
diff --git a/utils.c b/utils.c
index 29f9575..ba24169 100644
--- a/utils.c
+++ b/utils.c
@@ -19,6 +19,9 @@
 #endif
 #endif
 
+bool boot_complete = false;
+static double scale_factor;
+
 /* Calculate "x * n / d" without unnecessary overflow or loss of precision.
  *
  * Reference:
@@ -32,35 +35,187 @@ static inline uint64_t mult_frac(uint64_t x, uint64_t n, uint64_t d)
     return q * n + r * n / d;
 }
 
-void semu_timer_init(semu_timer_t *timer, uint64_t freq)
+/* On POSIX => use clock_gettime().
+ * On macOS => use mach_absolute_time().
+ * Else => fallback to time(0) in seconds, convert to ns.
+ *
+ * Now, the POSIX/macOS logic can be clearly reused. Meanwhile, the fallback
+ * path might just do a coarser approach with time(0).
+ *
+ * Nanoseconds are computed with 64-bit integer arithmetic; a 'double' loses
+ * precision once the count exceeds 2^53 ns (roughly 104 days).
+ */
+static inline uint64_t host_time_ns(void)
 {
-    timer->freq = freq;
-    semu_timer_rebase(timer, 0);
+#if defined(HAVE_POSIX_TIMER)
+    struct timespec ts;
+    clock_gettime(CLOCKID, &ts);
+    return (uint64_t) ts.tv_sec * 1000000000ULL + (uint64_t) ts.tv_nsec;
+
+#elif defined(HAVE_MACH_TIMER)
+    static mach_timebase_info_data_t ts = {0};
+    if (ts.denom == 0)
+        (void) mach_timebase_info(&ts);
+
+    uint64_t now = mach_absolute_time();
+    /* convert to nanoseconds: (now * ts.numer / ts.denom) */
+    return mult_frac(now, ts.numer, (uint64_t) ts.denom);
+
+#else
+    /* Minimal fallback: time(0) in seconds => convert to ns. */
+    time_t now_sec = time(0);
+    return (uint64_t) now_sec * 1000000000ULL;
+#endif
 }
 
-static uint64_t semu_timer_clocksource(uint64_t freq)
+/* Measure the overhead of a high-resolution timer call, typically
+ * 'clock_gettime()' on POSIX or 'mach_absolute_time()' on macOS.
+ *
+ * 1) Times how long it takes to call 'host_time_ns()' repeatedly (iterations).
+ * 2) Derives an average overhead per call => ns_per_call.
+ * 3) Because semu_timer_clocksource is ~10% of boot overhead, and called ~2e8
+ *    times * SMP, we get predict_sec = ns_per_call * SMP * 2. Then set
+ *    'scale_factor' so the entire boot completes in SEMU_BOOT_TARGET_TIME
+ *    seconds.
+ */
+static void measure_bogomips_ns(uint64_t iterations)
 {
-#if defined(HAVE_POSIX_TIMER)
-    struct timespec t;
-    clock_gettime(CLOCKID, &t);
-    return t.tv_sec * freq + mult_frac(t.tv_nsec, freq, 1e9);
+    /* Perform 'iterations' times calling the host HRT.
+     *
+     *
+     * Assuming the cost of loop overhead is 'e' and the cost of 'host_time_ns'
+     * is 't', we perform a two-stage measurement to eliminate the loop
+     * overhead. In the first loop, 'host_time_ns' is called only once per
+     * iteration, while in the second loop, it is called twice per iteration.
+     *
+     * In this way, the cost of the first loop is 'e + t', and the cost of the
+     * second loop is 'e + 2t'. By subtracting the two, we can effectively
+     * eliminate the loop overhead.
+     *
+     * Reference:
+     * https://ates.dev/posts/2025-01-12-accurate-benchmarking/
+     */
+    const uint64_t start_ns_1 = host_time_ns();
+    for (uint64_t loops = 0; loops < iterations; loops++)
+        (void) host_time_ns();
+
+    const uint64_t end_ns_1 = host_time_ns();
+    const uint64_t elapsed_ns_1 = end_ns_1 - start_ns_1;
+
+    /* Second measurement */
+    const uint64_t start_ns_2 = host_time_ns();
+    for (uint64_t loops = 0; loops < iterations; loops++) {
+        (void) host_time_ns();
+        (void) host_time_ns();
+    }
+
+    const uint64_t end_ns_2 = host_time_ns();
+    const uint64_t elapsed_ns_2 = end_ns_2 - start_ns_2;
+
+    /* Calculate average overhead per call. Subtract in floating point so a
+     * noisy run whose second loop finished sooner cannot wrap around.
+     */
+    const double ns_per_call =
+        ((double) elapsed_ns_2 - (double) elapsed_ns_1) / (double) iterations;
+
+    /* 'semu_timer_clocksource' is called ~2e8 times per SMP. Each call's
+     * overhead ~ ns_per_call. The total overhead is ~ ns_per_call * SMP * 2e8.
+     * That overhead is about 10% of the entire boot, so effectively:
+     *   predict_sec = ns_per_call * SMP * 2e8 * (100%/10%) / 1e9
+     *               = ns_per_call * SMP * 2.0
+     * Then scale_factor = (desired_time) / (predict_sec).
+     */
+    const double predict_sec = ns_per_call * SEMU_SMP * 2.0;
+
+    /* A coarse or noisy clock (or zero iterations) gives a zero, negative
+     * or NaN estimate; fall back to unscaled time rather than dividing by it.
+     */
+    scale_factor =
+        (predict_sec > 0.0) ? SEMU_BOOT_TARGET_TIME / predict_sec : 1.0;
+}
+
+/* The main function that returns the "emulated time" in ticks.
+ *
+ * Before the boot completes, we scale time by 'scale_factor' for a "fake
+ * increments" approach. After boot completes, we switch to real time
+ * with an offset bridging so that there's no big jump.
+ */
+static uint64_t semu_timer_clocksource(semu_timer_t *timer)
+{
+    /* After boot process complete, the timer will switch to real time. Thus,
+     * there is an offset between the real time and the emulator time.
+     *
+     * After switching to real time, the correct way to update time is to
+     * calculate the increment of time. Then add it to the emulator time.
+     */
+    static int64_t offset = 0;
+    static bool first_switch = true;
+
+#if defined(HAVE_POSIX_TIMER) || defined(HAVE_MACH_TIMER)
+    uint64_t now_ns = host_time_ns();
+
+    /* real_ticks = (now_ns * freq) / 1e9 */
+    uint64_t real_ticks = mult_frac(now_ns, timer->freq, 1e9);
+
+    /* scaled_ticks = (now_ns * (freq*scale_factor)) / 1e9
+     *              = ((now_ns * freq) / 1e9) * scale_factor
+     */
+    uint64_t scaled_ticks = real_ticks * scale_factor;
+
+    if (!boot_complete)
+        return scaled_ticks; /* Return scaled ticks in the boot phase. */
+
+    /* The boot is done => switch to real freq with an offset bridging. */
+    if (first_switch) {
+        first_switch = false;
+        offset = (int64_t) (real_ticks - scaled_ticks);
+    }
+    return (uint64_t) ((int64_t) real_ticks - offset);
+
-#elif defined(HAVE_MACH_TIMER)
-    static mach_timebase_info_data_t t;
-    if (t.denom == 0)
-        (void) mach_timebase_info(&t);
-    return mult_frac(mult_frac(mach_absolute_time(), t.numer, t.denom), freq,
-                     1e9);
 #else
-    return time(0) * freq;
+    /* Because we don't rely on sub-second calls to 'host_time_ns()' here,
+     * we directly use time(0). This means the time resolution is coarse (1
+     * second), but the logic is the same: we do a scaled approach pre-boot,
+     * then real freq with an offset post-boot.
+     */
+    time_t now_sec = time(0);
+
+    /* Before boot done, scale time. */
+    if (!boot_complete)
+        return (uint64_t) now_sec * (uint64_t) (timer->freq * scale_factor);
+
+    if (first_switch) {
+        first_switch = false;
+        uint64_t real_val = (uint64_t) now_sec * (uint64_t) timer->freq;
+        uint64_t scaled_val =
+            (uint64_t) now_sec * (uint64_t) (timer->freq * scale_factor);
+        offset = (int64_t) (real_val - scaled_val);
+    }
+
+    /* Return real freq minus offset. */
+    uint64_t real_freq_val = (uint64_t) now_sec * (uint64_t) timer->freq;
+    return real_freq_val - offset;
 #endif
 }
 
+void semu_timer_init(semu_timer_t *timer, uint64_t freq)
+{
+    /* Measure how long each call to 'host_time_ns()' roughly takes,
+     * then use that to pick 'scale_factor'. For example, pass freq
+     * as the loop count or some large number to get a stable measure.
+     */
+    measure_bogomips_ns(freq);
+
+    timer->freq = freq;
+    semu_timer_rebase(timer, 0);
+}
+
 uint64_t semu_timer_get(semu_timer_t *timer)
 {
-    return semu_timer_clocksource(timer->freq) - timer->begin;
+    return semu_timer_clocksource(timer) - timer->begin;
 }
 
 void semu_timer_rebase(semu_timer_t *timer, uint64_t time)
 {
-    timer->begin = semu_timer_clocksource(timer->freq) - time;
+    timer->begin = semu_timer_clocksource(timer) - time;
 }
diff --git a/utils.h b/utils.h
index 6e03ea0..54e5f19 100644
--- a/utils.h
+++ b/utils.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <stdbool.h>
 #include <stdint.h>
 
 /* TIMER */
@@ -8,6 +9,8 @@ typedef struct {
     uint64_t freq;
 } semu_timer_t;
 
+extern bool boot_complete; /* Set once the guest first enters U mode. */
+
 void semu_timer_init(semu_timer_t *timer, uint64_t freq);
 uint64_t semu_timer_get(semu_timer_t *timer);
 void semu_timer_rebase(semu_timer_t *timer, uint64_t time);
\ No newline at end of file