From 9e94aa49eed3591e7c53531af0a9722a32a2a6d4 Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Thu, 4 Dec 2025 03:54:27 +0000 Subject: [PATCH 1/9] threads: Implement asymmetric atomic fences Asymmetric atomic fences are a performance optimization of regular atomic fences (the seq_cst version of which we expose as `Base.Threads.atomic_fence`). The problem with these regular fences is that they require a CPU fence instruction, which can be very expensive and is thus unsuitable for code in the hot path. Asymmetric fences, on the other hand, split an ordinary fence into two: a `light` side where the fence is extremely cheap (only a compiler reordering barrier) and a `heavy` side where the fence is very expensive. The way it works is that the heavy side does a system call that issues an inter-processor interrupt (IPI), which then issues the appropriate barrier instruction on the other CPU (i.e. both CPUs will have issued a barrier instruction, one of them just does it asynchronously due to the interrupt). The `light` and `heavy` naming here is taken from C++ P1202R5 [1], which is the proposal for the same feature in the C++ standard library (to appear in the next iteration of the C++ concurrency spec). On the Julia side, these functions are exposed as `Threads.atomic_fence_light` and `Threads.atomic_fence_heavy`. The light side lowers to `fence singlethread` in LLVM IR (the `Core.Intrinsics.atomic_fence` intrinsic is adjusted appropriately to facilitate this). The heavy side has OS-specific implementations, where: 1. Linux/FreeBSD try to use the `membarrier` syscall or a fallback to `mprotect` for systems that don't have it. 2. Windows uses the `FlushProcessWriteBuffers` API. 3. macOS uses an implementation from the dotnet runtime (https://github.com/dotnet/runtime/pull/44670), which the dotnet folks have confirmed with Apple does the right thing by happenstance (i.e. an IPI/memory barrier is needed to execute the syscall), but looks a little nonsensical by itself. However, since it's what Apple recommended to dotnet, I don't see much risk here, though I wouldn't be surprised if Apple added a proper syscall for this in the future (since FreeBSD has it now). Note that unlike the C++ spec, I have specified that `atomic_fence_heavy` does synchronize with `atomic_fence`. This matches the underlying system call. I suspect C++ chose to omit this for a hypothetical future architecture that has instruction support for doing this from userspace that would then not synchronize with ordinary barriers, but I think I would rather cross that bridge when we get there. I intend to use this in #60281, but it's an independently useful feature.
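To make the intended usage concrete, here is a minimal sketch (illustration only, not part of this patch) of the classic asymmetric handshake: the frequently-executed side pays only for the light fence, while the rare side pays for the heavy one. The `Flags` struct and its field names are made up for the example; only `Threads.atomic_fence_light`/`Threads.atomic_fence_heavy` come from this patch.

```julia
# Illustrative sketch only: a Dekker-style handshake between a hot loop and a
# rarely-executed coordinator. The struct and field names are hypothetical.
mutable struct Flags
    @atomic hot_active::Bool      # set by the hot side around its critical work
    @atomic stop_requested::Bool  # set (rarely) by the coordinator
end

const flags = Flags(false, false)

function hot_path_iteration()          # runs very often; must stay cheap
    @atomic :monotonic flags.hot_active = true
    Threads.atomic_fence_light()       # compiler-only barrier on the hot side
    stop = @atomic :monotonic flags.stop_requested
    if stop
        @atomic :monotonic flags.hot_active = false
        return false                   # coordinator asked us to stop
    end
    # ... do the actual hot work ...
    @atomic :monotonic flags.hot_active = false
    return true
end

function request_stop()                # runs rarely; may pay for a syscall/IPI
    @atomic :monotonic flags.stop_requested = true
    Threads.atomic_fence_heavy()       # pairs with the light fences above
    # Dekker-style guarantee: it cannot happen that this load misses `hot_active`
    # while a concurrent hot iteration also missed `stop_requested`.
    active = @atomic :monotonic flags.hot_active
    return active
end
```

This mirrors the `test_asymmetric_fence` test added below, just with named flags instead of per-iteration memory.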
[1] https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/p1202r5.pdf --- Compiler/src/tfuncs.jl | 4 +- base/asyncevent.jl | 2 +- base/atomics.jl | 26 +++++++- src/ast.c | 2 + src/intrinsics.cpp | 15 ++++- src/intrinsics.h | 2 +- src/jl_exported_funcs.inc | 1 + src/julia_internal.h | 4 +- src/runtime_intrinsics.c | 9 ++- src/signals-mach.c | 52 ++++++++++++++++ src/signals-unix.c | 125 ++++++++++++++++++++++++++++++++++++++ src/signals-win.c | 4 ++ test/intrinsics.jl | 12 ++-- test/threads_exec.jl | 48 +++++++++++++++ 14 files changed, 290 insertions(+), 16 deletions(-) diff --git a/Compiler/src/tfuncs.jl b/Compiler/src/tfuncs.jl index 3c23974c88920..2656c7bb1ee1a 100644 --- a/Compiler/src/tfuncs.jl +++ b/Compiler/src/tfuncs.jl @@ -716,7 +716,7 @@ end @nospecs function pointerset_tfunc(๐•ƒ::AbstractLattice, a, v, i, align) return a end -@nospecs function atomic_fence_tfunc(๐•ƒ::AbstractLattice, order) +@nospecs function atomic_fence_tfunc(๐•ƒ::AbstractLattice, order, syncscope) return Nothing end @nospecs function atomic_pointerref_tfunc(๐•ƒ::AbstractLattice, a, order) @@ -757,7 +757,7 @@ add_tfunc(add_ptr, 2, 2, pointerarith_tfunc, 1) add_tfunc(sub_ptr, 2, 2, pointerarith_tfunc, 1) add_tfunc(pointerref, 3, 3, pointerref_tfunc, 4) add_tfunc(pointerset, 4, 4, pointerset_tfunc, 5) -add_tfunc(atomic_fence, 1, 1, atomic_fence_tfunc, 4) +add_tfunc(atomic_fence, 2, 2, atomic_fence_tfunc, 4) add_tfunc(atomic_pointerref, 2, 2, atomic_pointerref_tfunc, 4) add_tfunc(atomic_pointerset, 3, 3, atomic_pointerset_tfunc, 5) add_tfunc(atomic_pointerswap, 3, 3, atomic_pointerswap_tfunc, 5) diff --git a/base/asyncevent.jl b/base/asyncevent.jl index a4a82b4aba120..68ae27049adc0 100644 --- a/base/asyncevent.jl +++ b/base/asyncevent.jl @@ -165,7 +165,7 @@ function _trywait(t::Union{Timer, AsyncCondition}) set = t.set if set # full barrier now for AsyncCondition - t isa Timer || Core.Intrinsics.atomic_fence(:acquire_release) + t isa Timer || Core.Intrinsics.atomic_fence(:acquire_release, :system) else if !isopen(t) set = t.set diff --git a/base/atomics.jl b/base/atomics.jl index 432c9120939ac..155cb38d6d03b 100644 --- a/base/atomics.jl +++ b/base/atomics.jl @@ -329,4 +329,28 @@ fences should not be necessary in most cases. For further details, see LLVM's `fence` instruction. """ -atomic_fence() = Core.Intrinsics.atomic_fence(:sequentially_consistent) +atomic_fence() = Core.Intrinsics.atomic_fence(:sequentially_consistent, :system) + +""" + Threads.atomic_fence_light() + +This is a read-optimized sequential-consistency memory fence. +On supported operating systems and architectures, this fence is cheaper +than `Threads.atomic_fence()`, but synchronizes only with +[`atomic_fence_heavy`](@ref) calls from other threads. +""" +atomic_fence_light() = Core.Intrinsics.atomic_fence(:sequentially_consistent, :singlethread) + +""" + Threads.atomic_fence_heavy() + +This is a write-optimized sequential-consistency memory fence. +This fence is significantly more expensive than `Threads.atomic_fence`. +It generally requires a system call and a full interprocessor interrupt +to all other processors in the system. It synchronizes with both +[`atomic_fence_light`](@ref) and [`atomic_fence`](@ref) calls from other threads. + +For further details, see the Linux `membarrier` syscall or the Windows +`FlushProcessWriteBuffers` API. 
+""" +atomic_fence_heavy() = ccall(:jl_membarrier, Cvoid, ()) diff --git a/src/ast.c b/src/ast.c index d6e3893751c9f..f513ef34b01eb 100644 --- a/src/ast.c +++ b/src/ast.c @@ -319,6 +319,8 @@ void jl_init_common_symbols(void) jl_atomic_sym = jl_symbol("atomic"); jl_not_atomic_sym = jl_symbol("not_atomic"); jl_unordered_sym = jl_symbol("unordered"); + jl_singlethread_sym = jl_symbol("singlethread"); + jl_system_sym = jl_symbol("system"); jl_monotonic_sym = jl_symbol("monotonic"); jl_acquire_sym = jl_symbol("acquire"); jl_release_sym = jl_symbol("release"); diff --git a/src/intrinsics.cpp b/src/intrinsics.cpp index ae25c3cc83ca5..4791d439e2ad0 100644 --- a/src/intrinsics.cpp +++ b/src/intrinsics.cpp @@ -915,6 +915,15 @@ static jl_cgval_t emit_pointerarith(jl_codectx_t &ctx, intrinsic f, static jl_cgval_t emit_atomicfence(jl_codectx_t &ctx, ArrayRef argv) { const jl_cgval_t &ord = argv[0]; + const jl_cgval_t &ssid_arg = argv[1]; + llvm::SyncScope::ID ssid = llvm::SyncScope::System; + if (!ssid_arg.constant || !jl_is_symbol(ssid_arg.constant) || + ((jl_sym_t*)ssid_arg.constant != jl_singlethread_sym && + (jl_sym_t*)ssid_arg.constant != jl_system_sym)) { + return emit_runtime_call(ctx, atomic_fence, argv, 2); + } + if ((jl_sym_t*)ssid_arg.constant == jl_singlethread_sym) + ssid = llvm::SyncScope::SingleThread; if (ord.constant && jl_is_symbol(ord.constant)) { enum jl_memory_order order = jl_get_atomic_order((jl_sym_t*)ord.constant, true, true); if (order == jl_memory_order_invalid) { @@ -922,10 +931,10 @@ static jl_cgval_t emit_atomicfence(jl_codectx_t &ctx, ArrayRef argv) return jl_cgval_t(); // unreachable } if (order > jl_memory_order_monotonic) - ctx.builder.CreateFence(get_llvm_atomic_order(order)); + ctx.builder.CreateFence(get_llvm_atomic_order(order), ssid); return ghostValue(ctx, jl_nothing_type); } - return emit_runtime_call(ctx, atomic_fence, argv, 1); + return emit_runtime_call(ctx, atomic_fence, argv, 2); } static jl_cgval_t emit_atomic_pointerref(jl_codectx_t &ctx, ArrayRef argv) @@ -1339,7 +1348,7 @@ static jl_cgval_t emit_intrinsic(jl_codectx_t &ctx, intrinsic f, jl_value_t **ar case atomic_fence: ++Emitted_atomic_fence; - assert(nargs == 1); + assert(nargs == 2); return emit_atomicfence(ctx, argv); case atomic_pointerref: ++Emitted_atomic_pointerref; diff --git a/src/intrinsics.h b/src/intrinsics.h index 5765e3e671bc6..4fd5630afa38f 100644 --- a/src/intrinsics.h +++ b/src/intrinsics.h @@ -95,7 +95,7 @@ ADD_I(pointerref, 3) \ ADD_I(pointerset, 4) \ /* pointer atomics */ \ - ADD_I(atomic_fence, 1) \ + ADD_I(atomic_fence, 2) \ ADD_I(atomic_pointerref, 2) \ ADD_I(atomic_pointerset, 3) \ ADD_I(atomic_pointerswap, 3) \ diff --git a/src/jl_exported_funcs.inc b/src/jl_exported_funcs.inc index d0cfb16062b7d..1a0ff14a82b55 100644 --- a/src/jl_exported_funcs.inc +++ b/src/jl_exported_funcs.inc @@ -506,6 +506,7 @@ XX(jl_vprintf) \ XX(jl_wakeup_thread) \ XX(jl_write_compiler_output) \ + XX(jl_membarrier) \ #define JL_RUNTIME_EXPORTED_FUNCS_WIN(XX) \ XX(jl_setjmp) \ diff --git a/src/julia_internal.h b/src/julia_internal.h index 94036981f950a..9a85d1086d694 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -1689,7 +1689,7 @@ STATIC_INLINE int is_valid_intrinsic_elptr(jl_value_t *ety) JL_DLLEXPORT jl_value_t *jl_bitcast(jl_value_t *ty, jl_value_t *v); JL_DLLEXPORT jl_value_t *jl_pointerref(jl_value_t *p, jl_value_t *i, jl_value_t *align); JL_DLLEXPORT jl_value_t *jl_pointerset(jl_value_t *p, jl_value_t *x, jl_value_t *align, jl_value_t *i); -JL_DLLEXPORT jl_value_t 
*jl_atomic_fence(jl_value_t *order); +JL_DLLEXPORT jl_value_t *jl_atomic_fence(jl_value_t *order, jl_value_t *syncscope); JL_DLLEXPORT jl_value_t *jl_atomic_pointerref(jl_value_t *p, jl_value_t *order); JL_DLLEXPORT jl_value_t *jl_atomic_pointerset(jl_value_t *p, jl_value_t *x, jl_value_t *order); JL_DLLEXPORT jl_value_t *jl_atomic_pointerswap(jl_value_t *p, jl_value_t *x, jl_value_t *order); @@ -2010,6 +2010,8 @@ JL_DLLEXPORT int jl_isabspath(const char *in) JL_NOTSAFEPOINT; XX(uninferred_sym) \ XX(unordered_sym) \ XX(unused_sym) \ + XX(singlethread_sym) \ + XX(system_sym) #define XX(name) extern JL_DLLEXPORT jl_sym_t *jl_##name; JL_COMMON_SYMBOLS(XX) diff --git a/src/runtime_intrinsics.c b/src/runtime_intrinsics.c index 31dd3e085033c..d03a97e93d14e 100644 --- a/src/runtime_intrinsics.c +++ b/src/runtime_intrinsics.c @@ -622,9 +622,16 @@ JL_DLLEXPORT jl_value_t *jl_atomic_pointerreplace(jl_value_t *p, jl_value_t *exp return result; } -JL_DLLEXPORT jl_value_t *jl_atomic_fence(jl_value_t *order_sym) +JL_DLLEXPORT jl_value_t *jl_atomic_fence(jl_value_t *order_sym, jl_value_t *syncscope_sym) { JL_TYPECHK(fence, symbol, order_sym); + JL_TYPECHK(fence, symbol, syncscope_sym); + if ((jl_sym_t*)syncscope_sym == jl_singlethread_sym) { + asm volatile ("" : : : "memory"); + return jl_nothing; + } else if ((jl_sym_t*)syncscope_sym != jl_system_sym) { + jl_error("atomic_fence: invalid syncscope"); + } enum jl_memory_order order = jl_get_atomic_order_checked((jl_sym_t*)order_sym, 1, 1); if (order > jl_memory_order_monotonic) jl_fence(); diff --git a/src/signals-mach.c b/src/signals-mach.c index 1ef3e9d23094a..e138ca01f90a0 100644 --- a/src/signals-mach.c +++ b/src/signals-mach.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -891,3 +892,54 @@ JL_DLLEXPORT void jl_profile_stop_timer(void) profile_all_tasks = 0; uv_mutex_unlock(&bt_data_prof_lock); } + +// The mprotect implementation in signals-unix.c does not work on macOS/aarch64, as mentioned. +// This implementation comes from dotnet, but is similarly dependent on undocumented behavior of the OS. +// Copyright (c) .NET Foundation and Contributors +// MIT LICENSE +JL_DLLEXPORT void jl_membarrier(void) { + mach_msg_type_number_t cThreads; + thread_act_t *pThreads; + kern_return_t machret = task_threads(mach_task_self(), &pThreads, &cThreads); + HANDLE_MACH_ERROR("task_threads()", machret); + + uintptr_t sp; + uintptr_t registerValues[128]; + + // Iterate through each of the threads in the list. 
+ for (mach_msg_type_number_t i = 0; i < cThreads; i++) + { + if (__builtin_available (macOS 10.14, iOS 12, tvOS 9, *)) + { + // Request the threads pointer values to force the thread to emit a memory barrier + size_t registers = 128; + machret = thread_get_register_pointer_values(pThreads[i], &sp, ®isters, registerValues); + } + else + { + // fallback implementation for older OS versions +#if defined(_CPU_X86_64_) + x86_thread_state64_t threadState; + mach_msg_type_number_t count = x86_THREAD_STATE64_COUNT; + machret = thread_get_state(pThreads[i], x86_THREAD_STATE64, (thread_state_t)&threadState, &count); +#elif defined(_CPU_AARCH64_) + arm_thread_state64_t threadState; + mach_msg_type_number_t count = ARM_THREAD_STATE64_COUNT; + machret = thread_get_state(pThreads[i], ARM_THREAD_STATE64, (thread_state_t)&threadState, &count); +#else + #error Unexpected architecture +#endif + } + + if (machret == KERN_INSUFFICIENT_BUFFER_SIZE) + { + HANDLE_MACH_ERROR("thread_get_register_pointer_values()", machret); + } + + machret = mach_port_deallocate(mach_task_self(), pThreads[i]); + HANDLE_MACH_ERROR("mach_port_deallocate()", machret); + } + // Deallocate the thread list now we're done with it. + machret = vm_deallocate(mach_task_self(), (vm_address_t)pThreads, cThreads * sizeof(thread_act_t)); + HANDLE_MACH_ERROR("vm_deallocate()", machret); +} diff --git a/src/signals-unix.c b/src/signals-unix.c index 16e70ef0f764e..12f49b42d5df6 100644 --- a/src/signals-unix.c +++ b/src/signals-unix.c @@ -1274,3 +1274,128 @@ JL_DLLEXPORT int jl_repl_raise_sigtstp(void) { return raise(SIGTSTP); } + +// Linux and FreeBSD have compatible membarrier support +#if defined(_OS_LINUX_) || defined(_OS_FREEBSD_) +#if defined(_OS_LINUX_) +# include +# if defined(__has_include) +# if __has_include() +# include +# define membarrier(...) syscall(__NR_membarrier, __VA_ARGS__) +# else +# if defined(__NR_membarrier) +enum membarrier_cmd { + MEMBARRIER_CMD_QUERY = 0, + MEMBARRIER_CMD_PRIVATE_EXPEDITED = (1 << 3), + MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED = (1 << 4), +}; +# define membarrier(...) syscall(__NR_membarrier, __VA_ARGS__) +# else +# warning "Missing linux kernel headers for membarrier syscall, support disabled" +# define membarrier(...) -ENOSYS +# endif +# endif +# else +# include +# endif +#elif defined(_OS_FREEBSD_) +# include +# if __FreeBSD_version >= 1401500 +# include +# else +# define MEMBARRIER_CMD_QUERY 0x00 +# define MEMBARRIER_CMD_PRIVATE_EXPEDITED 0x08 +# define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED 0x10 +# define membarrier(...) -ENOSYS +# endif +#endif + +// Implementation of the `mprotect` based membarrier fallback. +// This is a common fallback based on the observation that `mprotect` happens to +// issue the necessary memory barriers. However, there is no spec that +// guarantees this behavior, and indeed AArch64 macos does not. However, we +// only use it as a fallback here for older versions of Linux and FreeBSD where +// we know that it happens to work. 
+static pthread_mutex_t mprotect_barrier_lock = PTHREAD_MUTEX_INITIALIZER; +static _Atomic(uint64_t) *mprotect_barrier_page = NULL; +static void jl_init_mprotect_membarrier(void) +{ + int result = pthread_mutex_lock(&mprotect_barrier_lock); + assert(result == 0); + if (mprotect_barrier_page == NULL) { + size_t pagesize = jl_getpagesize(); + + mprotect_barrier_page = (_Atomic(uint64_t) *) + mmap(NULL, pagesize, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (mprotect_barrier_page == MAP_FAILED) { + jl_safe_printf("fatal: failed to allocate barrier page.\n"); + abort(); + } + result = mlock(mprotect_barrier_page, pagesize); + if (result != 0) { + jl_safe_printf("fatal: failed to mlock barrier page.\n"); + abort(); + } + } + result = pthread_mutex_unlock(&mprotect_barrier_lock); + assert(result == 0); + (void)result; +} + +static void jl_mprotect_membarrier(void) +{ + int result = pthread_mutex_lock(&mprotect_barrier_lock); + assert(result == 0); + size_t pagesize = jl_getpagesize(); + result = mprotect(mprotect_barrier_page, pagesize, PROT_NONE); + jl_atomic_fetch_add_relaxed(mprotect_barrier_page, 1); + assert(result == 0); + result = mprotect(mprotect_barrier_page, pagesize, PROT_READ | PROT_WRITE); + assert(result == 0); + result = pthread_mutex_unlock(&mprotect_barrier_lock); + assert(result == 0); + (void)result; +} + +// Implementation of `jl_membarrier` +enum membarrier_implementation { + MEMBARRIER_IMPLEMENTATION_UNKNOWN = 0, + MEMBARRIER_IMPLEMENTATION_SYS_MEMBARRIER = 1, + MEMBARRIER_IMPLEMENTATION_MPROTECT = 2 +}; + +static _Atomic(enum membarrier_implementation) membarrier_impl = MEMBARRIER_IMPLEMENTATION_UNKNOWN; + +static enum membarrier_implementation jl_init_membarrier(void) { + int ret = membarrier(MEMBARRIER_CMD_QUERY, 0); + int needed = MEMBARRIER_CMD_PRIVATE_EXPEDITED | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED; + if (ret > 0 && ((ret & needed) == needed)) { + // supported + if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0) == 0) { + // working + jl_atomic_store_relaxed(&membarrier_impl, MEMBARRIER_IMPLEMENTATION_SYS_MEMBARRIER); + return MEMBARRIER_IMPLEMENTATION_SYS_MEMBARRIER; + } + } + jl_init_mprotect_membarrier(); + jl_atomic_store_relaxed(&membarrier_impl, MEMBARRIER_IMPLEMENTATION_MPROTECT); + return MEMBARRIER_IMPLEMENTATION_MPROTECT; +} + +JL_DLLEXPORT void jl_membarrier(void) { + enum membarrier_implementation impl = jl_atomic_load_relaxed(&membarrier_impl); + if (impl == MEMBARRIER_IMPLEMENTATION_UNKNOWN) { + impl = jl_init_membarrier(); + } + if (impl == MEMBARRIER_IMPLEMENTATION_SYS_MEMBARRIER) { + int ret = membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0); + assert(ret); + (void)ret; + } else { + assert(impl == MEMBARRIER_IMPLEMENTATION_MPROTECT); + jl_mprotect_membarrier(); + } +} +#endif diff --git a/src/signals-win.c b/src/signals-win.c index 1702bc338b16f..e0968b35e8555 100644 --- a/src/signals-win.c +++ b/src/signals-win.c @@ -664,3 +664,7 @@ void jl_install_thread_signal_handler(jl_ptls_t ptls) have_backtrace_fiber = 1; } } + +JL_DLLEXPORT void jl_membarrier(void) { + FlushProcessWriteBuffers(); +} diff --git a/test/intrinsics.jl b/test/intrinsics.jl index 5e18c1fb3672a..03ba86f85676e 100644 --- a/test/intrinsics.jl +++ b/test/intrinsics.jl @@ -395,13 +395,13 @@ end end using Base.Experimental: @force_compile -@test_throws ConcurrencyViolationError("invalid atomic ordering") (@force_compile; Core.Intrinsics.atomic_fence(:u)) === nothing -@test_throws ConcurrencyViolationError("invalid atomic ordering") (@force_compile; 
Core.Intrinsics.atomic_fence(Symbol("u", "x"))) === nothing -@test_throws ConcurrencyViolationError("invalid atomic ordering") Core.Intrinsics.atomic_fence(Symbol("u", "x")) === nothing +@test_throws ConcurrencyViolationError("invalid atomic ordering") (@force_compile; Core.Intrinsics.atomic_fence(:u, :system)) === nothing +@test_throws ConcurrencyViolationError("invalid atomic ordering") (@force_compile; Core.Intrinsics.atomic_fence(Symbol("u", "x"), :system)) === nothing +@test_throws ConcurrencyViolationError("invalid atomic ordering") Core.Intrinsics.atomic_fence(Symbol("u", "x"), :system) === nothing for order in (:not_atomic, :monotonic, :acquire, :release, :acquire_release, :sequentially_consistent) - @test Core.Intrinsics.atomic_fence(order) === nothing - @test (order -> Core.Intrinsics.atomic_fence(order))(order) === nothing - @test Base.invokelatest(@eval () -> Core.Intrinsics.atomic_fence($(QuoteNode(order)))) === nothing + @test Core.Intrinsics.atomic_fence(order, :system) === nothing + @test (order -> Core.Intrinsics.atomic_fence(order, :system))(order) === nothing + @test Base.invokelatest(@eval () -> Core.Intrinsics.atomic_fence($(QuoteNode(order)), :system)) === nothing end @test Core.Intrinsics.atomic_pointerref(C_NULL, :sequentially_consistent) === nothing @test (@force_compile; Core.Intrinsics.atomic_pointerref(C_NULL, :sequentially_consistent)) === nothing diff --git a/test/threads_exec.jl b/test/threads_exec.jl index 2780888546964..d9590b76e78c1 100644 --- a/test/threads_exec.jl +++ b/test/threads_exec.jl @@ -464,6 +464,54 @@ function test_fence() end test_fence() +# Test asymmetric thread fences +const asymmetric_test_count = 200_000 +struct AsymmetricFenceTestData + x::AtomicMemory{Int} + y::AtomicMemory{Int} + read_x::AtomicMemory{Int} + read_y::AtomicMemory{Int} +end +function test_asymmetric_fence(data::AsymmetricFenceTestData, cond1, cond2, threadid, it) + if (threadid % 2) == 0 + @atomic :monotonic data.x[it] = 1 + Threads.atomic_fence_heavy() + @atomic :monotonic data.read_y[it] = @atomic :monotonic data.y[it] + wait(cond1) + notify(cond2) + else + @atomic :monotonic data.y[it] = 1 + Threads.atomic_fence_light() + @atomic :monotonic data.read_x[it] = data.x[it] + notify(cond1) + wait(cond2) + end +end +function test_asymmetric_fence(data, cond1, cond2, threadid) + for i = 1:asymmetric_test_count + test_asymmetric_fence(data, cond1, cond2, threadid, i) + end +end +function test_asymmetric_fence() + cond1 = Threads.Event(true) + cond2 = Threads.Event(true) + data = AsymmetricFenceTestData(AtomicMemory{Int}(undef, asymmetric_test_count), + AtomicMemory{Int}(undef, asymmetric_test_count), + AtomicMemory{Int}(undef, asymmetric_test_count), + AtomicMemory{Int}(undef, asymmetric_test_count)) + for i = 1:asymmetric_test_count + @atomic :monotonic data.x[i] = 0 + @atomic :monotonic data.y[i] = 0 + @atomic :monotonic data.read_x[i] = typemax(Int) + @atomic :monotonic data.read_y[i] = typemax(Int) + end + t1 = @Threads.spawn test_asymmetric_fence(data, cond1, cond2, 1) + t2 = @Threads.spawn test_asymmetric_fence(data, cond1, cond2, 2) + wait(t1); wait(t2) + @test !any((data.read_x .== 0) .& (data.read_y .== 0)) +end +test_asymmetric_fence() + # Test load / store with various types let atomictypes = (Int8, Int16, Int32, Int64, Int128, UInt8, UInt16, UInt32, UInt64, UInt128, From 6911fa0c3c3bbafa330feebbd021acf2fcefecd6 Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Fri, 5 Dec 2025 00:12:16 +0000 Subject: [PATCH 2/9] Address review --- NEWS.md | 4 ++ base/atomics.jl | 26 
+++++----- doc/src/base/multi-threading.md | 2 + src/runtime_intrinsics.c | 2 +- src/signals-mach.c | 24 +++------ src/signals-unix.c | 87 +++++++++++++++++---------------- test/threads_exec.jl | 18 ++++--- 7 files changed, 82 insertions(+), 81 deletions(-) diff --git a/NEWS.md b/NEWS.md index bd72e673728f0..571ad6f37ac80 100644 --- a/NEWS.md +++ b/NEWS.md @@ -21,6 +21,10 @@ Command-line option changes Multi-threading changes ----------------------- + - New functions `Threads.atomic_fence_heavy` and `Threads.atomic_fence_light` provide support for + asymmetric atomic fences, speeding up atomic synchronization where one side of the synchronization + runs significantly less often than the other ([#60311]). + Build system changes -------------------- diff --git a/base/atomics.jl b/base/atomics.jl index 155cb38d6d03b..21689e06ba1a1 100644 --- a/base/atomics.jl +++ b/base/atomics.jl @@ -10,7 +10,7 @@ export atomic_add!, atomic_sub!, atomic_and!, atomic_nand!, atomic_or!, atomic_xor!, atomic_max!, atomic_min!, - atomic_fence + atomic_fence, atomic_fence_light, atomic_fence_heavy """ Threads.Atomic{T} @@ -334,23 +334,23 @@ atomic_fence() = Core.Intrinsics.atomic_fence(:sequentially_consistent, :system) """ Threads.atomic_fence_light() -This is a read-optimized sequential-consistency memory fence. -On supported operating systems and architectures, this fence is cheaper -than `Threads.atomic_fence()`, but synchronizes only with -[`atomic_fence_heavy`](@ref) calls from other threads. +Insert the light side of an asymmetric sequential-consistency memory fence. +Asymmetric memory fences are useful in scenarios where one side of the +synchronization runs significantly less often than the other side. Use this +function on the side that runs often and [`atomic_fence_heavy`](@ref) on the +side that runs rarely. + +On supported operating systems and architectures this fence is cheaper than +`Threads.atomic_fence()`, but synchronizes only with [`atomic_fence_heavy`](@ref) +calls from other threads. """ atomic_fence_light() = Core.Intrinsics.atomic_fence(:sequentially_consistent, :singlethread) """ Threads.atomic_fence_heavy() -This is a write-optimized sequential-consistency memory fence. -This fence is significantly more expensive than `Threads.atomic_fence`. -It generally requires a system call and a full interprocessor interrupt -to all other processors in the system. It synchronizes with both -[`atomic_fence_light`](@ref) and [`atomic_fence`](@ref) calls from other threads. - -For further details, see the Linux `membarrier` syscall or the Windows -`FlushProcessWriteBuffers` API. +Insert the heavy side of an asymmetric sequential-consistency memory fence. +Use this function on the side that runs rarely. +See [`atomic_fence_light`](@ref) for more details. """ atomic_fence_heavy() = ccall(:jl_membarrier, Cvoid, ()) diff --git a/doc/src/base/multi-threading.md b/doc/src/base/multi-threading.md index 88dc2b7514a2a..0b7d545e54d08 100644 --- a/doc/src/base/multi-threading.md +++ b/doc/src/base/multi-threading.md @@ -50,6 +50,8 @@ Base.Threads.atomic_xor! Base.Threads.atomic_max! Base.Threads.atomic_min!
Base.Threads.atomic_fence +Base.Threads.atomic_fence_heavy +Base.Threads.atomic_fence_light ``` ## ccall using a libuv threadpool (Experimental) diff --git a/src/runtime_intrinsics.c b/src/runtime_intrinsics.c index d03a97e93d14e..c2abb705fd7c6 100644 --- a/src/runtime_intrinsics.c +++ b/src/runtime_intrinsics.c @@ -626,13 +626,13 @@ JL_DLLEXPORT jl_value_t *jl_atomic_fence(jl_value_t *order_sym, jl_value_t *sync { JL_TYPECHK(fence, symbol, order_sym); JL_TYPECHK(fence, symbol, syncscope_sym); + enum jl_memory_order order = jl_get_atomic_order_checked((jl_sym_t*)order_sym, 1, 1); if ((jl_sym_t*)syncscope_sym == jl_singlethread_sym) { asm volatile ("" : : : "memory"); return jl_nothing; } else if ((jl_sym_t*)syncscope_sym != jl_system_sym) { jl_error("atomic_fence: invalid syncscope"); } - enum jl_memory_order order = jl_get_atomic_order_checked((jl_sym_t*)order_sym, 1, 1); if (order > jl_memory_order_monotonic) jl_fence(); return jl_nothing; diff --git a/src/signals-mach.c b/src/signals-mach.c index e138ca01f90a0..55a430425dce5 100644 --- a/src/signals-mach.c +++ b/src/signals-mach.c @@ -898,22 +898,20 @@ JL_DLLEXPORT void jl_profile_stop_timer(void) // Copyright (c) .NET Foundation and Contributors // MIT LICENSE JL_DLLEXPORT void jl_membarrier(void) { - mach_msg_type_number_t cThreads; - thread_act_t *pThreads; - kern_return_t machret = task_threads(mach_task_self(), &pThreads, &cThreads); - HANDLE_MACH_ERROR("task_threads()", machret); - uintptr_t sp; uintptr_t registerValues[128]; + kern_return_t machret; // Iterate through each of the threads in the list. - for (mach_msg_type_number_t i = 0; i < cThreads; i++) - { + int nthreads = jl_atomic_load_acquire(&jl_n_threads); + for (int tid = 0; tid < nthreads; tid++) { + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; + thread_act_t thread = pthread_mach_thread_np(ptls2->system_id); if (__builtin_available (macOS 10.14, iOS 12, tvOS 9, *)) { // Request the threads pointer values to force the thread to emit a memory barrier size_t registers = 128; - machret = thread_get_register_pointer_values(pThreads[i], &sp, ®isters, registerValues); + machret = thread_get_register_pointer_values(thread, &sp, ®isters, registerValues); } else { @@ -921,11 +919,11 @@ JL_DLLEXPORT void jl_membarrier(void) { #if defined(_CPU_X86_64_) x86_thread_state64_t threadState; mach_msg_type_number_t count = x86_THREAD_STATE64_COUNT; - machret = thread_get_state(pThreads[i], x86_THREAD_STATE64, (thread_state_t)&threadState, &count); + machret = thread_get_state(thread, x86_THREAD_STATE64, (thread_state_t)&threadState, &count); #elif defined(_CPU_AARCH64_) arm_thread_state64_t threadState; mach_msg_type_number_t count = ARM_THREAD_STATE64_COUNT; - machret = thread_get_state(pThreads[i], ARM_THREAD_STATE64, (thread_state_t)&threadState, &count); + machret = thread_get_state(thread, ARM_THREAD_STATE64, (thread_state_t)&threadState, &count); #else #error Unexpected architecture #endif @@ -935,11 +933,5 @@ JL_DLLEXPORT void jl_membarrier(void) { { HANDLE_MACH_ERROR("thread_get_register_pointer_values()", machret); } - - machret = mach_port_deallocate(mach_task_self(), pThreads[i]); - HANDLE_MACH_ERROR("mach_port_deallocate()", machret); } - // Deallocate the thread list now we're done with it. 
- machret = vm_deallocate(mach_task_self(), (vm_address_t)pThreads, cThreads * sizeof(thread_act_t)); - HANDLE_MACH_ERROR("vm_deallocate()", machret); } diff --git a/src/signals-unix.c b/src/signals-unix.c index 12f49b42d5df6..e62f9d738d8ba 100644 --- a/src/signals-unix.c +++ b/src/signals-unix.c @@ -1275,48 +1275,15 @@ JL_DLLEXPORT int jl_repl_raise_sigtstp(void) return raise(SIGTSTP); } -// Linux and FreeBSD have compatible membarrier support -#if defined(_OS_LINUX_) || defined(_OS_FREEBSD_) -#if defined(_OS_LINUX_) -# include -# if defined(__has_include) -# if __has_include() -# include -# define membarrier(...) syscall(__NR_membarrier, __VA_ARGS__) -# else -# if defined(__NR_membarrier) -enum membarrier_cmd { - MEMBARRIER_CMD_QUERY = 0, - MEMBARRIER_CMD_PRIVATE_EXPEDITED = (1 << 3), - MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED = (1 << 4), -}; -# define membarrier(...) syscall(__NR_membarrier, __VA_ARGS__) -# else -# warning "Missing linux kernel headers for membarrier syscall, support disabled" -# define membarrier(...) -ENOSYS -# endif -# endif -# else -# include -# endif -#elif defined(_OS_FREEBSD_) -# include -# if __FreeBSD_version >= 1401500 -# include -# else -# define MEMBARRIER_CMD_QUERY 0x00 -# define MEMBARRIER_CMD_PRIVATE_EXPEDITED 0x08 -# define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED 0x10 -# define membarrier(...) -ENOSYS -# endif -#endif - +#if !defined(_OS_DARWIN_) // Implementation of the `mprotect` based membarrier fallback. // This is a common fallback based on the observation that `mprotect` happens to // issue the necessary memory barriers. However, there is no spec that -// guarantees this behavior, and indeed AArch64 macos does not. However, we -// only use it as a fallback here for older versions of Linux and FreeBSD where -// we know that it happens to work. +// guarantees this behavior, and indeed AArch64 Darwin does not (so we don't use it +// there). However, we only use it as a fallback here for older versions of +// Linux and FreeBSD where we know that it happens to work. We also use it as a +// fallback for unknown Unix systems under the assumption that it will work, +// but this is not guaranteed. 
static pthread_mutex_t mprotect_barrier_lock = PTHREAD_MUTEX_INITIALIZER; static _Atomic(uint64_t) *mprotect_barrier_page = NULL; static void jl_init_mprotect_membarrier(void) @@ -1335,7 +1302,7 @@ static void jl_init_mprotect_membarrier(void) } result = mlock(mprotect_barrier_page, pagesize); if (result != 0) { - jl_safe_printf("fatal: failed to mlock barrier page.\n"); + jl_safe_printf("fatal: failed to mlock barrier page (try increasing RLIMIT_MEMLOCK with `ulimit -l`).\n"); abort(); } } @@ -1349,15 +1316,43 @@ static void jl_mprotect_membarrier(void) int result = pthread_mutex_lock(&mprotect_barrier_lock); assert(result == 0); size_t pagesize = jl_getpagesize(); - result = mprotect(mprotect_barrier_page, pagesize, PROT_NONE); + result = mprotect(mprotect_barrier_page, pagesize, PROT_READ | PROT_WRITE); jl_atomic_fetch_add_relaxed(mprotect_barrier_page, 1); assert(result == 0); - result = mprotect(mprotect_barrier_page, pagesize, PROT_READ | PROT_WRITE); + result = mprotect(mprotect_barrier_page, pagesize, PROT_NONE); assert(result == 0); result = pthread_mutex_unlock(&mprotect_barrier_lock); assert(result == 0); (void)result; } +#endif + +// Linux and FreeBSD have compatible membarrier support +#if defined(_OS_LINUX_) || defined(_OS_FREEBSD_) +#if defined(_OS_LINUX_) +# include +# if defined(__NR_membarrier) +enum membarrier_cmd { + MEMBARRIER_CMD_QUERY = 0, + MEMBARRIER_CMD_PRIVATE_EXPEDITED = (1 << 3), + MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED = (1 << 4), +}; +# define membarrier(...) syscall(__NR_membarrier, __VA_ARGS__) +# else +# warning "Missing linux kernel headers for membarrier syscall, support disabled" +# define membarrier(...) (errno = ENOSYS, -1) +# endif +#elif defined(_OS_FREEBSD_) +# include +# if __FreeBSD_version >= 1401500 +# include +# else +# define MEMBARRIER_CMD_QUERY 0x00 +# define MEMBARRIER_CMD_PRIVATE_EXPEDITED 0x08 +# define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED 0x10 +# define membarrier(...) 
(errno = ENOSYS, -1) +# endif +#endif // Implementation of `jl_membarrier` enum membarrier_implementation { @@ -1391,11 +1386,17 @@ JL_DLLEXPORT void jl_membarrier(void) { } if (impl == MEMBARRIER_IMPLEMENTATION_SYS_MEMBARRIER) { int ret = membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0); - assert(ret); + assert(ret == 0); (void)ret; } else { assert(impl == MEMBARRIER_IMPLEMENTATION_MPROTECT); jl_mprotect_membarrier(); } } +#elif !defined(_OS_DARWIN_) +JL_DLLEXPORT void jl_membarrier(void) { + if (!mprotect_barrier_page) + jl_init_mprotect_membarrier(); + jl_mprotect_membarrier(); +} #endif diff --git a/test/threads_exec.jl b/test/threads_exec.jl index d9590b76e78c1..6fb4a9f2b7b89 100644 --- a/test/threads_exec.jl +++ b/test/threads_exec.jl @@ -465,8 +465,8 @@ end test_fence() # Test asymmetric thread fences -const asymmetric_test_count = 200_000 struct AsymmetricFenceTestData + n::Int x::AtomicMemory{Int} y::AtomicMemory{Int} read_x::AtomicMemory{Int} @@ -482,23 +482,25 @@ function test_asymmetric_fence(data::AsymmetricFenceTestData, cond1, cond2, thre else @atomic :monotonic data.y[it] = 1 Threads.atomic_fence_light() - @atomic :monotonic data.read_x[it] = data.x[it] + @atomic :monotonic data.read_x[it] = @atomic :monotonic data.x[it] notify(cond1) wait(cond2) end end -function test_asymmetric_fence(data, cond1, cond2, threadid) - for i = 1:asymmetric_test_count +function test_asymmetric_fence(data::AsymmetricFenceTestData, cond1, cond2, threadid) + for i = 1:data.n test_asymmetric_fence(data, cond1, cond2, threadid, i) end end function test_asymmetric_fence() + asymmetric_test_count = 200_000 cond1 = Threads.Event(true) cond2 = Threads.Event(true) - data = AsymmetricFenceTestData(AtomicMemory{Int}(undef, asymmetric_test_count), - AtomicMemory{Int}(undef, asymmetric_test_count), - AtomicMemory{Int}(undef, asymmetric_test_count), - AtomicMemory{Int}(undef, asymmetric_test_count)) + data = AsymmetricFenceTestData(asymmetric_test_count, + AtomicMemory{Int}(undef, asymmetric_test_count), + AtomicMemory{Int}(undef, asymmetric_test_count), + AtomicMemory{Int}(undef, asymmetric_test_count), + AtomicMemory{Int}(undef, asymmetric_test_count)) for i = 1:asymmetric_test_count @atomic :monotonic data.x[i] = 0 @atomic :monotonic data.y[i] = 0 From 4900004770605aaa86fe7742b51baf81a650aa87 Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Fri, 28 Nov 2025 20:36:23 +0000 Subject: [PATCH 3/9] Very WIP: Architecture for robust cancellation This commit is a first sketch for what I would like to do for robust cancellation (i.e. "Making ^C just work"). At this point it's more of a sketch than a real PR, but I think I've done enough of the design for a design discussion. The first thing I should say is that the goal of this PR is very narrowly to make ^C work well. As part of that, we're taking a bit of a step towards structured concurrency, but I am not intending this PR to be a full implementation of that. Given that some of this has been beaten to death in previous issues, I will also not do my usual motivation overview, instead jumping straight into the implementation. As I said, the motivation is just to make ^C work reliably at this point. Broadly, when we're trying to cancel a task, it'll be in one of two categories: 1. Waiting for some other operation to complete (e.g. an IO operation, another task, an external event, etc.). Here, the actual cancellation itself is not so difficult (after all the task is not running, but suspended in a somewhat well-defined place).
However, robust cancellation requires us to potentially propagate the cancellation signal down the wait tree, since the operation we actually want to cancel may not be the root task, but may instead be some operation being performed by the task we're waiting on (and we'd prefer not to leak those operations and have rogue tasks going around performing potentially side-effecting operations). 2. Currently running and doing some computation. The core problem is not really one of propagation (after all the long-running computation is probably what we're wanting to cancel), but rather how to do the cancellation without state corruption. A lot of the crashiness of our existing ^C implementation is just that we would simply inject an exception in places that are not expecting to handle it. For a full solution to the problem, we need to have an answer for both of these points. I will begin with the second, since the first builds upon it. This PR introduces the concept of a `cancellation request` and a `cancellation point`. Each task has a `cancellation_request` field that can be set externally (e.g. by ^C). Any task performing computation should regularly check this field and abort its computation if a cancellation request is pending. For this purpose, the PR provides the `@cancel_check` macro. This macro turns a pending cancellation request into a well-modeled exception. Package authors should insert a call to the macro into any long-running loops. However, there is of course some overhead to the check and it is therefore inappropriate for tight inner loops. We attempt to address this with compiler support. Note that this part is currently incompletely implemented, so the following describes the design rather than the current state of the PR. Consider the cancel_check macro: ``` macro cancel_check() quote local req = Core.cancellation_point!() if req !== nothing throw(conform_cancellation_request(req)) end end end ``` where `cancellation_point!` is a new intrinsic that defines a cancellation point. The compiler is semantically permitted to extend the cancellation point across any following effect_free calls (note that for transitivity reasons, the effect is not exactly the same, but it is morally equivalent). Upon passing a `cancellation_point!`, the system will set the current task's `reset_ctx` to this cancellation point. If a cancellation request occurs before the `reset_ctx` is cleared, the task's execution will be reset to the nearest cancellation point. I proposed this mechanism in #52291. Additionally, the `reset_ctx` can in principle be used to establish scoped cancellation handlers for external C libraries as well, although I suspect that there are not many C libraries that are actually reset-safe in the required manner (since allocation is not). Note that `cancellation_point!` is also intended to be a yield point in order to facilitate the ^C mechanism described below. However, this is not currently implemented. Turning our attention now to the first of the two cases mentioned above, we tweak the task's existing `queue` reference to become a generic (atomic) "waitee" reference. The queue is required to be obtainable from this object via the new `waitqueue` generic function. To cancel a `waiter` waiting for a waitable `waitee` object, we 1. Set the waiter's cancellation request 2. Load the `waitee` and call a new generic function `cancel_wait!`, which shall do whatever synchronization and internal bookkeeping is required to remove the task from the wait-queue and then resume the task. 3.
The `waiter` resumes in the wait code. It may now decide how and whether to propagate the cancellation to the object it was just waiting on. Note that this may involve re-queuing a wait (to wait for the cancellation of `waitee` to complete). The idea here is that this provides a well-defined context for cancellation-propagation logic to run. I wanted to avoid having any cancellation propagation logic run in parallel with actual wait code. How the cancellation propagates is a bit of a policy question and not one that I fully intend to address in this PR. My plan is to implement a basic state machine that works well for ^C (by requesting safe cancellation immediately and then requesting increasingly unsafe modes of cancellation upon timeout or repeated ^C), but I anticipate that external libraries will want to create their own cancellation request state machines, which the system supports. The implementation is incomplete, so I will not describe it here yet. One may note that there are a significant number of additional fully dynamic dispatches in this scheme (at least `waitqueue` and `cancel_wait!`, and possibly more in the future). However, note that these dynamic dispatches are confined to the cancellation path, which is not throughput-sensitive (but is latency-sensitive). The handling of ^C is delegated to a dedicated task that then gets notified from the signal handler when a SIGINT is received (similar to the existing profile listener task). There is a little bit of an additional wrinkle in that we need some logic to kick out a computational task to its nearest cancellation point if we do not have any idle threads. This logic is not yet implemented. ``` julia> sleep(1000) ^CERROR: CancellationRequest: Safe Cancellation (CANCEL_REQUEST_SAFE) Stacktrace: [1] macro expansion @ ./condition.jl:134 [inlined] [2] _trywait(t::Timer) @ Base ./asyncevent.jl:195 [3] wait @ ./asyncevent.jl:204 [inlined] [4] sleep(sec::Int64) @ Base ./asyncevent.jl:322 [5] top-level scope @ REPL[1]:1 julia> function find_collatz_counterexample() i = 1 while true j = i while true @Base.cancel_check j = collatz(j) j == 1 && break j == i && error("$j is a collatz counterexample") end i += 1 end end find_collatz_counterexample (generic function with 1 method) julia> find_collatz_counterexample() ^CERROR: CancellationRequest: Safe Cancellation (CANCEL_REQUEST_SAFE) Stacktrace: [1] macro expansion @ ./condition.jl:134 [inlined] [2] find_collatz_counterexample() @ Main ./REPL[2]:6 [3] top-level scope @ REPL[3]:1 julia> wait(@async sleep(100)) ^CERROR: TaskFailedException Stacktrace: [1] wait(t::Task; throw::Bool) @ Base ./task.jl:367 [2] wait(t::Task) @ Base ./task.jl:360 [3] top-level scope @ REPL[4]:0 [4] macro expansion @ task.jl:729 [inlined] nested task error: CancellationRequest: Safe Cancellation (CANCEL_REQUEST_SAFE) Stacktrace: [1] macro expansion @ ./condition.jl:134 [inlined] [2] _trywait(t::Timer) @ Base ./asyncevent.jl:195 [3] wait @ ./asyncevent.jl:204 [inlined] [4] sleep @ ./asyncevent.jl:322 [inlined] [5] (::var"#2#3")() @ Main ./REPL[4]:1 julia> @sync begin @async sleep(100) @async find_collatz_counterexample() end ^CERROR: nested task error: CancellationRequest: Safe Cancellation (CANCEL_REQUEST_SAFE) Stacktrace: [1] macro expansion @ ./task.jl:1234 [inlined] [2] _trywait(t::Timer) @ Base ~/julia-cancel/usr/share/julia/base/asyncevent.jl:195 [3] wait @ ./asyncevent.jl:203 [inlined] [4] sleep @ ./asyncevent.jl:321 [inlined] [5] (::var"#45#46")() @ Main ./REPL[26]:3 ...and 1 more exception.
Stacktrace: [1] sync_cancel!(c::Channel{Any}, t::Task, cr::Any, c_ex::CompositeException) @ Base ~/julia-cancel/usr/share/julia/base/task.jl:1454 [2] sync_end(c::Channel{Any}) @ Base ~/julia-cancel/usr/share/julia/base/task.jl:608 [3] macro expansion @ ./task.jl:663 [inlined] [4] (::var"#43#44")() @ Main ./REPL[5] ``` As noted above, the `@Base.cancel_check` is not intended to be required in the inner loop. Rather, the compiler is expected to extend the cancelation point from the start of the loop to the entire function. However, this is not yet implemented. --- base/Base.jl | 30 +++++ base/asyncevent.jl | 1 + base/condition.jl | 37 +++++- base/linked_list.jl | 44 +++++-- base/task.jl | 215 ++++++++++++++++++++++++++++++++-- src/builtin_proto.h | 1 + src/builtins.c | 10 ++ src/codegen.cpp | 56 +++++++++ src/jltypes.c | 14 ++- src/julia_locks.h | 1 + src/julia_threads.h | 13 +- src/llvm-late-gc-lowering.cpp | 2 +- src/llvm-pass-helpers.cpp | 3 +- src/llvm-pass-helpers.h | 1 + src/signal-handling.c | 18 +++ src/signals-unix.c | 17 ++- src/task.c | 2 + stdlib/REPL/src/REPL.jl | 4 + 18 files changed, 425 insertions(+), 44 deletions(-) diff --git a/base/Base.jl b/base/Base.jl index 57d9915239fdf..eeb047125ce34 100644 --- a/base/Base.jl +++ b/base/Base.jl @@ -371,6 +371,35 @@ function start_profile_listener() ccall(:jl_set_peek_cond, Cvoid, (Ptr{Cvoid},), cond.handle) end +function sigint_listener(cond::AsyncCondition) + while _trywait(cond) + # The SIGINT handler should have set a cancellation request on the roottask + cr = @atomic :acquire roottask.cancellation_request + cr === nothing && continue + cancel!(roottask, cr) + end + nothing +end + +function start_sigint_listener() + cond = AsyncCondition() + uv_unref(cond.handle) + t = errormonitor(Threads.@spawn(sigint_listener(cond))) + atexit() do + # destroy this callback when exiting + ccall(:jl_set_sigint_cond, Cvoid, (Ptr{Cvoid},), C_NULL) + # this will prompt any ongoing or pending event to flush also + close(cond) + # error-propagation is not needed, since the errormonitor will handle printing that better + t === current_task() || _wait(t) + end + finalizer(cond) do c + # if something goes south, still make sure we aren't keeping a reference in C to this + ccall(:jl_set_sigint_cond, Cvoid, (Ptr{Cvoid},), C_NULL) + end + ccall(:jl_set_sigint_cond, Cvoid, (Ptr{Cvoid},), cond.handle) +end + function __init__() # Base library init global _atexit_hooks_finished = false @@ -394,6 +423,7 @@ function __init__() # triggering a profile via signals is not implemented on windows start_profile_listener() end + start_sigint_listener() _require_world_age[] = get_world_counter() # Prevent spawned Julia process from getting stuck waiting on Tracy to connect. 
delete!(ENV, "JULIA_WAIT_FOR_TRACY") diff --git a/base/asyncevent.jl b/base/asyncevent.jl index 68ae27049adc0..fa9bb520cc855 100644 --- a/base/asyncevent.jl +++ b/base/asyncevent.jl @@ -192,6 +192,7 @@ function _trywait(t::Union{Timer, AsyncCondition}) unlock(t.cond) unpreserve_handle(t) end + @cancel_check() end iolock_end() end diff --git a/base/condition.jl b/base/condition.jl index fd771c9be346a..fff9943966268 100644 --- a/base/condition.jl +++ b/base/condition.jl @@ -69,6 +69,8 @@ struct GenericCondition{L<:AbstractLock} GenericCondition(l::AbstractLock) = new{typeof(l)}(IntrusiveLinkedList{Task}(), l) end +waitqueue(c::GenericCondition) = ILLRef(c.waitq, c) + show(io::IO, c::GenericCondition) = print(io, GenericCondition, "(", c.lock, ")") assert_havelock(c::GenericCondition) = assert_havelock(c.lock) @@ -84,9 +86,9 @@ function _wait2(c::GenericCondition, waiter::Task, first::Bool=false) ct = current_task() assert_havelock(c) if first - pushfirst!(c.waitq, waiter) + pushfirst!(waitqueue(c), waiter) else - push!(c.waitq, waiter) + push!(waitqueue(c), waiter) end # since _wait2 is similar to schedule, we should observe the sticky bit now if waiter.sticky && Threads.threadid(waiter) == 0 && !GC.in_finalizer() @@ -125,6 +127,15 @@ proceeding. """ function wait end +macro cancel_check() + quote + local req = Core.cancellation_point!() + if req !== nothing + throw(conform_cancellation_request(req)) + end + end +end + """ wait(c::GenericCondition; first::Bool=false) @@ -133,18 +144,32 @@ Wait for [`notify`](@ref) on `c` and return the `val` parameter passed to `notif If the keyword `first` is set to `true`, the waiter will be put _first_ in line to wake up on `notify`. Otherwise, `wait` has first-in-first-out (FIFO) behavior. """ -function wait(c::GenericCondition; first::Bool=false) +function wait(c::GenericCondition; first::Bool=false, cancel_check::Bool=true) ct = current_task() _wait2(c, ct, first) token = unlockall(c.lock) try return wait() catch - q = ct.queue; q === nothing || Base.list_deletefirst!(q::IntrusiveLinkedList{Task}, ct) + q = ct.queue; q === c && Base.list_deletefirst!(waitqueue(c), ct) rethrow() finally relockall(c.lock, token) end + cancel_check && @cancel_check() +end + +function cancel_wait!(c::GenericCondition, t::Task) + @assert (@atomic :monotonic t.cancellation_request) !== nothing + lock(c) + if t.queue !== c + unlock(c) + return false + end + Base.list_deletefirst!(waitqueue(c), t) + enq_work(t) + unlock(c) + return true end """ @@ -160,8 +185,8 @@ Return the count of tasks woken up. Return 0 if no tasks are waiting on `conditi function notify(c::GenericCondition, @nospecialize(arg), all, error) assert_havelock(c) cnt = 0 - while !isempty(c.waitq) - t = popfirst!(c.waitq) + while !isempty(waitqueue(c)) + t = popfirst!(waitqueue(c)) schedule(t, arg, error=error) cnt += 1 all || break diff --git a/base/linked_list.jl b/base/linked_list.jl index c477dc56bdb2b..cf3be4115f73d 100644 --- a/base/linked_list.jl +++ b/base/linked_list.jl @@ -1,12 +1,18 @@ # This file is a part of Julia. 
License is MIT: https://julialang.org/license mutable struct IntrusiveLinkedList{T} - # Invasive list requires that T have a field `.next >: U{T, Nothing}` and `.queue >: U{ILL{T}, Nothing}` + # Invasive list requires that T have a field `.next >: U{T, Nothing}` and `.queue::Any` head::Union{T, Nothing} tail::Union{T, Nothing} IntrusiveLinkedList{T}() where {T} = new{T}(nothing, nothing) end +struct ILLRef{T} + list::IntrusiveLinkedList{T} + waitee::Any # Invariant: waitqueue(waitee) === list +end +waitqueue(list::IntrusiveLinkedList{T}) where {T} = ILLRef(list, list) + #const list_append!! = append! #const list_deletefirst! = delete! @@ -49,9 +55,13 @@ function list_append!!(q::IntrusiveLinkedList{T}, q2::IntrusiveLinkedList{T}) wh return q end -function push!(q::IntrusiveLinkedList{T}, val::T) where T +isempty(qr::ILLRef{T}) where T = isempty(qr.list) +length(qr::ILLRef{T}) where T = length(qr.list) + +function push!(qr::ILLRef{T}, val::T) where T val.queue === nothing || error("val already in a list") - val.queue = q + val.queue = qr.waitee + q = qr.list tail = q.tail if tail === nothing q.head = q.tail = val @@ -62,9 +72,10 @@ function push!(q::IntrusiveLinkedList{T}, val::T) where T return q end -function pushfirst!(q::IntrusiveLinkedList{T}, val::T) where T +function pushfirst!(qr::ILLRef{T}, val::T) where T val.queue === nothing || error("val already in a list") - val.queue = q + val.queue = qr.waitee + q = qr.list head = q.head if head === nothing q.head = q.tail = val @@ -75,21 +86,23 @@ function pushfirst!(q::IntrusiveLinkedList{T}, val::T) where T return q end -function pop!(q::IntrusiveLinkedList{T}) where {T} - val = q.tail::T +function pop!(qr::ILLRef{T}) where {T} + val = qr.list.tail::T list_deletefirst!(q, val) # expensive! return val end -function popfirst!(q::IntrusiveLinkedList{T}) where {T} - val = q.head::T - list_deletefirst!(q, val) # cheap +function popfirst!(qr::ILLRef{T}) where {T} + val = qr.list.head::T + list_deletefirst!(qr, val) # cheap return val end # this function assumes `val` is found in `q` -function list_deletefirst!(q::IntrusiveLinkedList{T}, val::T) where T - val.queue === q || return +function list_deletefirst!(qr::ILLRef{T}, val::T) where T +# (val.queue === qr.waitee || +# val.queue === qr.list) || throw(ConcurrencyViolationError("attempt to delete from wrong list")) + q = qr.list head = q.head::T if head === val if q.tail::T === val @@ -115,6 +128,13 @@ function list_deletefirst!(q::IntrusiveLinkedList{T}, val::T) where T return q end +# TODO: Delete this compatibility wrapper +list_deletefirst!(q::IntrusiveLinkedList{T}, val::T) where T = list_deletefirst!(ILLRef(q, q), val) +push!(q::IntrusiveLinkedList{T}, val::T) where T = push!(ILLRef(q, q), val) +pushfirst!(q::IntrusiveLinkedList{T}, val::T) where T = pushfirst!(ILLRef(q, q), val) +pop!(q::IntrusiveLinkedList{T}) where T = pop!(ILLRef(q, q)) +popfirst!(q::IntrusiveLinkedList{T}) where T = popfirst!(ILLRef(q, q)) + #function list_deletefirst!(q::Array{T}, val::T) where T # i = findfirst(isequal(val), q) # i === nothing || deleteat!(q, i) diff --git a/base/task.jl b/base/task.jl index 244a8f70a768a..077d571a5fb6f 100644 --- a/base/task.jl +++ b/base/task.jl @@ -302,14 +302,14 @@ function task_local_storage(body::Function, key, val) end # just wait for a task to be done, no error propagation -function _wait(t::Task) +function _wait(t::Task; expected_cancellation = nothing) t === current_task() && Core.throw(ConcurrencyViolationError("deadlock detected: cannot wait on current task")) if 
!istaskdone(t) donenotify = t.donenotify::ThreadSynchronizer lock(donenotify) try - while !istaskdone(t) - wait(donenotify) + while !istaskdone(t) && cancellation_request() === expected_cancellation + wait(donenotify; cancel_check=false) end finally unlock(donenotify) @@ -359,12 +359,28 @@ Throws a `ConcurrencyViolationError` if `t` is the currently running task, to pr """ function wait(t::Task; throw=true) _wait(t) + cr = cancellation_request() + if cr !== nothing + propagate_cancellation!(t, cr) + end if throw && istaskfailed(t) Core.throw(TaskFailedException(t)) end nothing end +""" + wait_nocancel(t::Task) + +Like `wait`, but do not propagate cancellation of this task to the waited-on task. +""" +function wait_nocancel(t::Task; throw=true) + _wait(t) + if throw && istaskfailed(t) + Core.throw(TaskFailedException(t)) + end +end + # Wait multiple tasks """ @@ -587,6 +603,10 @@ function sync_end(c::Channel{Any}) r = take!(c) if isa(r, Task) _wait(r) + cr = cancellation_request() + if cr !== nothing + return sync_cancel!(c, r, cr, @isdefined(c_ex) ? c_ex : CompositeException()) + end if istaskfailed(r) if !@isdefined(c_ex) c_ex = CompositeException() @@ -899,12 +919,13 @@ mutable struct IntrusiveLinkedListSynchronized{T} lock::Threads.SpinLock IntrusiveLinkedListSynchronized{T}() where {T} = new(IntrusiveLinkedList{T}(), Threads.SpinLock()) end -isempty(W::IntrusiveLinkedListSynchronized) = isempty(W.queue) -length(W::IntrusiveLinkedListSynchronized) = length(W.queue) +waitqueue(l::IntrusiveLinkedListSynchronized) = ILLRef(l.queue, l) +isempty(W::IntrusiveLinkedListSynchronized) = isempty(waitqueue(W)) +length(W::IntrusiveLinkedListSynchronized) = length(waitqueue(W)) function push!(W::IntrusiveLinkedListSynchronized{T}, t::T) where T lock(W.lock) try - push!(W.queue, t) + push!(waitqueue(W), t) finally unlock(W.lock) end @@ -913,7 +934,7 @@ end function pushfirst!(W::IntrusiveLinkedListSynchronized{T}, t::T) where T lock(W.lock) try - pushfirst!(W.queue, t) + pushfirst!(waitqueue(W), t) finally unlock(W.lock) end @@ -922,7 +943,7 @@ end function pop!(W::IntrusiveLinkedListSynchronized) lock(W.lock) try - return pop!(W.queue) + return pop!(waitqueue(W)) finally unlock(W.lock) end @@ -930,7 +951,7 @@ end function popfirst!(W::IntrusiveLinkedListSynchronized) lock(W.lock) try - return popfirst!(W.queue) + return popfirst!(waitqueue(W)) finally unlock(W.lock) end @@ -938,7 +959,7 @@ end function list_deletefirst!(W::IntrusiveLinkedListSynchronized{T}, t::T) where T lock(W.lock) try - list_deletefirst!(W.queue, t) + list_deletefirst!(waitqueue(W), t) finally unlock(W.lock) end @@ -953,7 +974,10 @@ workqueue_for(tid::Int) = Workqueues[tid] function enq_work(t::Task) (t._state === task_state_runnable && t.queue === nothing) || error("schedule: Task not runnable") + _enq_work(t) +end +function _enq_work(t::Task) # Sticky tasks go into their thread's work queue. if t.sticky tid = Threads.threadid(t) @@ -1272,3 +1296,174 @@ function maybe_record_enqueued!(t::Task) end return t end + +## Cancellation + +struct CancellationRequest + request::UInt8 +end + +""" + CANCEL_REQUEST_SAFE + +Request safe cancelation of the current task. If the task is waiting for any +other resources, it will request safe cancellation of any such resources and +wait for the cancellation of such resources to be completed. + +As a result, if either the task itself or any of its dependent resources are +currently unable to process cancelation, the request may hang and a more +aggressive cancelation method may be required. 
However, in general _SAFE +should be tried first. +""" +const CANCEL_REQUEST_SAFE = CancellationRequest(0x0) + +""" + CANCEL_REQUEST_ACK + +Set by the task itself to indicate that a (safe) cancellation request was +received and acknowledged, but that there are dependent tasks for whom +cancelation is still pending. +""" +const CANCEL_REQUEST_ACK = CancellationRequest(0x1) + +""" + CANCEL_REQUEST_QUERY + +Request that the system create an asynchronous report of why the task is currently +not able to be canceled. The report will be provided in the ->cancelation_request +field of the current task (as long as this field is still CANCEL_REQUEST_QUERY). + +N.B.: Transition to CANCEL_REQUEST_QUERY is only allowed from CANCEL_REQUEST_ACK. + Once the waiting task has read the cancelation report, it may set the cancelation + request back to CANCEL_REQUEST_ACK. +""" +const CANCEL_REQUEST_QUERY = CancellationRequest(0x2) + +""" + CANCEL_REQUEST_ABANDON_EXTERNAL + +Request a cancelation that will cease waiting for any external resources (e.g. I/O objects) +without going through a safe cancelation procedure for such resources. However, the +task will wait for any internal computational tasks to complete cancelation. + +This is a middleground between CANCEL_REQUEST_SAFE and CANCEL_REQUEST_ABANDON_ALL. As external +I/O is often engineered for robustness in case of sudden disapperance of peers +""" +const CANCEL_REQUEST_ABANDON_EXTERNAL = CancellationRequest(0x3) + +""" + CANCEL_REQUEST_ABANDON_ALL + +Request a cancelation that will cease waiting for all external resources and all unacknowledged +internal tasks. Such tasks will be frozen and become unschedulable in the future. + +!!! warning + If any canceled task has acquired locks or other resources that are contested, this method of + cancelation may leak such resources and create deadlocks in future code. It is intended as a + last-resort method to recover a system, but the necessity of this operation should in general + be considered a bug (e.g. due to insufficient cancellation points in computationally-heavy code). +""" +const CANCEL_REQUEST_ABANDON_ALL = CancellationRequest(0x4) + +function Base.showerror(io::IO, cr::CancellationRequest) + print(io, "CancellationRequest: ") + if cr === CANCEL_REQUEST_SAFE + print(io, "Safe Cancellation (CANCEL_REQUEST_SAFE)") + else + print(io, "Unknown ($(cr.request))") + end +end + +function conform_cancellation_request(@nospecialize(cr)) + if isa(cr, UInt8) + return CancellationRequest(cr) + end + return cr +end + +""" + cancellation_request() + +Returns the cancellation request for the current task or `nothing` if no +cancellation has been requested. If a cancellation request is present, it is +loaded with acquire semantics. +""" +function cancellation_request() + ct = current_task() + req = @atomic :monotonic ct.cancellation_request + req === nothing && return req + cr = @atomic :acquire ct.cancellation_request + return conform_cancellation_request(cr) +end + +""" + Core.cancellation_point!() + +Like [`cancellation_request`](@ref), but additionally gives the optimizer license +to establish this point as a cancellation reset point. If safe to do, the runtime +will attempt to unwind execution to the nearest preceeding cancellation point +when a cancellation is requested. +""" +Core.cancellation_point! 
+ +function cancel!(t::Task, crequest=CANCEL_REQUEST_SAFE) + @atomic :release t.cancellation_request = crequest + while !istaskdone(t) + waitee = t.queue + waitee === nothing && (yield(); continue) + invokelatest(cancel_wait!, waitee, t) && break + end +end + +""" + Base.reset_cancellation!() + +Resets the cancellation status of the current task. +This should only be used from the root task after normal operation has been +resumed (e.g. by returning control to the user). +""" +function reset_cancellation!() + ct = current_task() + @assert ct === roottask + @atomic :release ct.cancellation_request = nothing +end + +function propagate_cancellation!(t::Task, crequest) + if crequest != CANCEL_REQUEST_SAFE + error("Not yet supported") + end + cancel!(t, crequest) + _wait(t; expected_cancellation=crequest) +end + +@noinline function sync_cancel!(c::Channel{Any}, t::Task, @nospecialize(cr), c_ex::CompositeException) + if cr !== CANCEL_REQUEST_SAFE + error("Not yet supported") + end + waitees = Any[t] + cancel!(t, cr) + while isready(c) + r = take!(c) + cancel!(r, cr) + push!(waitees, r) + end + close(c) + for r in waitees + if isa(r, Task) + _wait(r; expected_cancellation=cr) + if istaskfailed(r) + push!(c_ex, TaskFailedException(r)) + end + else + try + wait(r) + catch e + push!(c_ex, e) + end + end + end + if !isempty(c_ex) + throw(c_ex) + end + return nothing +end diff --git a/src/builtin_proto.h b/src/builtin_proto.h index ff634149a06f6..ee0d8fca8cd8a 100644 --- a/src/builtin_proto.h +++ b/src/builtin_proto.h @@ -76,6 +76,7 @@ extern "C" { XX(tuple,"tuple") \ XX(typeassert,"typeassert") \ XX(typeof,"typeof") \ + XX(cancellation_point,"cancellation_point!") #define DECLARE_BUILTIN(cname,jlname) \ JL_CALLABLE(jl_f_##cname); diff --git a/src/builtins.c b/src/builtins.c index 7b29580c76086..80ca3f3c23a1e 100644 --- a/src/builtins.c +++ b/src/builtins.c @@ -2496,6 +2496,16 @@ JL_CALLABLE(jl_f_intrinsic_call) abort(); } +JL_CALLABLE(jl_f_cancellation_point) +{ + JL_NARGS(cancellation_point, 0, 0); + jl_task_t *ct = jl_current_task; + jl_value_t *cr = jl_atomic_load_relaxed(&ct->cancellation_request); + if (cr == NULL || cr == jl_nothing) + return jl_nothing; + return jl_atomic_load_acquire(&ct->cancellation_request); +} + JL_DLLEXPORT const char *jl_intrinsic_name(int f) { switch ((enum intrinsic)f) { diff --git a/src/codegen.cpp b/src/codegen.cpp index ed427f999e8e3..c193da2c436d6 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -1197,6 +1197,17 @@ static const auto jl_write_barrier_func = new JuliaFunction<>{ }, }; +static const auto jl_cancellation_point_func = new JuliaFunction<>{ + "julia.cancellation_point", + [](LLVMContext &C) { + return FunctionType::get(getInt32Ty(C), {}, false); + }, + [](LLVMContext &C) { return AttributeList::get(C, + Attributes(C, {Attribute::ReturnsTwice}), + AttributeSet(), + None); } +}; + static const auto jlisa_func = new JuliaFunction<>{ XSTR(jl_isa), [](LLVMContext &C) { @@ -4943,6 +4954,51 @@ static bool emit_builtin_call(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f, return true; } + else if (f == BUILTIN(cancellation_point) && nargs == 0) { + // Emit the cancellation point intrinsic call first + ctx.builder.CreateCall(prepare_call(jl_cancellation_point_func)); + + // Now do the same as the runtime version: + // 1. 
Load cancellation_request with relaxed ordering for fast path check + Value *ct = get_current_task(ctx); + Value *cr_ptr = emit_ptrgep(ctx, ct, offsetof(jl_task_t, cancellation_request), "cancellation_request"); + jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_gcframe); + LoadInst *cr_relaxed = ctx.builder.CreateAlignedLoad(ctx.types().T_prjlvalue, cr_ptr, ctx.types().alignof_ptr); + cr_relaxed->setOrdering(AtomicOrdering::Monotonic); + ai.decorateInst(cr_relaxed); + + // 2. Check if cr == NULL || cr == jl_nothing + Value *is_null = ctx.builder.CreateIsNull(cr_relaxed); + Value *nothing_val = literal_pointer_val(ctx, jl_nothing); + Value *is_nothing = ctx.builder.CreateICmpEQ(decay_derived(ctx, cr_relaxed), decay_derived(ctx, nothing_val)); + Value *no_cancel = ctx.builder.CreateOr(is_null, is_nothing); + + // Save current basic block before branching + BasicBlock *currBB = ctx.builder.GetInsertBlock(); + + // Create basic blocks for the branch + BasicBlock *has_cancel_bb = BasicBlock::Create(ctx.builder.getContext(), "has_cancellation", ctx.f); + BasicBlock *merge_bb = BasicBlock::Create(ctx.builder.getContext(), "cancellation_merge", ctx.f); + + ctx.builder.CreateCondBr(no_cancel, merge_bb, has_cancel_bb); + + // In the has_cancel case, do an acquire load + ctx.builder.SetInsertPoint(has_cancel_bb); + LoadInst *cr_acquire = ctx.builder.CreateAlignedLoad(ctx.types().T_prjlvalue, cr_ptr, ctx.types().alignof_ptr); + cr_acquire->setOrdering(AtomicOrdering::Acquire); + ai.decorateInst(cr_acquire); + ctx.builder.CreateBr(merge_bb); + + // Merge the results + ctx.builder.SetInsertPoint(merge_bb); + PHINode *result = ctx.builder.CreatePHI(ctx.types().T_prjlvalue, 2); + result->addIncoming(nothing_val, currBB); + result->addIncoming(cr_acquire, has_cancel_bb); + + *ret = mark_julia_type(ctx, result, /*boxed*/ true, rt); + return true; + } + return false; } diff --git a/src/jltypes.c b/src/jltypes.c index db75be1c9db0a..bb5a0beb98494 100644 --- a/src/jltypes.c +++ b/src/jltypes.c @@ -3767,7 +3767,7 @@ void jl_init_types(void) JL_GC_DISABLED NULL, jl_any_type, jl_emptysvec, - jl_perm_symsvec(27, + jl_perm_symsvec(28, "next", "queue", "storage", @@ -3794,8 +3794,9 @@ void jl_init_types(void) JL_GC_DISABLED "first_enqueued_at", "last_started_running_at", "running_time_ns", - "finished_at"), - jl_svec(27, + "finished_at", + "cancellation_request"), + jl_svec(28, jl_any_type, jl_any_type, jl_any_type, @@ -3822,16 +3823,17 @@ void jl_init_types(void) JL_GC_DISABLED jl_uint64_type, jl_uint64_type, jl_uint64_type, - jl_uint64_type), + jl_uint64_type, + jl_any_type), jl_emptysvec, 0, 1, 6); XX(task); jl_value_t *listt = jl_new_struct(jl_uniontype_type, jl_task_type, jl_nothing_type); jl_svecset(jl_task_type->types, 0, listt); // Set field 20 (metrics_enabled) as const - // Set fields 8 (_state) and 24-27 (metric counters) as atomic + // Set fields 8 (_state) and 24-27 (metric counters), 28 (cancellation_request) as atomic const static uint32_t task_constfields[1] = { 0b00000000000010000000000000000000 }; - const static uint32_t task_atomicfields[1] = { 0b00000111100000000000000010000000 }; + const static uint32_t task_atomicfields[1] = { 0b00001111100000000000000010000000 }; jl_task_type->name->constfields = task_constfields; jl_task_type->name->atomicfields = task_atomicfields; diff --git a/src/julia_locks.h b/src/julia_locks.h index a4b5fd96b8fb4..4fe75543ad6b9 100644 --- a/src/julia_locks.h +++ b/src/julia_locks.h @@ -104,6 +104,7 @@ static inline void jl_mutex_init(jl_mutex_t *lock, 
const char *name) JL_NOTSAFEP #define JL_LOCK(m) jl_mutex_lock(m) #define JL_UNLOCK(m) jl_mutex_unlock(m) #define JL_LOCK_NOGC(m) jl_mutex_lock_nogc(m) +#define JL_TRYLOCK_NOGC(m) jl_mutex_trylock_nogc(m) #define JL_UNLOCK_NOGC(m) jl_mutex_unlock_nogc(m) JL_DLLEXPORT void jl_lock_value(jl_mutex_t *v) JL_NOTSAFEPOINT; diff --git a/src/julia_threads.h b/src/julia_threads.h index 364931e43d2e9..df6a095baf4ec 100644 --- a/src/julia_threads.h +++ b/src/julia_threads.h @@ -225,8 +225,11 @@ typedef struct _jl_handler_t jl_handler_t; typedef struct _jl_task_t { JL_DATA_TYPE - jl_value_t *next; // invasive linked list for scheduler - jl_value_t *queue; // invasive linked list for scheduler + // This invasive linked list is used by the scheduler. The fields are protected + // by the lock of the parent object of the containing queue. + jl_value_t *next; + jl_value_t *queue; + jl_value_t *tls; jl_value_t *donenotify; jl_value_t *result; @@ -252,6 +255,9 @@ typedef struct _jl_task_t { // timestamp this task finished (i.e. entered state DONE or FAILED). _Atomic(uint64_t) finished_at; + // Cancellation request - can be an arbitary julia value, but the runtime recognizes + // CANCEL_REQUEST_ enum values. + _Atomic(jl_value_t *) cancellation_request; // hidden state: // id of owning thread - does not need to be defined until the task runs @@ -280,6 +286,9 @@ typedef struct _jl_task_t { jl_handler_t *eh; // saved thread state jl_ucontext_t ctx; // pointer into stkbuf, if suspended + // current reset point for cancellation. Technically, we only need volatile + // here, but _Atomic makes the intent clearer. + volatile _jl_ucontext_t *reset_ctx; } jl_task_t; JL_DLLEXPORT void *jl_get_ptls_states(void); diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp index ae1351ae41ca1..90334e60251da 100644 --- a/src/llvm-late-gc-lowering.cpp +++ b/src/llvm-late-gc-lowering.cpp @@ -2068,7 +2068,7 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S, bool *CFGModified) { } Value *callee = CI->getCalledOperand(); if (callee && (callee == gc_flush_func || callee == gc_preserve_begin_func - || callee == gc_preserve_end_func)) { + || callee == gc_preserve_end_func || callee == cancel_point_func)) { /* No replacement */ } else if (pointer_from_objref_func != nullptr && callee == pointer_from_objref_func) { auto *obj = CI->getOperand(0); diff --git a/src/llvm-pass-helpers.cpp b/src/llvm-pass-helpers.cpp index a6a16a7f4956c..ed288328ea908 100644 --- a/src/llvm-pass-helpers.cpp +++ b/src/llvm-pass-helpers.cpp @@ -32,7 +32,7 @@ JuliaPassContext::JuliaPassContext() gc_preserve_begin_func(nullptr), gc_preserve_end_func(nullptr), pointer_from_objref_func(nullptr), gc_loaded_func(nullptr), alloc_obj_func(nullptr), typeof_func(nullptr), write_barrier_func(nullptr), pop_handler_noexcept_func(nullptr), - call_func(nullptr), call2_func(nullptr), call3_func(nullptr), module(nullptr) + call_func(nullptr), call2_func(nullptr), call3_func(nullptr), cancel_point_func(nullptr), module(nullptr) { } @@ -61,6 +61,7 @@ void JuliaPassContext::initFunctions(Module &M) call_func = M.getFunction("julia.call"); call2_func = M.getFunction("julia.call2"); call3_func = M.getFunction("julia.call3"); + cancel_point_func = M.getFunction("julia.cancellation_point"); } void JuliaPassContext::initAll(Module &M) diff --git a/src/llvm-pass-helpers.h b/src/llvm-pass-helpers.h index d79470818c287..d9c96e6451595 100644 --- a/src/llvm-pass-helpers.h +++ b/src/llvm-pass-helpers.h @@ -64,6 +64,7 @@ struct JuliaPassContext { llvm::Function 
*call_func; llvm::Function *call2_func; llvm::Function *call3_func; + llvm::Function *cancel_point_func; // Creates a pass context. Type and function pointers // are set to `nullptr`. Metadata nodes are initialized. diff --git a/src/signal-handling.c b/src/signal-handling.c index 1da687654dd81..39eb3ceedd53b 100644 --- a/src/signal-handling.c +++ b/src/signal-handling.c @@ -441,6 +441,24 @@ static void jl_check_profile_autostop(void) } } +jl_mutex_t sigint_cond_lock; +static uv_async_t *sigint_cond_loc; +JL_DLLEXPORT void jl_set_sigint_cond(uv_async_t *cond) +{ + JL_LOCK_NOGC(&sigint_cond_lock); + sigint_cond_loc = cond; + JL_UNLOCK_NOGC(&sigint_cond_lock); +} + +static void deliver_sigint_notification(void) +{ + if (JL_TRYLOCK_NOGC(&sigint_cond_lock)) { + if (sigint_cond_loc != NULL) + uv_async_send(sigint_cond_loc); + JL_UNLOCK_NOGC(&sigint_cond_lock); + } +} + static void stack_overflow_warning(void) { jl_safe_printf("Warning: detected a stack overflow; program state may be corrupted, so further execution might be unreliable.\n"); diff --git a/src/signals-unix.c b/src/signals-unix.c index e62f9d738d8ba..ec34d97cd89bc 100644 --- a/src/signals-unix.c +++ b/src/signals-unix.c @@ -418,7 +418,6 @@ JL_NO_ASAN static void segv_handler(int sig, siginfo_t *info, void *context) } else if (jl_safepoint_consume_sigint()) { jl_clear_force_sigint(); - jl_throw_in_ctx(ct, jl_interrupt_exception, sig, context); } return; } @@ -989,14 +988,20 @@ static void *signal_listener(void *arg) #endif if (sig == SIGINT) { - if (jl_ignore_sigint()) { - continue; - } - else if (exit_on_sigint) { + if (exit_on_sigint) { critical = 1; } else { - jl_try_deliver_sigint(); + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; + // Set the cancellation request, then notify the sigint listener + // that we want to cancel - if the task is not currently running, + // the sigint listener will take care of safely moving us through + // the cancellation state machine. + // TODO: If there is only one thread, we may need to ask the currently + // running task to yield, so that the sigint listener can run. 
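+                // N.B.: the raw value 0x00 stored below corresponds to
+                // Base.CANCEL_REQUEST_SAFE (CancellationRequest(0x0)) on the Julia side;
+                // the listener recovers it via conform_cancellation_request.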
+ jl_atomic_store_release(&ptls2->root_task->cancellation_request, + jl_box_uint8(0x00)); + deliver_sigint_notification(); continue; } } diff --git a/src/task.c b/src/task.c index 18d21b2343053..68ede8ff55e90 100644 --- a/src/task.c +++ b/src/task.c @@ -1151,6 +1151,7 @@ JL_DLLEXPORT jl_task_t *jl_new_task(jl_value_t *start, jl_value_t *completion_fu jl_atomic_store_relaxed(&t->running_time_ns, 0); jl_atomic_store_relaxed(&t->finished_at, 0); jl_timing_task_init(t); + jl_atomic_store_relaxed(&t->cancellation_request, jl_nothing); if (t->ctx.copy_stack) t->ctx.copy_ctx = NULL; @@ -1603,6 +1604,7 @@ jl_task_t *jl_init_root_task(jl_ptls_t ptls, void *stack_lo, void *stack_hi) jl_atomic_store_relaxed(&ct->first_enqueued_at, 0); jl_atomic_store_relaxed(&ct->last_started_running_at, 0); } + jl_atomic_store_relaxed(&ct->cancellation_request, jl_nothing); ptls->root_task = ct; jl_atomic_store_relaxed(&ptls->current_task, ct); JL_GC_PROMISE_ROOTED(ct); diff --git a/stdlib/REPL/src/REPL.jl b/stdlib/REPL/src/REPL.jl index 0c31315e9bea1..2eb8168b47d04 100644 --- a/stdlib/REPL/src/REPL.jl +++ b/stdlib/REPL/src/REPL.jl @@ -451,6 +451,10 @@ function repl_backend_loop(backend::REPLBackend, get_module::Function) while true tls = task_local_storage() tls[:SOURCE_PATH] = nothing + # TODO: Support cancellation scopes for non-root tasks + if current_task() === Base.roottask + Base.reset_cancellation!() + end ast_or_func, show_value = take!(backend.repl_channel) if show_value == -1 # exit flag From 29f503b43ff97c35bd927caf20a34ada774dae4f Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Wed, 3 Dec 2025 01:09:34 +0000 Subject: [PATCH 4/9] WIP --- Compiler/src/effects.jl | 53 ++++++++++++----- Compiler/src/ssair/show.jl | 4 +- base/asyncevent.jl | 10 +++- base/condition.jl | 36 +++++++----- base/experimental.jl | 2 +- base/linked_list.jl | 14 ++++- base/task.jl | 117 ++++++++++++++++++++++++++++++++----- src/codegen.cpp | 2 +- src/signal-handling.c | 7 ++- src/task.c | 24 ++++++++ test/cancellation.jl | 78 +++++++++++++++++++++++++ 11 files changed, 297 insertions(+), 50 deletions(-) create mode 100644 test/cancellation.jl diff --git a/Compiler/src/effects.jl b/Compiler/src/effects.jl index 3db778943f10a..4879a9981524a 100644 --- a/Compiler/src/effects.jl +++ b/Compiler/src/effects.jl @@ -11,28 +11,32 @@ The output represents the state of different effect properties in the following - `+e` (green): `ALWAYS_TRUE` - `-e` (red): `ALWAYS_FALSE` - `?e` (yellow): `EFFECT_FREE_IF_INACCESSIBLEMEMONLY` -3. `nothrow` (`n`): +3. `reset_safe` (`re`): + - `+re` (green): `ALWAYS_TRUE` + - `-re` (red): `ALWAYS_FALSE` + - `?re` (yellow): `RESET_SAFE_IF_INACCESSIBLEMEMONLY` +4. `nothrow` (`n`): - `+n` (green): `true` - `-n` (red): `false` -4. `terminates` (`t`): +5. `terminates` (`t`): - `+t` (green): `true` - `-t` (red): `false` -5. `notaskstate` (`s`): +6. `notaskstate` (`s`): - `+s` (green): `true` - `-s` (red): `false` -6. `inaccessiblememonly` (`m`): +7. `inaccessiblememonly` (`m`): - `+m` (green): `ALWAYS_TRUE` - `-m` (red): `ALWAYS_FALSE` - `?m` (yellow): `INACCESSIBLEMEM_OR_ARGMEMONLY` -7. `noub` (`u`): +8. `noub` (`u`): - `+u` (green): `true` - `-u` (red): `false` - `?u` (yellow): `NOUB_IF_NOINBOUNDS` -8. `:nonoverlayed` (`o`): +9. `:nonoverlayed` (`o`): - `+o` (green): `ALWAYS_TRUE` - `-o` (red): `ALWAYS_FALSE` - `?o` (yellow): `CONSISTENT_OVERLAY` -9. `:nortcall` (`r`): +10. `:nortcall` (`r`): - `+r` (green): `true` - `-r` (red): `false` """ @@ -62,6 +66,10 @@ following meanings: will not be refined anyway. 
* `EFFECT_FREE_IF_INACCESSIBLEMEMONLY`: the `:effect-free`-ness of this method can later be refined to `ALWAYS_TRUE` in a case when `:inaccessiblememonly` is proven. +- `reset_safe::UInt8` + * The execution of this function may be interrupted and reset to an earlier cancellation + point at any point in the function. The interpretation is similar to `:effect_free`, + but has different guarantees. - `nothrow::Bool`: this method is guaranteed to not throw an exception. If the execution of this method may raise `MethodError`s and similar exceptions, then the method is not considered as `:nothrow`. @@ -119,6 +127,7 @@ $(effects_key_string) struct Effects consistent::UInt8 effect_free::UInt8 + reset_safe::UInt8 nothrow::Bool terminates::Bool notaskstate::Bool @@ -129,6 +138,7 @@ struct Effects function Effects( consistent::UInt8, effect_free::UInt8, + reset_safe::UInt8, nothrow::Bool, terminates::Bool, notaskstate::Bool, @@ -139,6 +149,7 @@ struct Effects return new( consistent, effect_free, + reset_safe, nothrow, terminates, notaskstate, @@ -175,14 +186,18 @@ const NOUB_IF_NOINBOUNDS = 0x01 << 1 # :nonoverlayed bits const CONSISTENT_OVERLAY = 0x01 << 1 -const EFFECTS_TOTAL = Effects(ALWAYS_TRUE, ALWAYS_TRUE, true, true, true, ALWAYS_TRUE, ALWAYS_TRUE, ALWAYS_TRUE, true) -const EFFECTS_THROWS = Effects(ALWAYS_TRUE, ALWAYS_TRUE, false, true, true, ALWAYS_TRUE, ALWAYS_TRUE, ALWAYS_TRUE, true) -const EFFECTS_UNKNOWN = Effects(ALWAYS_FALSE, ALWAYS_FALSE, false, false, false, ALWAYS_FALSE, ALWAYS_FALSE, ALWAYS_TRUE, false) # unknown mostly, but it's not overlayed at least (e.g. it's not a call) +# :reset_safe bits +const RESET_SAFE_IF_INACCESSIBLEMEMONLY = 0x01 << 1 -function Effects(effects::Effects=Effects( - ALWAYS_FALSE, ALWAYS_FALSE, false, false, false, ALWAYS_FALSE, ALWAYS_FALSE, ALWAYS_FALSE, false); +const EFFECTS_TOTAL = Effects(ALWAYS_TRUE, ALWAYS_TRUE, ALWAYS_TRUE, true, true, true, ALWAYS_TRUE, ALWAYS_TRUE, ALWAYS_TRUE, true) +const EFFECTS_THROWS = Effects(ALWAYS_TRUE, ALWAYS_TRUE, ALWAYS_TRUE, false, true, true, ALWAYS_TRUE, ALWAYS_TRUE, ALWAYS_TRUE, true) +const EFFECTS_UNKNOWN = Effects(ALWAYS_FALSE, ALWAYS_FALSE, ALWAYS_FALSE, false, false, false, ALWAYS_FALSE, ALWAYS_FALSE, ALWAYS_TRUE, false) # unknown mostly, but it's not overlayed at least (e.g. 
it's not a call) +const EFFECTS_MINIMAL = Effects(ALWAYS_FALSE, ALWAYS_FALSE, ALWAYS_FALSE, false, false, false, ALWAYS_FALSE, ALWAYS_FALSE, ALWAYS_FALSE, false) + +function Effects(effects::Effects=EFFECTS_MINIMAL; consistent::UInt8 = effects.consistent, effect_free::UInt8 = effects.effect_free, + reset_safe::UInt8 = effects.reset_safe, nothrow::Bool = effects.nothrow, terminates::Bool = effects.terminates, notaskstate::Bool = effects.notaskstate, @@ -193,6 +208,7 @@ function Effects(effects::Effects=Effects( return Effects( consistent, effect_free, + reset_safe, nothrow, terminates, notaskstate, @@ -225,6 +241,14 @@ function is_better_effects(new::Effects, old::Effects) elseif new.effect_free != old.effect_free return false end + if new.reset_safe == ALWAYS_TRUE + any_improved |= old.reset_safe != ALWAYS_TRUE + elseif new.reset_safe == RESET_SAFE_IF_INACCESSIBLEMEMONLY + old.reset_safe == ALWAYS_TRUE && return false + any_improved |= old.reset_safe != RESET_SAFE_IF_INACCESSIBLEMEMONLY + elseif new.reset_safe != old.reset_safe + return false + end if new.nothrow any_improved |= !old.nothrow elseif new.nothrow != old.nothrow @@ -276,6 +300,7 @@ function merge_effects(old::Effects, new::Effects) return Effects( merge_effectbits(old.consistent, new.consistent), merge_effectbits(old.effect_free, new.effect_free), + merge_effectbits(old.reset_safe, new.reset_safe), merge_effectbits(old.nothrow, new.nothrow), merge_effectbits(old.terminates, new.terminates), merge_effectbits(old.notaskstate, new.notaskstate), @@ -345,13 +370,15 @@ function encode_effects(e::Effects) ((e.inaccessiblememonly % UInt32) << 8) | ((e.noub % UInt32) << 10) | ((e.nonoverlayed % UInt32) << 12) | - ((e.nortcall % UInt32) << 14) + ((e.nortcall % UInt32) << 14) | + ((e.reset_safe % UInt32) << 15) end function decode_effects(e::UInt32) return Effects( UInt8((e >> 0) & 0x07), UInt8((e >> 3) & 0x03), + UInt8((e >> 15) & 0x03), Bool((e >> 5) & 0x01), Bool((e >> 6) & 0x01), Bool((e >> 7) & 0x01), diff --git a/Compiler/src/ssair/show.jl b/Compiler/src/ssair/show.jl index 2947a381be959..e8e732ec73af6 100644 --- a/Compiler/src/ssair/show.jl +++ b/Compiler/src/ssair/show.jl @@ -1062,7 +1062,7 @@ function show_ir(io::IO, compact::IncrementalCompact, config::IRShowConfig=defau finish_show_ir(io, uncompacted_cfg, config) end -function effectbits_letter(effects::Effects, name::Symbol, suffix::Char) +function effectbits_letter(effects::Effects, name::Symbol, suffix::Union{Char, String}) ft = fieldtype(Effects, name) if ft === UInt8 prefix = getfield(effects, name) === ALWAYS_TRUE ? 
'+' : @@ -1094,6 +1094,8 @@ function Base.show(io::IO, e::Effects) print(io, ',') printstyled(io, effectbits_letter(e, :effect_free, 'e'); color=effectbits_color(e, :effect_free)) print(io, ',') + printstyled(io, effectbits_letter(e, :reset_safe, "re"); color=effectbits_color(e, :reset_safe)) + print(io, ',') printstyled(io, effectbits_letter(e, :nothrow, 'n'); color=effectbits_color(e, :nothrow)) print(io, ',') printstyled(io, effectbits_letter(e, :terminates, 't'); color=effectbits_color(e, :terminates)) diff --git a/base/asyncevent.jl b/base/asyncevent.jl index fa9bb520cc855..6ac8e79b63ff8 100644 --- a/base/asyncevent.jl +++ b/base/asyncevent.jl @@ -183,7 +183,7 @@ function _trywait(t::Union{Timer, AsyncCondition}) set = t.set if !set && t.handle != C_NULL # wait for set or handle, but not the isopen flag iolock_end() - set = wait(t.cond) + set = wait(t.cond; waitee=t) unlock(t.cond) iolock_begin() lock(t.cond) @@ -192,7 +192,6 @@ function _trywait(t::Union{Timer, AsyncCondition}) unlock(t.cond) unpreserve_handle(t) end - @cancel_check() end iolock_end() end @@ -200,8 +199,13 @@ function _trywait(t::Union{Timer, AsyncCondition}) return set end +cancel_wait!(t::Union{Timer, AsyncCondition}, task::Task) = + cancel_wait!(t.cond, task, false; waitee=t) + function wait(t::Union{Timer, AsyncCondition}) - _trywait(t) || throw(EOFError()) + ok = _trywait(t) + @cancel_check + ok || throw(EOFError()) nothing end diff --git a/base/condition.jl b/base/condition.jl index fff9943966268..b4aead93faba3 100644 --- a/base/condition.jl +++ b/base/condition.jl @@ -82,13 +82,13 @@ islocked(c::GenericCondition) = islocked(c.lock) lock(f, c::GenericCondition) = lock(f, c.lock) # have waiter wait for c -function _wait2(c::GenericCondition, waiter::Task, first::Bool=false) +function _wait2(c::GenericCondition, waiter::Task, waitee=c, first::Bool=false) ct = current_task() assert_havelock(c) if first - pushfirst!(waitqueue(c), waiter) + pushfirst!(ILLRef(waitqueue(c), waitee), waiter) else - push!(waitqueue(c), waiter) + push!(ILLRef(waitqueue(c), waitee), waiter) end # since _wait2 is similar to schedule, we should observe the sticky bit now if waiter.sticky && Threads.threadid(waiter) == 0 && !GC.in_finalizer() @@ -130,9 +130,7 @@ function wait end macro cancel_check() quote local req = Core.cancellation_point!() - if req !== nothing - throw(conform_cancellation_request(req)) - end + req !== nothing && handle_cancellation!(req) end end @@ -144,9 +142,9 @@ Wait for [`notify`](@ref) on `c` and return the `val` parameter passed to `notif If the keyword `first` is set to `true`, the waiter will be put _first_ in line to wake up on `notify`. Otherwise, `wait` has first-in-first-out (FIFO) behavior. 
""" -function wait(c::GenericCondition; first::Bool=false, cancel_check::Bool=true) +function wait(c::GenericCondition; first::Bool=false, waitee=c) ct = current_task() - _wait2(c, ct, first) + _wait2(c, ct, waitee, first) token = unlockall(c.lock) try return wait() @@ -156,18 +154,30 @@ function wait(c::GenericCondition; first::Bool=false, cancel_check::Bool=true) finally relockall(c.lock, token) end - cancel_check && @cancel_check() end -function cancel_wait!(c::GenericCondition, t::Task) +function cancel_wait!(c::GenericCondition, t::Task; waitee = c) @assert (@atomic :monotonic t.cancellation_request) !== nothing lock(c) - if t.queue !== c + if t.queue !== waitee unlock(c) return false end - Base.list_deletefirst!(waitqueue(c), t) - enq_work(t) + Base.list_deletefirst!(ILLRef(waitqueue(c), waitee), t) + schedule(t, conform_cancellation_request(t.cancellation_request), error=true) + unlock(c) + return true +end + +function cancel_wait!(c::GenericCondition, t::Task, @nospecialize(val); waitee=c) + @assert (@atomic :monotonic t.cancellation_request) !== nothing + lock(c) + if t.queue !== waitee + unlock(c) + return false + end + Base.list_deletefirst!(ILLRef(waitqueue(c), waitee), t) + schedule(t, val) unlock(c) return true end diff --git a/base/experimental.jl b/base/experimental.jl index 2deb3bc76af6c..f722baf16dc21 100644 --- a/base/experimental.jl +++ b/base/experimental.jl @@ -646,7 +646,7 @@ millisecond. """ function wait_with_timeout(c::GenericCondition; first::Bool=false, timeout::Real=0.0) ct = current_task() - Base._wait2(c, ct, first) + Base._wait2(c, ct, c, first) token = Base.unlockall(c.lock) timer::Union{Timer, Nothing} = nothing diff --git a/base/linked_list.jl b/base/linked_list.jl index cf3be4115f73d..9df72622e7957 100644 --- a/base/linked_list.jl +++ b/base/linked_list.jl @@ -9,8 +9,9 @@ end struct ILLRef{T} list::IntrusiveLinkedList{T} - waitee::Any # Invariant: waitqueue(waitee) === list + waitee::Any # Invariant: waitqueue(waitee).list === list end +ILLRef(ref::ILLRef, @nospecialize(waitee)) = typeof(ref)(ref.list, waitee) waitqueue(list::IntrusiveLinkedList{T}) where {T} = ILLRef(list, list) #const list_append!! = append! @@ -128,6 +129,17 @@ function list_deletefirst!(qr::ILLRef{T}, val::T) where T return q end +function in(val::T, list::IntrusiveLinkedList{T}) where T + head = list.head + while head !== nothing + if val === head + return true + end + head = head.next + end + return false +end + # TODO: Delete this compatibility wrapper list_deletefirst!(q::IntrusiveLinkedList{T}, val::T) where T = list_deletefirst!(ILLRef(q, q), val) push!(q::IntrusiveLinkedList{T}, val::T) where T = push!(ILLRef(q, q), val) diff --git a/base/task.jl b/base/task.jl index 077d571a5fb6f..ae1f9f4b3ff74 100644 --- a/base/task.jl +++ b/base/task.jl @@ -145,9 +145,11 @@ end # task states -const task_state_runnable = UInt8(0) -const task_state_done = UInt8(1) -const task_state_failed = UInt8(2) +const task_state_runnable = UInt8(0) +const task_state_done = UInt8(1) +const task_state_failed = UInt8(2) +# like _failed, but allows schedule to succeed +const task_state_cancelled = UInt8(3) @inline function getproperty(t::Task, field::Symbol) if field === :state @@ -159,6 +161,8 @@ const task_state_failed = UInt8(2) return :done elseif st === task_state_failed return :failed + elseif st === task_state_cancelled + return :cancelled else @assert false end @@ -250,7 +254,10 @@ true !!! compat "Julia 1.3" This function requires at least Julia 1.3. 
""" -istaskfailed(t::Task) = ((@atomic :acquire t._state) === task_state_failed) +function istaskfailed(t::Task) + state = (@atomic :acquire t._state) + return state === task_state_failed || state === task_state_cancelled +end Threads.threadid(t::Task) = Int(ccall(:jl_get_task_tid, Int16, (Any,), t)+1) function Threads.threadpool(t::Task) @@ -309,7 +316,7 @@ function _wait(t::Task; expected_cancellation = nothing) lock(donenotify) try while !istaskdone(t) && cancellation_request() === expected_cancellation - wait(donenotify; cancel_check=false) + wait(donenotify; waitee=t) end finally unlock(donenotify) @@ -317,6 +324,7 @@ function _wait(t::Task; expected_cancellation = nothing) end nothing end +cancel_wait!(waitee::Task, waiter::Task) = cancel_wait!(waitee.donenotify, waiter, nothing; waitee) # have `waiter` wait for `t` function _wait2(t::Task, waiter::Task) @@ -973,7 +981,17 @@ const Workqueue = Workqueues[1] # default work queue is thread 1 // TODO: deprec workqueue_for(tid::Int) = Workqueues[tid] function enq_work(t::Task) - (t._state === task_state_runnable && t.queue === nothing) || error("schedule: Task not runnable") + state = t._state + if state === task_state_cancelled + # When canelled, we allow `enq_work`, but simply transition to failed state. + # All other task cleanup is already done. + state = (@atomicreplace t._state task_state_cancelled => task_state_failed).old + # Catch double `schedule` calls on cancelled tasks. + state === task_state_cancelled && return + end + if !(state === task_state_runnable && t.queue === nothing) + error("schedule: Task not runnable") + end _enq_work(t) end @@ -1066,18 +1084,19 @@ true """ function schedule(t::Task, @nospecialize(arg); error=false) # schedule a task to be (re)started with the given value or exception - t._state === task_state_runnable || Base.error("schedule: Task not runnable") - if error - q = t.queue; q === nothing || list_deletefirst!(q::IntrusiveLinkedList{Task}, t) - setfield!(t, :result, arg) - setfield!(t, :_isexception, true) - else - t.queue === nothing || Base.error("schedule: Task not runnable") - setfield!(t, :result, arg) + state = t._state + if t._state === task_state_runnable + if error + q = t.queue; q === nothing || list_deletefirst!(q::IntrusiveLinkedList{Task}, t) + setfield!(t, :result, arg) + setfield!(t, :_isexception, true) + else + t.queue === nothing || Base.error("schedule: Task not runnable") + setfield!(t, :result, arg) + end end # [task] created -scheduled-> wait_time - maybe_record_enqueued!(t) - enq_work(t) + schedule(t) return t end @@ -1365,6 +1384,17 @@ internal tasks. Such tasks will be frozen and become unschedulable in the future """ const CANCEL_REQUEST_ABANDON_ALL = CancellationRequest(0x4) +""" + CANCEL_REQUEST_YIELD + +Request that the task yield to the scheduler at the next cancellation point to +allow another task to run its cancellation propagation logic. The cancelled task +itself will reset to ordinary operation before yielding, but may of course be +canceled by said other task before it resumes operation. 
+""" +const CANCEL_REQUEST_YIELD = CancellationRequest(0x5) + + function Base.showerror(io::IO, cr::CancellationRequest) print(io, "CancellationRequest: ") if cr === CANCEL_REQUEST_SAFE @@ -1381,6 +1411,18 @@ function conform_cancellation_request(@nospecialize(cr)) return cr end +# This is the slow path of @cancel_check +@noinline function handle_cancellation!(@nospecialize(_req)) + req = conform_cancellation_request(_req) + if req === CANCEL_REQUEST_YIELD + @atomicreplace :sequentially_consistent :monotonic current_task().cancellation_request _req => nothing + yield() + req = cancellation_request() + end + req === nothing && return + throw(req) +end + """ cancellation_request() @@ -1407,12 +1449,55 @@ when a cancellation is requested. Core.cancellation_point! function cancel!(t::Task, crequest=CANCEL_REQUEST_SAFE) + # TODO: Raise task priority @atomic :release t.cancellation_request = crequest + # TODO: SYS_membarrier() ? + # Special case: If the task hasn't started yet at this point, we want to set + # it up to cancel any waits, but we need to be a bit careful with concurrent + # starts of the task. + if !istaskstarted(t) + t.result = crequest + t._isexception = true + if (@atomicreplace :sequentially_consistent :monotonic t._state task_state_runnable => task_state_cancelled).success + lock(t.donenotify) + notify(t.donenotify) + unlock(t.donenotify) + end + return + end while !istaskdone(t) waitee = t.queue waitee === nothing && (yield(); continue) invokelatest(cancel_wait!, waitee, t) && break end + if t.sticky + # If this task is sticky, it won't be able to run if the task currently + # running on its thread is blocking. Use the cancellation mechanism to + # try and pre-empt that task. + # N.B.: This is a best-effort attempt; the task we end up with may get + # descheduled before we get around to cancelling it. However, that's + # fine - it's not a correctness issue to deschedule the task. The + # important thing is that the thread re-enter the scheduler to pick up + # our cancelled task. + # In the future, we may want to use the same mechanism for more general + # pre-emption, but this helps avoid situations where tasks that have + # cancellation points, but no yield points become uncancellable. + tid = Threads.threadid(t) + if tid != 0 + ccall(:jl_preempt_thread_task, Cvoid, (Int16,), (tid - 1) % Int16) + end + end +end + +function cancel_wait!(q::StickyWorkqueue, t::Task) + # Tasks in the workqueue are runnable - we do not cancel the wait, + # but we do need to check whether it's in there + lock(q.lock) + try + return (t in q.queue) + finally + unlock(q.lock) + end end """ diff --git a/src/codegen.cpp b/src/codegen.cpp index c193da2c436d6..eb3a7912ee081 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -4969,7 +4969,7 @@ static bool emit_builtin_call(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f, // 2. 
Check if cr == NULL || cr == jl_nothing Value *is_null = ctx.builder.CreateIsNull(cr_relaxed); - Value *nothing_val = literal_pointer_val(ctx, jl_nothing); + Value *nothing_val = track_pjlvalue(ctx, literal_pointer_val(ctx, jl_nothing)); Value *is_nothing = ctx.builder.CreateICmpEQ(decay_derived(ctx, cr_relaxed), decay_derived(ctx, nothing_val)); Value *no_cancel = ctx.builder.CreateOr(is_null, is_nothing); diff --git a/src/signal-handling.c b/src/signal-handling.c index 39eb3ceedd53b..f555bd475755d 100644 --- a/src/signal-handling.c +++ b/src/signal-handling.c @@ -453,8 +453,13 @@ JL_DLLEXPORT void jl_set_sigint_cond(uv_async_t *cond) static void deliver_sigint_notification(void) { if (JL_TRYLOCK_NOGC(&sigint_cond_lock)) { - if (sigint_cond_loc != NULL) + if (sigint_cond_loc != NULL) { uv_async_send(sigint_cond_loc); + // IO only runs on one thread, which may currently be busy - try + // to preempt it, so that the IO loop has a chance to run and deliver + // this notification. + jl_preempt_thread_task(jl_atomic_load_relaxed(&io_loop_tid)); + } JL_UNLOCK_NOGC(&sigint_cond_lock); } } diff --git a/src/task.c b/src/task.c index 68ede8ff55e90..dbf98d3450a6e 100644 --- a/src/task.c +++ b/src/task.c @@ -1210,6 +1210,17 @@ void jl_init_tasks(void) JL_GC_DISABLED static void NOINLINE JL_NORETURN _start_task(void); #endif +static void NOINLINE _handle_start_task_cancellation(jl_value_t *creq) +{ + jl_value_t *cancel_handler = jl_get_global(jl_base_module, jl_symbol("handle_cancellation!")); + if (!cancel_handler) { + jl_safe_printf("Task cancellation requested but Base.handle_cancellation! is not defined\n"); + jl_exit(1); + } + jl_value_t *fargs[2] = { cancel_handler, creq }; + jl_apply(fargs, 2); +} + static void NOINLINE JL_NORETURN JL_NO_ASAN start_task(void) { CFI_NORETURN @@ -1270,6 +1281,12 @@ CFI_NORETURN jl_sigint_safepoint(ptls); } JL_TIMING(ROOT, ROOT); + for (;;) { + jl_value_t *creq = jl_atomic_load_relaxed(&ct->cancellation_request); + if (creq == jl_nothing) + break; + _handle_start_task_cancellation(creq); + } res = jl_apply(&ct->start, 1); } JL_CATCH { @@ -1663,6 +1680,13 @@ JL_DLLEXPORT int8_t jl_get_task_threadpoolid(jl_task_t *t) return t->threadpoolid; } +JL_DLLEXPORT void jl_preempt_thread_task(int16_t tid) +{ + jl_task_t *task = jl_atomic_load_relaxed(&jl_all_tls_states[tid]->current_task); + jl_value_t *expected = jl_nothing; + // If the task is already being cancelled, that's good enough for preemption + jl_atomic_cmpswap(&task->cancellation_request, &expected, jl_box_uint8(0x5)); +} #ifdef _OS_WINDOWS_ #if defined(_CPU_X86_) diff --git a/test/cancellation.jl b/test/cancellation.jl new file mode 100644 index 0000000000000..baa9f6e7afa46 --- /dev/null +++ b/test/cancellation.jl @@ -0,0 +1,78 @@ +const collatz_code = quote + collatz(n) = (n & 1) == 1 ? 
(3n + 1) : (nรท2) + function find_collatz_counterexample() + i = 1 + while true + j = i + while true + @Base.cancel_check + j = collatz(j) + j == 1 && break + j == i && error("$j is a collatz counterexample") + end + i += 1 + end + end + function find_collatz_counterexample_inner() + i = 1 + while true + j = i + while true + j = collatz(j) + j == 1 && break + j == i && return j + end + i += 1 + end + end + function find_collatz_counterexample2() + @Base.cancel_check + return find_collatz_counterexample_inner() + end +end +eval(collatz_code) + +@testset "cancellation" begin + # Test cancellation point for a task that has never started running + @test_throws Base.CancellationRequest wait(@task nothing) + + + # Simple cancellation of `sleep` + t = @async sleep(10000) + yield(); yield(); yield() # Give the task a chance to start + Base.cancel!(t) + @test_throws Base.CancellationRequest wait(t) + + # Simple cancellation of compute-bound function + t = @Threads.spawn(find_collatz_counterexample()) + yield(); yield(); yield() # Give the task a chance to start + Base.cancel!(t) + @test_throws Base.CancellationRequest wait(t) + + # Test cancellation of @sync + t = @async @sync begin + @async sleep(10000) + @async sleep(10000) + end + yield(); yield(); yield() # Give the task a chance to start + Base.cancel!(t) + @test_throws Base.CompositeException wait(t) +end + +@testset "^C" begin + # This exercises ^C needing to preempt a compute-bound task + test_code = :(try; + @sync ((@async sleep(10000)); @async find_collatz_counterexample()) + catch e + println(typeof(e)) + end) + @test read(`$(Base.julia_cmd()) -e '$(string(collatz_code)); $test_code'`, String) == "CompositeException" + + # Make sure that preemption doesn't cause problems if all tasks are blocked + test_code = :(try; + @sync ((@async sleep(10000)); @async sleep(10000)) + catch e + println(typeof(e)) + end) + @test read(`$(Base.julia_cmd()) -e '$(string(collatz_code)); $test_code'`, String) == "CompositeException" +end From e8ea4a7a5e94d59de9be7a4d1c073e415a3cef7e Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Thu, 4 Dec 2025 01:03:24 +0000 Subject: [PATCH 5/9] Claude's compiler support WIP --- AGENTS.md | 14 + Compiler/src/effects.jl | 3 + Compiler/src/optimize.jl | 14 +- base/task.jl | 29 +- src/Makefile | 1 + src/codegen.cpp | 29 ++ src/julia.h | 2 + src/julia_internal.h | 1 + src/llvm-cancellation-lowering.cpp | 311 ++++++++++++++++++ src/llvm-julia-passes.inc | 1 + src/llvm-late-gc-lowering.cpp | 2 +- src/passes.h | 5 + src/pipeline.cpp | 1 + src/signals-unix.c | 30 ++ src/signals-win.c | 9 + test/cancellation.jl | 2 +- .../cancellation-lowering-codegen.jl | 26 ++ test/llvmpasses/cancellation-lowering.ll | 82 +++++ 18 files changed, 546 insertions(+), 16 deletions(-) create mode 100644 src/llvm-cancellation-lowering.cpp create mode 100644 test/llvmpasses/cancellation-lowering-codegen.jl create mode 100644 test/llvmpasses/cancellation-lowering.ll diff --git a/AGENTS.md b/AGENTS.md index d81defa16f136..f4f934eec9b4b 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -27,6 +27,20 @@ If you made changes to the runtime (any files in `src/`), you will need to rebui julia. Run `make -j` to rebuild julia. This process may take up to 10 minutes depending on your changes. +### Testing LLVM-related changes + +When making changes to LLVM passes or codegen, add `LLVM_ASSERTIONS=1` to `Make.user` to enable +LLVM assertions. 
This helps catch IR verification errors early: + +```bash +echo "LLVM_ASSERTIONS=1" >> Make.user +``` + +To run LLVM pass tests: +```bash +make -C test/llvmpasses .ll +``` + After making changes, run static analysis checks: - First run `make -C src install-analysis-deps` to initialize dependencies (only needed once the first time). - Run `make -C src analyze- --output-sync -j8` (replace `` with the basename of any C or C++ file you modified, excluding headers). diff --git a/Compiler/src/effects.jl b/Compiler/src/effects.jl index 4879a9981524a..915f956d824a2 100644 --- a/Compiler/src/effects.jl +++ b/Compiler/src/effects.jl @@ -320,6 +320,7 @@ merge_effectbits(old::Bool, new::Bool) = old & new is_consistent(effects::Effects) = effects.consistent === ALWAYS_TRUE is_effect_free(effects::Effects) = effects.effect_free === ALWAYS_TRUE +is_reset_safe(effects::Effects) = effects.reset_safe === ALWAYS_TRUE is_nothrow(effects::Effects) = effects.nothrow is_terminates(effects::Effects) = effects.terminates is_notaskstate(effects::Effects) = effects.notaskstate @@ -356,6 +357,8 @@ is_consistent_if_inaccessiblememonly(effects::Effects) = !iszero(effects.consist is_effect_free_if_inaccessiblememonly(effects::Effects) = !iszero(effects.effect_free & EFFECT_FREE_IF_INACCESSIBLEMEMONLY) +is_reset_safe_if_inaccessiblememonly(effects::Effects) = !iszero(effects.reset_safe & RESET_SAFE_IF_INACCESSIBLEMEMONLY) + is_inaccessiblemem_or_argmemonly(effects::Effects) = effects.inaccessiblememonly === INACCESSIBLEMEM_OR_ARGMEMONLY is_consistent_overlay(effects::Effects) = effects.nonoverlayed === CONSISTENT_OVERLAY diff --git a/Compiler/src/optimize.jl b/Compiler/src/optimize.jl index b5704c488d273..7d3d9a1dacdef 100644 --- a/Compiler/src/optimize.jl +++ b/Compiler/src/optimize.jl @@ -39,6 +39,9 @@ const IR_FLAG_NOUB = one(UInt32) << 10 #const IR_FLAG_CONSISTENTOVERLAY = one(UInt32) << 12 # This statement is :nortcall const IR_FLAG_NORTCALL = one(UInt32) << 13 +# This statement is proven :reset_safe +const IR_FLAG_RESET_SAFE = one(UInt32) << 14 +# Reserved: one(UInt32) << 15 used for RSIIMO below # An optimization pass has updated this statement in a way that may # have exposed information that inference did not see. Re-running # inference on this statement may be profitable. 
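+# Illustrative only: ssa flags are packed into a UInt32 bitfield, so a statement proven
+# both :effect_free and :reset_safe simply carries both bits, and compound masks require
+# all of their bits to be present:
+#
+#     flags = IR_FLAG_EFFECT_FREE | IR_FLAG_RESET_SAFE
+#     has_flag(flags, IR_FLAG_RESET_SAFE)   # true
+#     has_flag(flags, IR_FLAGS_REMOVABLE)   # false: :nothrow/:terminates bits are unset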
@@ -50,16 +53,18 @@ const IR_FLAG_UNUSED = one(UInt32) << 17 const IR_FLAG_EFIIMO = one(UInt32) << 18 # This statement is :inaccessiblememonly == INACCESSIBLEMEM_OR_ARGMEMONLY const IR_FLAG_INACCESSIBLEMEM_OR_ARGMEM = one(UInt32) << 19 +# This statement is :reset_safe == RESET_SAFE_IF_INACCESSIBLEMEMONLY +const IR_FLAG_RSIIMO = one(UInt32) << 20 const NUM_IR_FLAGS = 3 # sync with julia.h const IR_FLAGS_EFFECTS = IR_FLAG_CONSISTENT | IR_FLAG_EFFECT_FREE | IR_FLAG_NOTHROW | - IR_FLAG_TERMINATES | IR_FLAG_NOUB | IR_FLAG_NORTCALL + IR_FLAG_TERMINATES | IR_FLAG_NOUB | IR_FLAG_NORTCALL | IR_FLAG_RESET_SAFE const IR_FLAGS_REMOVABLE = IR_FLAG_EFFECT_FREE | IR_FLAG_NOTHROW | IR_FLAG_TERMINATES -const IR_FLAGS_NEEDS_EA = IR_FLAG_EFIIMO | IR_FLAG_INACCESSIBLEMEM_OR_ARGMEM +const IR_FLAGS_NEEDS_EA = IR_FLAG_EFIIMO | IR_FLAG_INACCESSIBLEMEM_OR_ARGMEM | IR_FLAG_RSIIMO has_flag(curr::UInt32, flag::UInt32) = (curr & flag) == flag @@ -79,6 +84,11 @@ function flags_for_effects(effects::Effects) elseif is_effect_free_if_inaccessiblememonly(effects) flags |= IR_FLAG_EFIIMO end + if is_reset_safe(effects) + flags |= IR_FLAG_RESET_SAFE + elseif is_reset_safe_if_inaccessiblememonly(effects) + flags |= IR_FLAG_RSIIMO + end if is_nothrow(effects) flags |= IR_FLAG_NOTHROW end diff --git a/base/task.jl b/base/task.jl index ae1f9f4b3ff74..70b2e258de06f 100644 --- a/base/task.jl +++ b/base/task.jl @@ -1323,7 +1323,7 @@ struct CancellationRequest end """ - CANCEL_REQUEST_SAFE + CANCEL_REQUEST_SAFE Request safe cancelation of the current task. If the task is waiting for any other resources, it will request safe cancellation of any such resources and @@ -1337,7 +1337,7 @@ should be tried first. const CANCEL_REQUEST_SAFE = CancellationRequest(0x0) """ - CANCEL_REQUEST_ACK + CANCEL_REQUEST_ACK Set by the task itself to indicate that a (safe) cancellation request was received and acknowledged, but that there are dependent tasks for whom @@ -1346,20 +1346,20 @@ cancelation is still pending. const CANCEL_REQUEST_ACK = CancellationRequest(0x1) """ - CANCEL_REQUEST_QUERY + CANCEL_REQUEST_QUERY Request that the system create an asynchronous report of why the task is currently not able to be canceled. The report will be provided in the ->cancelation_request field of the current task (as long as this field is still CANCEL_REQUEST_QUERY). N.B.: Transition to CANCEL_REQUEST_QUERY is only allowed from CANCEL_REQUEST_ACK. - Once the waiting task has read the cancelation report, it may set the cancelation - request back to CANCEL_REQUEST_ACK. + Once the waiting task has read the cancelation report, it may set the cancelation + request back to CANCEL_REQUEST_ACK. """ const CANCEL_REQUEST_QUERY = CancellationRequest(0x2) """ - CANCEL_REQUEST_ABANDON_EXTERNAL + CANCEL_REQUEST_ABANDON_EXTERNAL Request a cancelation that will cease waiting for any external resources (e.g. I/O objects) without going through a safe cancelation procedure for such resources. However, the @@ -1371,21 +1371,21 @@ I/O is often engineered for robustness in case of sudden disapperance of peers const CANCEL_REQUEST_ABANDON_EXTERNAL = CancellationRequest(0x3) """ - CANCEL_REQUEST_ABANDON_ALL + CANCEL_REQUEST_ABANDON_ALL Request a cancelation that will cease waiting for all external resources and all unacknowledged internal tasks. Such tasks will be frozen and become unschedulable in the future. !!! 
warning - If any canceled task has acquired locks or other resources that are contested, this method of - cancelation may leak such resources and create deadlocks in future code. It is intended as a - last-resort method to recover a system, but the necessity of this operation should in general - be considered a bug (e.g. due to insufficient cancellation points in computationally-heavy code). + If any canceled task has acquired locks or other resources that are contested, this method of + cancelation may leak such resources and create deadlocks in future code. It is intended as a + last-resort method to recover a system, but the necessity of this operation should in general + be considered a bug (e.g. due to insufficient cancellation points in computationally-heavy code). """ const CANCEL_REQUEST_ABANDON_ALL = CancellationRequest(0x4) """ - CANCEL_REQUEST_YIELD + CANCEL_REQUEST_YIELD Request that the task yield to the scheduler at the next cancellation point to allow another task to run its cancellation propagation logic. The cancelled task @@ -1465,6 +1465,11 @@ function cancel!(t::Task, crequest=CANCEL_REQUEST_SAFE) end return end + # Try to interrupt the task if it's at a cancellation point (has reset_ctx set) + tid = Threads.threadid(t) + if tid != 0 + ccall(:jl_send_cancellation_signal, Cvoid, (Int16,), (tid - 1) % Int16) + end while !istaskdone(t) waitee = t.queue waitee === nothing && (yield(); continue) diff --git a/src/Makefile b/src/Makefile index 832f0b0ae71e4..11d17718785d9 100644 --- a/src/Makefile +++ b/src/Makefile @@ -84,6 +84,7 @@ CODEGEN_SRCS := codegen jitlayers aotcompile debuginfo disasm llvm-simdloop \ llvm-pass-helpers llvm-ptls llvm-propagate-addrspaces \ llvm-multiversioning llvm-alloc-opt llvm-alloc-helpers cgmemmgr llvm-remove-addrspaces \ llvm-remove-ni llvm-julia-licm llvm-demote-float16 llvm-cpufeatures llvm-expand-atomic-modify \ + llvm-cancellation-lowering \ pipeline llvm_api \ $(GC_CODEGEN_SRCS) FLAGS_COMMON += -I$(shell $(LLVM_CONFIG_HOST) --includedir) diff --git a/src/codegen.cpp b/src/codegen.cpp index eb3a7912ee081..4a019c8412c8e 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -1900,6 +1900,7 @@ class jl_codectx_t { int nargs = 0; int nvargs = -1; bool is_opaque_closure = false; + ssize_t current_stmt_idx = -1; // current statement index for ssaflags lookup Value *pgcstack = NULL; Instruction *topalloca = NULL; @@ -2085,6 +2086,28 @@ static Value *emit_ptrgep(jl_codectx_t &ctx, Value *base, size_t byte_offset, co return gep; } +// Check if the current statement has the reset_safe flag set +static bool current_stmt_is_reset_safe(jl_codectx_t &ctx) +{ + if (ctx.current_stmt_idx < 0 || ctx.source == nullptr || ctx.source->ssaflags == nullptr) + return false; + size_t nstmts = jl_array_dim0(ctx.source->ssaflags); + if ((size_t)ctx.current_stmt_idx >= nstmts) + return false; + uint32_t flag = jl_array_data(ctx.source->ssaflags, uint32_t)[ctx.current_stmt_idx]; + return (flag & IR_FLAG_RESET_SAFE) != 0; +} + +// Mark a call instruction with reset_safe metadata if the current statement has the flag +static void mark_reset_safe(jl_codectx_t &ctx, CallInst *call) +{ + if (call && current_stmt_is_reset_safe(ctx)) { + LLVMContext &llvmctx = ctx.builder.getContext(); + MDNode *md = MDNode::get(llvmctx, None); + call->setMetadata("julia.reset_safe", md); + } +} + static Value *emit_ptrgep(jl_codectx_t &ctx, Value *base, Value *byte_offset, const Twine &Name="") { auto *gep = ctx.builder.CreateInBoundsGEP(getInt8Ty(ctx.builder.getContext()), base, byte_offset, 
Name); @@ -5027,6 +5050,8 @@ static CallInst *emit_jlcall(jl_codectx_t &ctx, Value *theFptr, Value *theF, } CallInst *result = ctx.builder.CreateCall(TheTrampoline, theArgs); result->setAttributes(TheTrampoline->getAttributes()); + // Mark as reset_safe if the current statement has that flag + mark_reset_safe(ctx, result); // TODO: we could add readonly attributes in many cases to the args return result; } @@ -5141,6 +5166,8 @@ static jl_cgval_t emit_call_specfun_other(jl_codectx_t &ctx, bool is_opaque_clos call->setAttributes(returninfo.attrs); if (gcstack_arg && ctx.emission_context.use_swiftcc) call->setCallingConv(CallingConv::Swift); + // Mark as reset_safe if the current statement has that flag + mark_reset_safe(ctx, call); jl_cgval_t retval; switch (returninfo.cc) { @@ -9467,7 +9494,9 @@ static jl_llvm_functions_t } } else { + ctx.current_stmt_idx = cursor; emit_stmtpos(ctx, stmt, cursor); + ctx.current_stmt_idx = -1; mallocVisitStmt(nullptr, have_dbg_update); } find_next_stmt(cursor + 1); diff --git a/src/julia.h b/src/julia.h index 9513dfb47941d..18d549d70d20d 100644 --- a/src/julia.h +++ b/src/julia.h @@ -2324,6 +2324,8 @@ JL_DLLEXPORT jl_task_t *jl_new_task(jl_value_t*, jl_value_t*, size_t); JL_DLLEXPORT void jl_switchto(jl_task_t **pt); JL_DLLEXPORT int jl_set_task_tid(jl_task_t *task, int16_t tid) JL_NOTSAFEPOINT; JL_DLLEXPORT int jl_set_task_threadpoolid(jl_task_t *task, int8_t tpid) JL_NOTSAFEPOINT; +JL_DLLEXPORT void jl_preempt_thread_task(int16_t tid); +JL_DLLEXPORT void jl_send_cancellation_signal(int16_t tid); JL_DLLEXPORT void JL_NORETURN jl_throw(jl_value_t *e JL_MAYBE_UNROOTED); JL_DLLEXPORT void JL_NORETURN jl_rethrow(void); JL_DLLEXPORT void JL_NORETURN jl_rethrow_other(jl_value_t *e JL_MAYBE_UNROOTED); diff --git a/src/julia_internal.h b/src/julia_internal.h index 9a85d1086d694..c2e6aa7634983 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -2087,6 +2087,7 @@ JL_DLLEXPORT uint32_t jl_crc32c(uint32_t crc, const char *buf, size_t len); // -- exports from codegen -- // #define IR_FLAG_INBOUNDS 0x01 +#define IR_FLAG_RESET_SAFE (1 << 14) JL_DLLIMPORT void jl_generate_fptr_for_unspecialized(jl_code_instance_t *unspec); JL_DLLIMPORT int jl_compile_codeinst(jl_code_instance_t *unspec); diff --git a/src/llvm-cancellation-lowering.cpp b/src/llvm-cancellation-lowering.cpp new file mode 100644 index 0000000000000..9d589f59045eb --- /dev/null +++ b/src/llvm-cancellation-lowering.cpp @@ -0,0 +1,311 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +// This pass lowers the julia.cancellation_point intrinsic to: +// 1. A stack buffer allocation for the jl_ucontext_t +// 2. A setjmp call on the uc_mcontext field +// 3. Assignment of the buffer address to task->reset_ctx (atomic release) +// +// It also walks the function to find stores/calls without julia.reset_safe +// metadata and inserts reset_ctx = NULL before them. 
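+//
+// Rough sketch of the intended transformation (not the literal IR this pass emits):
+//
+//   before:  %r = call i32 @julia.cancellation_point()
+//
+//   after:   %cancel_ucontext = alloca [sizeof(_jl_ucontext_t) x i8]
+//            store atomic release ptr %cancel_ucontext, ptr %reset_ctx_ptr
+//            %r = call i32 @setjmp(ptr %cancel_ucontext)   ; returns_twice
+//
+// and, before every store/call that lacks !julia.reset_safe metadata (and before each
+// return, since the buffer is stack-allocated):
+//
+//            store atomic release ptr null, ptr %reset_ctx_ptr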
+ +#include "llvm-version.h" +#include "passes.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "llvm-codegen-shared.h" +#include "llvm-pass-helpers.h" +#include "julia.h" +#include "julia_internal.h" +#include "julia_threads.h" + +#define DEBUG_TYPE "cancellation_lowering" + +STATISTIC(CancellationPointsLowered, "Number of cancellation points lowered"); +STATISTIC(ResetCtxClearsInserted, "Number of reset_ctx clears inserted"); + +using namespace llvm; + +// Check if an instruction has the julia.reset_safe metadata +static bool hasResetSafeMetadata(Instruction *I) { + return I->getMetadata("julia.reset_safe") != nullptr; +} + +struct CancellationLowering { + Function *cancel_point_func; + Value *pgcstack; + Value *reset_ctx_ptr; // Computed once in entry block, dominates all uses + + CancellationLowering(Module &M) : cancel_point_func(nullptr), pgcstack(nullptr), reset_ctx_ptr(nullptr) { + cancel_point_func = M.getFunction("julia.cancellation_point"); + } + + bool runOnFunction(Function &F); + +private: + // Compute reset_ctx_ptr once in entry block + // If insertAfter is provided, insert after that instruction + // Otherwise insert at the beginning of the entry block (after allocas) + void computeResetCtxPtr(Function &F, Instruction *insertAfter); +}; + +void CancellationLowering::computeResetCtxPtr(Function &F, Instruction *insertAfter) { + if (!pgcstack) + return; + + LLVMContext &LLVMCtx = F.getContext(); + Type *I8Ty = Type::getInt8Ty(LLVMCtx); + Type *I64Ty = Type::getInt64Ty(LLVMCtx); + + IRBuilder<> Builder(LLVMCtx); + if (insertAfter) { + // Insert right after pgcstack call + Builder.SetInsertPoint(insertAfter->getNextNode()); + } else { + // pgcstack is an argument, insert at the start of entry block + // but after any allocas + BasicBlock &entry = F.getEntryBlock(); + BasicBlock::iterator insertPt = entry.begin(); + while (insertPt != entry.end() && isa(&*insertPt)) { + ++insertPt; + } + Builder.SetInsertPoint(&entry, insertPt); + } + + // Get the offset of gcstack in jl_task_t + size_t gcstack_offset = offsetof(jl_task_t, gcstack); + + Value *task_ptr = Builder.CreateGEP(I8Ty, pgcstack, + ConstantInt::get(I64Ty, -(int64_t)gcstack_offset), + "current_task"); + + // Get pointer to reset_ctx field in current task + size_t reset_ctx_offset = offsetof(jl_task_t, reset_ctx); + + reset_ctx_ptr = Builder.CreateGEP(I8Ty, task_ptr, + ConstantInt::get(I64Ty, reset_ctx_offset), + "reset_ctx_ptr"); +} + +bool CancellationLowering::runOnFunction(Function &F) { + // Skip if there's no cancellation_point function in the module + if (!cancel_point_func) + return false; + + bool Changed = false; + + // Find pgcstack - either as a call to julia.get_pgcstack or as an argument with "gcstack" attribute + pgcstack = nullptr; + reset_ctx_ptr = nullptr; + Instruction *pgcstack_inst = nullptr; // Only set if pgcstack is from a call, not an argument + Function *pgcstack_getter = F.getParent()->getFunction("julia.get_pgcstack"); + Function *adoptthread_func = F.getParent()->getFunction("julia.get_pgcstack_or_new"); + if (pgcstack_getter || adoptthread_func) { + for (auto &I : F.getEntryBlock()) { + if (CallInst *callInst = dyn_cast(&I)) { + Value *callee = callInst->getCalledOperand(); + if ((pgcstack_getter && callee == pgcstack_getter) || + (adoptthread_func && callee == adoptthread_func)) { + pgcstack = callInst; + pgcstack_inst = callInst; + break; + } + } + } + } + // If not found via call, check for argument with "gcstack" attribute + if (!pgcstack) { 
+ for (auto &arg : F.args()) { + AttributeSet attrs = F.getAttributes().getParamAttrs(arg.getArgNo()); + if (attrs.hasAttribute("gcstack")) { + pgcstack = &arg; + break; + } + } + } + + // First, find all cancellation_point intrinsics + SmallVector CancellationPoints; + + for (auto &BB : F) { + for (auto &I : BB) { + if (auto *CI = dyn_cast(&I)) { + Value *callee = CI->getCalledOperand(); + if (callee && callee == cancel_point_func) { + CancellationPoints.push_back(CI); + } + } + } + } + + if (CancellationPoints.empty()) { + return false; + } + + // Compute reset_ctx_ptr once in entry block (dominates all uses) + // pgcstack_inst is set when pgcstack comes from a call; null when it's an argument + computeResetCtxPtr(F, pgcstack_inst); + + // Lower each cancellation point + for (CallInst *CI : CancellationPoints) { + ++CancellationPointsLowered; + Changed = true; + + IRBuilder<> Builder(CI); + LLVMContext &LLVMCtx = F.getContext(); + Type *I8Ty = Type::getInt8Ty(LLVMCtx); + Type *I32Ty = Type::getInt32Ty(LLVMCtx); + Type *PtrTy = PointerType::getUnqual(LLVMCtx); + + if (!reset_ctx_ptr) { + // Can't lower without access to the task, just remove the intrinsic + CI->replaceAllUsesWith(ConstantInt::get(I32Ty, 0)); + CI->eraseFromParent(); + continue; + } + + // Allocate a _jl_ucontext_t on the stack + const size_t UContextSize = sizeof(_jl_ucontext_t); + const size_t UContextAlign = alignof(_jl_ucontext_t); + + // Create the alloca at the start of the function + IRBuilder<> AllocaBuilder(&F.getEntryBlock().front()); + Type *UContextTy = ArrayType::get(I8Ty, UContextSize); + AllocaInst *UContextBuf = AllocaBuilder.CreateAlloca(UContextTy, nullptr, "cancel_ucontext"); + UContextBuf->setAlignment(Align(UContextAlign)); + + // Store the ucontext address to reset_ctx with atomic release ordering + StoreInst *store = Builder.CreateAlignedStore(UContextBuf, reset_ctx_ptr, Align(sizeof(void*))); + store->setOrdering(AtomicOrdering::Release); + + // Call setjmp on the uc_mcontext field (which is at offset 0 of the struct) + // Use the platform-specific setjmp function name defined in julia.h + FunctionType *SetjmpTy = FunctionType::get(I32Ty, {PtrTy}, false); + FunctionCallee SetjmpFn = F.getParent()->getOrInsertFunction(jl_setjmp_name, SetjmpTy); + + CallInst *SetjmpCall = Builder.CreateCall(SetjmpFn, {UContextBuf}); + SetjmpCall->addFnAttr(Attribute::ReturnsTwice); + + // Replace uses and remove the intrinsic + CI->replaceAllUsesWith(SetjmpCall); + CI->eraseFromParent(); + } + + // Now walk the function to find stores/calls without reset_safe metadata + // and insert reset_ctx = NULL before them + SmallVector UnsafePoints; + + // We need to skip instructions that are part of our setup (pgcstack, task, reset_ctx_ptr) + // since they occur before reset_ctx_ptr is available + Instruction *reset_ctx_ptr_inst = dyn_cast_or_null(reset_ctx_ptr); + + for (auto &BB : F) { + bool past_setup = (&BB != &F.getEntryBlock()); + for (auto &I : BB) { + // In the entry block, skip instructions until after reset_ctx_ptr is defined + if (!past_setup) { + if (&I == reset_ctx_ptr_inst) { + past_setup = true; + } + continue; + } + + // Check for stores (but skip the reset_ctx stores we just created) + if (auto *SI = dyn_cast(&I)) { + if (!hasResetSafeMetadata(SI)) { + // Skip atomic stores (including reset_ctx stores we just created) + if (SI->isAtomic()) + continue; + UnsafePoints.push_back(SI); + } + } + // Check for calls (but not debug intrinsics, lifetime markers, etc.) 
+            else if (auto *CI = dyn_cast<CallInst>(&I)) {
+                // Skip debug intrinsics and other harmless intrinsics
+                if (isa<DbgInfoIntrinsic>(CI))
+                    continue;
+                if (CI->isLifetimeStartOrEnd())
+                    continue;
+
+                // Check for reset_safe metadata
+                if (!hasResetSafeMetadata(CI)) {
+                    // Also skip intrinsic calls that are known safe
+                    Function *Callee = CI->getCalledFunction();
+                    if (Callee && Callee->isIntrinsic()) {
+                        Intrinsic::ID ID = Callee->getIntrinsicID();
+                        if (ID == Intrinsic::lifetime_start ||
+                            ID == Intrinsic::lifetime_end ||
+                            ID == Intrinsic::dbg_declare ||
+                            ID == Intrinsic::dbg_value ||
+                            ID == Intrinsic::dbg_label ||
+                            ID == Intrinsic::assume ||
+                            ID == Intrinsic::expect ||
+                            ID == Intrinsic::prefetch) {
+                            continue;
+                        }
+                    }
+                    // Skip the setjmp calls we just created
+                    if (Callee && Callee->getName() == jl_setjmp_name)
+                        continue;
+                    UnsafePoints.push_back(CI);
+                }
+            }
+        }
+    }
+
+    // Insert reset_ctx = NULL before each unsafe point
+    for (Instruction *I : UnsafePoints) {
+        if (!reset_ctx_ptr)
+            continue;
+
+        ++ResetCtxClearsInserted;
+        Changed = true;
+
+        IRBuilder<> Builder(I);
+        LLVMContext &LLVMCtx = F.getContext();
+        Type *PtrTy = PointerType::getUnqual(LLVMCtx);
+
+        // Store NULL to reset_ctx with atomic release ordering
+        Value *null_ptr = ConstantPointerNull::get(cast<PointerType>(PtrTy));
+        StoreInst *store = Builder.CreateAlignedStore(null_ptr, reset_ctx_ptr, Align(sizeof(void*)));
+        store->setOrdering(AtomicOrdering::Release);
+    }
+
+    // Insert reset_ctx = NULL before all return instructions
+    // This is necessary because the cancel_ucontext buffer is stack-allocated,
+    // and becomes invalid when the function returns
+    if (reset_ctx_ptr) {
+        LLVMContext &LLVMCtx = F.getContext();
+        Type *PtrTy = PointerType::getUnqual(LLVMCtx);
+        Value *null_ptr = ConstantPointerNull::get(cast<PointerType>(PtrTy));
+
+        for (auto &BB : F) {
+            if (auto *RI = dyn_cast<ReturnInst>(BB.getTerminator())) {
+                IRBuilder<> Builder(RI);
+                StoreInst *store = Builder.CreateAlignedStore(null_ptr, reset_ctx_ptr, Align(sizeof(void*)));
+                store->setOrdering(AtomicOrdering::Release);
+                ++ResetCtxClearsInserted;
+            }
+        }
+    }
+
+    return Changed;
+}
+
+PreservedAnalyses CancellationLoweringPass::run(Function &F, FunctionAnalysisManager &AM) {
+    CancellationLowering CL(*F.getParent());
+    if (CL.runOnFunction(F)) {
+#ifdef JL_VERIFY_PASSES
+        assert(!verifyLLVMIR(F));
+#endif
+        return PreservedAnalyses::allInSet<CFGAnalyses>();
+    }
+    return PreservedAnalyses::all();
+}
diff --git a/src/llvm-julia-passes.inc b/src/llvm-julia-passes.inc
index bd223499f37af..6f860ead8af84 100644
--- a/src/llvm-julia-passes.inc
+++ b/src/llvm-julia-passes.inc
@@ -16,6 +16,7 @@ FUNCTION_PASS("AllocOpt", AllocOptPass())
 FUNCTION_PASS("PropagateJuliaAddrspaces", PropagateJuliaAddrspacesPass())
 FUNCTION_PASS("GCInvariantVerifier", GCInvariantVerifierPass())
 FUNCTION_PASS("FinalLowerGC", FinalLowerGCPass())
+FUNCTION_PASS("CancellationLowering", CancellationLoweringPass())
 FUNCTION_PASS("ExpandAtomicModify", ExpandAtomicModifyPass())
 #endif
diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp
index 90334e60251da..ae1351ae41ca1 100644
--- a/src/llvm-late-gc-lowering.cpp
+++ b/src/llvm-late-gc-lowering.cpp
@@ -2068,7 +2068,7 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S, bool *CFGModified) {
         }
         Value *callee = CI->getCalledOperand();
         if (callee && (callee == gc_flush_func || callee == gc_preserve_begin_func
-                       || callee == gc_preserve_end_func || callee == cancel_point_func)) {
+                       || callee == gc_preserve_end_func)) {
            /* No replacement */
        } else if (pointer_from_objref_func != nullptr && callee
== pointer_from_objref_func) {
            auto *obj = CI->getOperand(0);
diff --git a/src/passes.h b/src/passes.h
index 0c5a124ade952..e40538f8f88da 100644
--- a/src/passes.h
+++ b/src/passes.h
@@ -43,6 +43,11 @@ struct FinalLowerGCPass : PassInfoMixin<FinalLowerGCPass> {
     static bool isRequired() { return true; }
 };
 
+struct CancellationLoweringPass : PassInfoMixin<CancellationLoweringPass> {
+    PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) JL_NOTSAFEPOINT;
+    static bool isRequired() { return true; }
+};
+
 struct ExpandAtomicModifyPass : PassInfoMixin<ExpandAtomicModifyPass> {
     PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) JL_NOTSAFEPOINT;
 };
diff --git a/src/pipeline.cpp b/src/pipeline.cpp
index 0481e04b8d19e..afd2ac90f9f6b 100644
--- a/src/pipeline.cpp
+++ b/src/pipeline.cpp
@@ -568,6 +568,7 @@ static void buildIntrinsicLoweringPipeline(ModulePassManager &MPM, PassBuilder *
     JULIA_PASS(MPM.addPass(RemoveNIPass()));
     {
         FunctionPassManager FPM;
+        JULIA_PASS(FPM.addPass(CancellationLoweringPass())); // Lower cancellation points to setjmp (before GC lowering)
         JULIA_PASS(FPM.addPass(LateLowerGCPass()));
         JULIA_PASS(FPM.addPass(FinalLowerGCPass()));
         JULIA_PASS(FPM.addPass(ExpandAtomicModifyPass())); // after LateLowerGCPass so that all IPO is valid
diff --git a/src/signals-unix.c b/src/signals-unix.c
index ec34d97cd89bc..6f061af906352 100644
--- a/src/signals-unix.c
+++ b/src/signals-unix.c
@@ -545,6 +545,26 @@ static void jl_try_deliver_sigint(void)
     pthread_mutex_unlock(&in_signal_lock);
 }
 
+// Send a signal to the specified thread to longjmp to its reset_ctx if available.
+// This is used for task cancellation to interrupt a running task at a safe point.
+JL_DLLEXPORT void jl_send_cancellation_signal(int16_t tid)
+{
+    jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid];
+    if (ptls2 == NULL)
+        return;
+    jl_task_t *ct = jl_atomic_load_relaxed(&ptls2->current_task);
+    if (ct == NULL)
+        return;
+    // Only send if the task has a reset_ctx set (i.e., is at a cancellation point)
+    if (ct->reset_ctx == NULL)
+        return;
+    pthread_mutex_lock(&in_signal_lock);
+    signals_inflight++;
+    jl_atomic_store_release(&ptls2->signal_request, 5);
+    pthread_kill(ptls2->system_id, SIGUSR2);
+    pthread_mutex_unlock(&in_signal_lock);
+}
+
 // Write only by signal handling thread, read only by main thread
 // no sync necessary.
 static int thread0_exit_signo = 0;
@@ -583,6 +603,7 @@ static void jl_exit_thread0(int signo, jl_bt_element_t *bt_data, size_t bt_size)
 //  is reached
 // 3: raise `thread0_exit_signo` and try to exit
 // 4: no-op
+// 5: longjmp to reset_ctx if available (for task cancellation)
 void usr2_handler(int sig, siginfo_t *info, void *ctx)
 {
     jl_task_t *ct = jl_get_current_task();
@@ -640,6 +661,15 @@ void usr2_handler(int sig, siginfo_t *info, void *ctx)
     else if (request == 3) {
         jl_call_in_ctx(ct->ptls, jl_exit_thread0_cb, sig, ctx);
     }
+    else if (request == 5) {
+        // Longjmp to reset_ctx for task cancellation
+        volatile _jl_ucontext_t *reset_ctx = ct->reset_ctx;
+        if (reset_ctx != NULL) {
+            // Clear reset_ctx before longjmp to prevent double-longjmp
+            ct->reset_ctx = NULL;
+            jl_longjmp_in_ctx(sig, ctx, reset_ctx->uc_mcontext);
+        }
+    }
 
     errno = errno_save;
 }
diff --git a/src/signals-win.c b/src/signals-win.c
index e0968b35e8555..d68b19298f005 100644
--- a/src/signals-win.c
+++ b/src/signals-win.c
@@ -667,4 +667,13 @@ void jl_install_thread_signal_handler(jl_ptls_t ptls)
 JL_DLLEXPORT void jl_membarrier(void)
 {
     FlushProcessWriteBuffers();
+
+
+// Send a signal to the specified thread to longjmp to its reset_ctx if available.
+// This is used for task cancellation to interrupt a running task at a safe point. +// TODO: Implement Windows support using SuspendThread/GetThreadContext/SetThreadContext +JL_DLLEXPORT void jl_send_cancellation_signal(int16_t tid) +{ + // Not yet implemented on Windows + (void)tid; } diff --git a/test/cancellation.jl b/test/cancellation.jl index baa9f6e7afa46..6a673e3ebb4d3 100644 --- a/test/cancellation.jl +++ b/test/cancellation.jl @@ -13,7 +13,7 @@ const collatz_code = quote i += 1 end end - function find_collatz_counterexample_inner() + @noinline function find_collatz_counterexample_inner() i = 1 while true j = i diff --git a/test/llvmpasses/cancellation-lowering-codegen.jl b/test/llvmpasses/cancellation-lowering-codegen.jl new file mode 100644 index 0000000000000..b9395baa0408c --- /dev/null +++ b/test/llvmpasses/cancellation-lowering-codegen.jl @@ -0,0 +1,26 @@ +# This file is a part of Julia. License is MIT: https://julialang.org/license + +# RUN: julia --startup-file=no -O2 %s %t -O && llvm-link -S %t/* | FileCheck %s + +include(joinpath("..", "testhelpers", "llvmpasses.jl")) + +# Test that @cancel_check generates the expected cancellation lowering IR: +# - A jl_setjmp call with returns_twice attribute +# - A reset_ctx_ptr getelementptr +# - Atomic store of ucontext buffer to reset_ctx +# - Atomic store of null to reset_ctx before return (since ucontext is stack-allocated) + +# CHECK-LABEL: @julia_test_cancel_check +# CHECK: %cancel_ucontext = alloca +# CHECK: %reset_ctx_ptr = getelementptr +# CHECK: store atomic ptr {{.*}}, ptr %reset_ctx_ptr release +# CHECK: call i32 @{{.*}}setjmp{{.*}}(ptr {{.*}}) #[[ATTR:[0-9]+]] +# CHECK: store atomic ptr null, ptr %reset_ctx_ptr release +# CHECK: ret +# CHECK: attributes #[[ATTR]] = {{{.*}}returns_twice{{.*}}} +function test_cancel_check() + Base.@cancel_check + return 1 +end + +emit(test_cancel_check) diff --git a/test/llvmpasses/cancellation-lowering.ll b/test/llvmpasses/cancellation-lowering.ll new file mode 100644 index 0000000000000..22245b64288c3 --- /dev/null +++ b/test/llvmpasses/cancellation-lowering.ll @@ -0,0 +1,82 @@ +; This file is a part of Julia. 
License is MIT: https://julialang.org/license + +; RUN: opt --load-pass-plugin=libjulia-codegen%shlibext -passes='CancellationLowering' -S %s | FileCheck %s + +declare i32 @julia.cancellation_point() +declare ptr @julia.get_pgcstack() +declare void @some_unsafe_call() +declare void @some_safe_call() + +; Test basic cancellation point lowering with reset_ctx cleared before return +define i32 @test_cancellation_point() { +entry: +; CHECK-LABEL: @test_cancellation_point +; CHECK: %cancel_ucontext = alloca +; CHECK: %pgcstack = call ptr @julia.get_pgcstack() +; CHECK: %current_task = getelementptr i8, ptr %pgcstack +; CHECK: %reset_ctx_ptr = getelementptr i8, ptr %current_task +; CHECK: store atomic ptr %cancel_ucontext, ptr %reset_ctx_ptr release +; CHECK: %{{.*}} = call i32 @{{.*}}setjmp{{.*}}(ptr %cancel_ucontext) +; CHECK-NOT: call i32 @julia.cancellation_point() +; CHECK: store atomic ptr null, ptr %reset_ctx_ptr release +; CHECK-NEXT: ret i32 + %pgcstack = call ptr @julia.get_pgcstack() + %result = call i32 @julia.cancellation_point() + ret i32 %result +} + +; Test that unsafe calls get reset_ctx = NULL inserted before them +define void @test_unsafe_call() { +entry: +; CHECK-LABEL: @test_unsafe_call +; CHECK: %cancel_ucontext = alloca +; CHECK: %pgcstack = call ptr @julia.get_pgcstack() +; CHECK: %current_task = getelementptr i8, ptr %pgcstack +; CHECK: %reset_ctx_ptr = getelementptr i8, ptr %current_task +; CHECK: store atomic ptr %cancel_ucontext, ptr %reset_ctx_ptr release +; CHECK: call i32 @{{.*}}setjmp +; The unsafe call should have reset_ctx = NULL before it +; CHECK: store atomic ptr null, ptr %reset_ctx_ptr release +; CHECK-NEXT: call void @some_unsafe_call() +; Also reset_ctx = NULL before the return +; CHECK: store atomic ptr null, ptr %reset_ctx_ptr release +; CHECK-NEXT: ret void + %pgcstack = call ptr @julia.get_pgcstack() + %result = call i32 @julia.cancellation_point() + call void @some_unsafe_call() + ret void +} + +; Test that calls with reset_safe metadata don't get reset_ctx = NULL before them +; but still get reset_ctx = NULL before return +define void @test_safe_call() { +entry: +; CHECK-LABEL: @test_safe_call +; CHECK: %cancel_ucontext = alloca +; CHECK: %pgcstack = call ptr @julia.get_pgcstack() +; CHECK: %current_task = getelementptr i8, ptr %pgcstack +; CHECK: %reset_ctx_ptr = getelementptr i8, ptr %current_task +; CHECK: call i32 @{{.*}}setjmp +; The safe call should NOT have reset_ctx = NULL before it +; CHECK: call void @some_safe_call(), !julia.reset_safe +; But reset_ctx = NULL should be before the return +; CHECK: store atomic ptr null, ptr %reset_ctx_ptr release +; CHECK-NEXT: ret void + %pgcstack = call ptr @julia.get_pgcstack() + %result = call i32 @julia.cancellation_point() + call void @some_safe_call(), !julia.reset_safe !0 + ret void +} + +; Test function without cancellation points is unchanged +define void @test_no_cancellation_point() { +entry: +; CHECK-LABEL: @test_no_cancellation_point +; CHECK-NOT: setjmp +; CHECK-NOT: reset_ctx +; CHECK: call void @some_unsafe_call() + call void @some_unsafe_call() + ret void +} + +!0 = !{} From 759d56c0e992537e06af822f12e4b58bfa6cc56e Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Thu, 4 Dec 2025 01:17:05 +0000 Subject: [PATCH 6/9] Fix missing reset --- src/task.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/task.c b/src/task.c index dbf98d3450a6e..94057e550e3a1 100644 --- a/src/task.c +++ b/src/task.c @@ -1686,6 +1686,7 @@ JL_DLLEXPORT void jl_preempt_thread_task(int16_t tid) jl_value_t 
*expected = jl_nothing; // If the task is already being cancelled, that's good enough for preemption jl_atomic_cmpswap(&task->cancellation_request, &expected, jl_box_uint8(0x5)); + jl_send_cancellation_signal(tid); } #ifdef _OS_WINDOWS_ From 7b52667bf42b8e527e991c1bd6ac921c20e99cc6 Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Fri, 5 Dec 2025 03:52:27 +0000 Subject: [PATCH 7/9] Fix rebase mistake --- src/signals-win.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/signals-win.c b/src/signals-win.c index d68b19298f005..16b5734ab9c0d 100644 --- a/src/signals-win.c +++ b/src/signals-win.c @@ -667,7 +667,7 @@ void jl_install_thread_signal_handler(jl_ptls_t ptls) JL_DLLEXPORT void jl_membarrier(void) { FlushProcessWriteBuffers(); - +} // Send a signal to the specified thread to longjmp to its reset_ctx if available. // This is used for task cancellation to interrupt a running task at a safe point. From 044f2ca2fb2f4616a1605cd8d64a153d6e71c597 Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Fri, 5 Dec 2025 08:31:27 +0000 Subject: [PATCH 8/9] Implement asymmetric barriers --- base/asyncevent.jl | 5 ++-- base/condition.jl | 49 ++++++++++++++++++++++++------- base/locks-mt.jl | 2 ++ base/task.jl | 73 +++++++++++++++++++++++++++++++++++----------- 4 files changed, 99 insertions(+), 30 deletions(-) diff --git a/base/asyncevent.jl b/base/asyncevent.jl index 6ac8e79b63ff8..771d89f796b58 100644 --- a/base/asyncevent.jl +++ b/base/asyncevent.jl @@ -199,8 +199,9 @@ function _trywait(t::Union{Timer, AsyncCondition}) return set end -cancel_wait!(t::Union{Timer, AsyncCondition}, task::Task) = - cancel_wait!(t.cond, task, false; waitee=t) +cancel_wait!(t::Union{Timer, AsyncCondition}, @nospecialize(creq)) = false +cancel_wait!(t::Union{Timer, AsyncCondition}, task::Task, @nospecialize(creq)) = + cancel_wait!(t.cond, task, creq, false; waitee=t) function wait(t::Union{Timer, AsyncCondition}) ok = _trywait(t) diff --git a/base/condition.jl b/base/condition.jl index b4aead93faba3..e3aa7a2dc59ed 100644 --- a/base/condition.jl +++ b/base/condition.jl @@ -142,35 +142,62 @@ Wait for [`notify`](@ref) on `c` and return the `val` parameter passed to `notif If the keyword `first` is set to `true`, the waiter will be put _first_ in line to wake up on `notify`. Otherwise, `wait` has first-in-first-out (FIFO) behavior. """ -function wait(c::GenericCondition; first::Bool=false, waitee=c) +function wait(c::GenericCondition; first::Bool=false, waitee=c, expected_cancellation=nothing) ct = current_task() _wait2(c, ct, waitee, first) + +@label before_barrier + # Synchronize with atomic_fence_heavy in cancel! + Threads.atomic_fence_light() + # We need to check if we were cancelled and should not suspend if we were. + # The fencing above ensures that we either see the cancellation request + # or the cancelling task will call cancel_wait! to wake us again. + cr = cancellation_request() + if cr !== expected_cancellation + if cr === CANCEL_REQUEST_YIELD + # We are about to yield anyway, so we can acknowledge the cancellation now. 
+            # However, for the integrity of the cancellation_request synchronization,
+            # we must revisit the barrier above and re-check the cancellation request
+            @atomicreplace :acquire_release :monotonic ct.cancellation_request cr => nothing
+            @goto before_barrier
+        else
+            Base.list_deletefirst!(waitqueue(c), ct)
+            return invokelatest(cancel_wait!, waitee, cr)
+        end
+    end
     token = unlockall(c.lock)
-    try
-        return wait()
+
+    ret = try
+        wait()
     catch
-        q = ct.queue; q === c && Base.list_deletefirst!(waitqueue(c), ct)
-        rethrow()
-    finally
         relockall(c.lock, token)
+        # This cleans up our entry in the waitqueue if we were resumed from an
+        # unexpected `throwto`. Modern code should generally avoid this pattern.
+        q = ct.queue; q === waitee && Base.list_deletefirst!(waitqueue(c), ct)
+        rethrow()
     end
+
+    relockall(c.lock, token)
+    return ret
+end
+
+function cancel_wait!(c::GenericCondition, creq; waitee = c)
+    throw(creq)
 end
 
-function cancel_wait!(c::GenericCondition, t::Task; waitee = c)
-    @assert (@atomic :monotonic t.cancellation_request) !== nothing
+function cancel_wait!(c::GenericCondition, t::Task, @nospecialize(creq); waitee = c)
     lock(c)
     if t.queue !== waitee
         unlock(c)
         return false
     end
     Base.list_deletefirst!(ILLRef(waitqueue(c), waitee), t)
-    schedule(t, conform_cancellation_request(t.cancellation_request), error=true)
+    schedule(t, conform_cancellation_request(creq), error=true)
     unlock(c)
     return true
 end
 
-function cancel_wait!(c::GenericCondition, t::Task, @nospecialize(val); waitee=c)
-    @assert (@atomic :monotonic t.cancellation_request) !== nothing
+function cancel_wait!(c::GenericCondition, t::Task, @nospecialize(creq), @nospecialize(val); waitee=c)
     lock(c)
     if t.queue !== waitee
         unlock(c)
diff --git a/base/locks-mt.jl b/base/locks-mt.jl
index 237e0d9856996..af8513044c0f3 100644
--- a/base/locks-mt.jl
+++ b/base/locks-mt.jl
@@ -108,3 +108,5 @@ end
 function islocked(l::AbstractSpinLock)
     return (@atomic :monotonic l.owned) != 0
 end
+
+Base.show(io::IO, l::AbstractSpinLock) = print(io, typeof(l), "(", islocked(l) ? "locked" : "unlocked", ")")
diff --git a/base/task.jl b/base/task.jl
index 70b2e258de06f..65192ee44b91c 100644
--- a/base/task.jl
+++ b/base/task.jl
@@ -316,7 +316,7 @@ function _wait(t::Task; expected_cancellation = nothing)
         lock(donenotify)
         try
             while !istaskdone(t) && cancellation_request() === expected_cancellation
-                wait(donenotify; waitee=t)
+                wait(donenotify; waitee=t, expected_cancellation)
             end
         finally
             unlock(donenotify)
@@ -324,7 +324,11 @@ end
     nothing
 end
-cancel_wait!(waitee::Task, waiter::Task) = cancel_wait!(waitee.donenotify, waiter, nothing; waitee)
+
+# We handle cancellation explicitly above - just suppress the error here
+cancel_wait!(waitee::Task, @nospecialize(creq)) = nothing
+cancel_wait!(waitee::Task, waiter::Task, @nospecialize(creq)) =
+    cancel_wait!(waitee.donenotify, waiter, creq, nothing; waitee)
 
 # have `waiter` wait for `t`
 function _wait2(t::Task, waiter::Task)
@@ -367,7 +371,7 @@ Throws a `ConcurrencyViolationError` if `t` is the currently running task, to pr
 """
 function wait(t::Task; throw=true)
     _wait(t)
-    cr = cancellation_request()
+    cr = cancellation_request_or_yield()
     if cr !== nothing
         propagate_cancellation!(t, cr)
     end
@@ -611,7 +615,7 @@ function sync_end(c::Channel{Any})
         r = take!(c)
         if isa(r, Task)
             _wait(r)
-            cr = cancellation_request()
+            cr = cancellation_request_or_yield()
             if cr !== nothing
                 return sync_cancel!(c, r, cr, @isdefined(c_ex) ?
c_ex : CompositeException())
             end
@@ -1423,6 +1427,14 @@ end
     throw(req)
 end
 
+function cancellation_request_raw()
+    ct = current_task()
+    req = @atomic :monotonic ct.cancellation_request
+    req === nothing && return req
+    req = @atomic :acquire ct.cancellation_request
+    return req
+end
+
 """
     cancellation_request()
 
@@ -1431,13 +1443,26 @@ cancellation has been requested.
 
 If a cancellation request is present, it is loaded with acquire semantics.
 """
 function cancellation_request()
-    ct = current_task()
-    req = @atomic :monotonic ct.cancellation_request
-    req === nothing && return req
-    cr = @atomic :acquire ct.cancellation_request
+    cr = cancellation_request_raw()
     return conform_cancellation_request(cr)
 end
 
+"""
+    cancellation_request_or_yield()
+
+Like [`cancellation_request`](@ref), but specifically handles CANCEL_REQUEST_YIELD
+by calling yield internally and re-checking for cancellation requests.
+"""
+function cancellation_request_or_yield()
+    while true
+        _cr = cancellation_request_raw()
+        cr = conform_cancellation_request(_cr)
+        cr !== CANCEL_REQUEST_YIELD && return cr
+        @atomicreplace :sequentially_consistent :monotonic current_task().cancellation_request _cr => nothing
+        yield()
+    end
+end
+
 """
     Core.cancellation_point!()
 
@@ -1451,7 +1476,7 @@ Core.cancellation_point!
 function cancel!(t::Task, crequest=CANCEL_REQUEST_SAFE)
     # TODO: Raise task priority
     @atomic :release t.cancellation_request = crequest
-    # TODO: SYS_membarrier() ?
+    Threads.atomic_fence_heavy()
     # Special case: If the task hasn't started yet at this point, we want to set
     # it up to cancel any waits, but we need to be a bit careful with concurrent
     # starts of the task.
@@ -1465,15 +1490,29 @@ function cancel!(t::Task, crequest=CANCEL_REQUEST_SAFE)
         end
         return
     end
-    # Try to interrupt the task if it's at a cancellation point (has reset_ctx set)
+    # Try to interrupt the task. The barrier above synchronizes with the establishment
+    # of a wait object and guarantees that either:
+    # 1. We have the wait object in t.queue, or
+    # 2. The task saw the cancellation and called (a different method of) cancel_wait!
+    #    itself.
+    # Note that it is possible for both to be true, in which case the task wins
+    # and our call to cancel_wait! will be a no-op after acquiring the waitee lock.
+    #
+    # Additionally, if there is no wait object, either
+    # 1. The task is suspended, but not using our wait object protocol.
+    #    In this case, cancellation will not succeed.
+    # 2. The task is running.
+    #
+    # We can't tell the difference, but we unconditionally try to send the cancellation
+    # signal. If a reset_ctx exists, this will cause the task to be interrupted.
    tid = Threads.threadid(t)
-    if tid != 0
-        ccall(:jl_send_cancellation_signal, Cvoid, (Int16,), (tid - 1) % Int16)
-    end
-    while !istaskdone(t)
+    if !istaskdone(t)
         waitee = t.queue
-        waitee === nothing && (yield(); continue)
-        invokelatest(cancel_wait!, waitee, t) && break
+        if waitee !== nothing
+            invokelatest(cancel_wait!, waitee, t, crequest)
+        elseif tid != 0
+            ccall(:jl_send_cancellation_signal, Cvoid, (Int16,), (tid - 1) % Int16)
+        end
     end
     if t.sticky
         # If this task is sticky, it won't be able to run if the task currently
@@ -1494,7 +1533,7 @@ end
     end
 end
 
-function cancel_wait!(q::StickyWorkqueue, t::Task)
+function cancel_wait!(q::StickyWorkqueue, t::Task, @nospecialize(creq))
     # Tasks in the workqueue are runnable - we do not cancel the wait,
     # but we do need to check whether it's in there
     lock(q.lock)

From 0c85647707fd2de407bf863f095781e0983196ea Mon Sep 17 00:00:00 2001
From: Keno Fischer 
Date: Sat, 6 Dec 2025 08:16:28 +0000
Subject: [PATCH 9/9] Support write cancellation

---
 base/condition.jl  | 20 ++-----------
 base/stream.jl     | 70 ++++++++++++++++++++++++++++++++++++++--------
 base/task.jl       | 25 +++++++++++++++++
 deps/libuv.version |  4 +--
 src/jl_uv.c        |  4 +--
 src/jltypes.c      |  1 -
 6 files changed, 90 insertions(+), 34 deletions(-)

diff --git a/base/condition.jl b/base/condition.jl
index e3aa7a2dc59ed..c3520fdf379a9 100644
--- a/base/condition.jl
+++ b/base/condition.jl
@@ -146,24 +146,10 @@ function wait(c::GenericCondition; first::Bool=false, waitee=c, expected_cancell
     ct = current_task()
     _wait2(c, ct, waitee, first)
 
-@label before_barrier
-    # Synchronize with atomic_fence_heavy in cancel!
-    Threads.atomic_fence_light()
-    # We need to check if we were cancelled and should not suspend if we were.
-    # The fencing above ensures that we either see the cancellation request
-    # or the cancelling task will call cancel_wait! to wake us again.
-    cr = cancellation_request()
+    cr = pre_sleep_cancellation_request()
     if cr !== expected_cancellation
-        if cr === CANCEL_REQUEST_YIELD
-            # We are about to yield anyway, so we can acknowledge the cancellation now.
-            # However, for the integrity of the cancellation_request synchronization,
-            # we must revisit the barrier above and re-check the cancellation request
-            @atomicreplace :acquire_release :monotonic ct.cancellation_request cr => nothing
-            @goto before_barrier
-        else
-            Base.list_deletefirst!(waitqueue(c), ct)
-            return invokelatest(cancel_wait!, waitee, cr)
-        end
+        Base.list_deletefirst!(waitqueue(c), ct)
+        return invokelatest(cancel_wait!, waitee, cr)
     end
     token = unlockall(c.lock)
 
diff --git a/base/stream.jl b/base/stream.jl
index 7b227458ec552..f74b900fd81c3 100644
--- a/base/stream.jl
+++ b/base/stream.jl
@@ -1061,28 +1061,51 @@ end
 uv_write(s::LibuvStream, p::Vector{UInt8}) = GC.@preserve p uv_write(s, pointer(p), UInt(sizeof(p)))
 
 # caller must have acquired the iolock
-function uv_write(s::LibuvStream, p::Ptr{UInt8}, n::UInt)
-    uvw = uv_write_async(s, p, n)
+function uv_write_noncancel(s::LibuvStream, p::Ptr{UInt8}, n::UInt)
+    # Establish the wait object early so that if we get cancelled, we don't
+    # have to go to libuv in the first place.
     ct = current_task()
+    ct.queue = s
+
+    cr = pre_sleep_cancellation_request()
+    if cr !== nothing
+        ct.queue = nothing
+        return 0
+    end
+
+    uvw = uv_write_async(s, p, n)
+    # TODO: If the request was split above, this is wrong
+    ct.next = uvw
+
     preserve_handle(ct)
     sigatomic_begin()
     uv_req_set_data(uvw, ct)
     iolock_end()
-    local status
+    local nwritten
     try
         sigatomic_end()
         # wait for the last chunk to complete (or error)
        # assume that any errors would be sticky,
         # (so we don't need to monitor the error status of the intermediate writes)
-        status = wait()::Cint
+        nwritten = wait()::Csize_t
         sigatomic_begin()
     finally
         # try-finally unwinds the sigatomic level, so need to repeat sigatomic_end
         sigatomic_end()
         iolock_begin()
-        q = ct.queue; q === nothing || Base.list_deletefirst!(q::IntrusiveLinkedList{Task}, ct)
+        if ct.queue === s
+            # Only happens in unexpected error cases. Cancellation queues a proper
+            # callback, which unsets this.
+            ct.next = nothing
+            ct.queue = nothing
+        end
         if uv_req_data(uvw) != C_NULL
-            # uvw is still alive,
+            # uvw is still alive - likely because we got some unexpected throwto
+            # exception. We try to cancel the request to avoid spamming if this
+            # is something the user is looking at. Note that cancellation does
+            # not go through this path and instead returns the number of written
+            # bytes to the caller.
+            ccall(:uv_cancel, Cint, (Ptr{Cvoid},), uvw) # Ignore errors
             # so make sure we won't get spurious notifications later
             uv_req_set_data(uvw, C_NULL)
         else
@@ -1092,10 +1115,27 @@ function uv_write(s::LibuvStream, p::Ptr{UInt8}, n::UInt)
         iolock_end()
         unpreserve_handle(ct)
     end
-    if status < 0
-        throw(_UVError("write", status))
+    return Int(nwritten)
+end
+
+function uv_write(s::LibuvStream, p::Ptr{UInt8}, n::UInt)
+    nb = uv_write_noncancel(s, p, n)
+    @cancel_check
+    @assert nb == n
+    return nb
+end
+
+function cancel_wait!(s::LibuvStream, t::Task, @nospecialize(creq))
+    iolock_begin()
+    if t.queue !== s
+        iolock_end()
+        return false
     end
-    return Int(n)
+    uvw = t.next
+    @assert uvw !== nothing && uvw != C_NULL
+    ccall(:uv_cancel, Cint, (Ptr{Cvoid},), uvw) # Ignore errors
+    iolock_end()
+    return true
 end
 
 # helper function for uv_write that returns the uv_write_t struct for the write
@@ -1111,7 +1151,7 @@ function uv_write_async(s::LibuvStream, p::Ptr{UInt8}, n::UInt)
                     Int32,
                     (Ptr{Cvoid}, Ptr{Cvoid}, UInt, Ptr{Cvoid}, Ptr{Cvoid}),
                     s, p, nwrite, uvw,
-                    @cfunction(uv_writecb_task, Cvoid, (Ptr{Cvoid}, Cint)))
+                    @cfunction(uv_writecb_task, Cvoid, (Ptr{Cvoid}, Cint, Csize_t)))
         if err < 0
             Libc.free(uvw)
             uv_error("write", err)
@@ -1188,12 +1228,18 @@ function write(s::LibuvStream, b::UInt8)
     return write(s, Ref{UInt8}(b))
 end
 
-function uv_writecb_task(req::Ptr{Cvoid}, status::Cint)
+function uv_writecb_task(req::Ptr{Cvoid}, status::Cint, nwritten::Csize_t)
     d = uv_req_data(req)
     if d != C_NULL
         uv_req_set_data(req, C_NULL) # let the Task know we got the writecb
         t = unsafe_pointer_to_objref(d)::Task
-        schedule(t, status)
+        t.next = nothing
+        t.queue = nothing
+        if status != 0 && status != UV_ECANCELED
+            schedule(t, _UVError("write", status); error=true)
+        else
+            schedule(t, nwritten)
+        end
     else
         # no owner for this req, safe to just free it
         Libc.free(req)
diff --git a/base/task.jl b/base/task.jl
index 65192ee44b91c..0d075f293f672 100644
--- a/base/task.jl
+++ b/base/task.jl
@@ -1463,6 +1463,31 @@ function cancellation_request_or_yield()
     end
 end
 
+"""
+    pre_sleep_cancellation_request()
+
+Like [`cancellation_request_or_yield`](@ref), but indicates the caller is about to sleep,
+so yield
requests can be ignored. Additionally, contains necessary synchronization to +ensure that either the cancellation request is visible, or that any potential +cancellation task will see the wait object established by the caller. + +Precondition: The caller must have established a wait object in `current_task().queue`. +""" +function pre_sleep_cancellation_request() + #@assert (@atomic :monotonic current_task().queue) !== nothing + + # Synchronize with atomic_fence_heavy in cancel! + Threads.atomic_fence_light() + + while true + _cr = cancellation_request_raw() + cr = conform_cancellation_request(_cr) + cr !== CANCEL_REQUEST_YIELD && return cr + @atomicreplace :sequentially_consistent :monotonic current_task().cancellation_request _cr => nothing + # The caller is about to sleep, so we are permitted to ignore the yield request. + end +end + """ Core.cancellation_point!() diff --git a/deps/libuv.version b/deps/libuv.version index f80cde8964237..57201f6a82210 100644 --- a/deps/libuv.version +++ b/deps/libuv.version @@ -5,5 +5,5 @@ LIBUV_JLL_NAME := LibUV ## source build LIBUV_VER := 2 -LIBUV_BRANCH=julia-uv2-1.48.0 -LIBUV_SHA1=b21d6d84e46f6c97ecbc8e4e8a8ea6ad98049ea8 +LIBUV_BRANCH=kf/julia-writecancel +LIBUV_SHA1=89b5f68b38698a4151f35b10ba9380363614097a diff --git a/src/jl_uv.c b/src/jl_uv.c index e41b896320693..5a45f9ff998d6 100644 --- a/src/jl_uv.c +++ b/src/jl_uv.c @@ -677,14 +677,14 @@ JL_DLLEXPORT int jl_fs_close(uv_os_fd_t handle) } JL_DLLEXPORT int jl_uv_write(uv_stream_t *stream, const char *data, size_t n, - uv_write_t *uvw, uv_write_cb writecb) + uv_write_t *uvw, uv_write3_cb writecb) { uv_buf_t buf[1]; buf[0].base = (char*)data; buf[0].len = n; JL_UV_LOCK(); JL_SIGATOMIC_BEGIN(); - int err = uv_write(uvw, stream, buf, 1, writecb); + int err = uv_write3(uvw, stream, buf, 1, NULL, 0, writecb); JL_UV_UNLOCK(); JL_SIGATOMIC_END(); return err; diff --git a/src/jltypes.c b/src/jltypes.c index bb5a0beb98494..e80bd6be644c8 100644 --- a/src/jltypes.c +++ b/src/jltypes.c @@ -3829,7 +3829,6 @@ void jl_init_types(void) JL_GC_DISABLED 0, 1, 6); XX(task); jl_value_t *listt = jl_new_struct(jl_uniontype_type, jl_task_type, jl_nothing_type); - jl_svecset(jl_task_type->types, 0, listt); // Set field 20 (metrics_enabled) as const // Set fields 8 (_state) and 24-27 (metric counters), 28 (cancellation_request) as atomic const static uint32_t task_constfields[1] = { 0b00000000000010000000000000000000 };
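
The wait/cancel handshake used throughout the last two patches follows one pattern: the
waiter publishes its wait object, issues `Threads.atomic_fence_light()`, then re-checks
the cancellation request; the canceller publishes the request, issues
`Threads.atomic_fence_heavy()`, then re-checks the wait object. The sketch below
illustrates that pattern in isolation. It is not part of the patches: `Handshake`,
`waiting`, and `cancel` are hypothetical stand-ins for `t.queue` and
`t.cancellation_request`, and only the two fence functions from the first patch are
assumed.

    # Dekker-style handshake built on the asymmetric fences (standalone sketch).
    mutable struct Handshake
        @atomic waiting::Bool  # stand-in for "task enqueued itself on a wait queue"
        @atomic cancel::Bool   # stand-in for "a cancellation request is pending"
    end

    # Waiter (hot path): publish the wait object, cheap fence, then check for a
    # pending request. If this load misses, the canceller is guaranteed to see
    # waiting == true and will wake the waiter through its cancel_wait! path.
    function waiter_prepares_to_sleep(h::Handshake)
        @atomic :monotonic h.waiting = true
        Threads.atomic_fence_light()
        return @atomic :monotonic h.cancel   # true => do not suspend
    end

    # Canceller (cold path): publish the request, expensive fence, then check
    # whether the waiter has already parked itself.
    function canceller_requests_cancel(h::Handshake)
        @atomic :monotonic h.cancel = true
        Threads.atomic_fence_heavy()
        return @atomic :monotonic h.waiting  # true => must wake the waiter ourselves
    end

Whatever the interleaving, the two loads cannot both return false, which is exactly the
property the comments in `cancel!` and `pre_sleep_cancellation_request` rely on.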