From 9e94aa49eed3591e7c53531af0a9722a32a2a6d4 Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Thu, 4 Dec 2025 03:54:27 +0000 Subject: [PATCH 1/9] threads: Implement asymmetric atomic fences Asymmetric atomic fences are a performance optimization of regular atomic fences (the seq_cst version of which we expose as `Base.Threads.atomic_fence`). The problem with these regular fences is that they require a CPU fence instruction, which can be very expensive and is thus unsuitable for code in the hot path. Asymmetric fences, on the other hand, split an ordinary fence into two: a `light` side where the fence is extremely cheap (only a compiler reordering barrier) and a `heavy` side where the fence is very expensive. The way it works is that the heavy side does a system call that issues an inter-processor interrupt (IPI), which then issues the appropriate barrier instruction on the other CPU (i.e. both CPUs will have issued a barrier instruction, one of them just does it asynchronously due to the interrupt). The `light` and `heavy` naming here is taken from C++ P1202R5 [1], which is the proposal for the same feature in the C++ standard library (to appear in the next iteration of the C++ concurrency spec). On the Julia side, these functions are exposed as `Threads.atomic_fence_light` and `Threads.atomic_fence_heavy`. The light side lowers to `fence singlethread` in LLVM IR (the `Core.Intrinsics.atomic_fence` intrinsic is adjusted appropriately to facilitate this). The heavy side has OS-specific implementations, where: 1. Linux/FreeBSD try to use the `membarrier` syscall or a fallback to `mprotect` for systems that don't have it. 2. Windows uses the `FlushProcessWriteBuffers` API. 3. macOS uses an implementation from the dotnet runtime (https://github.com/dotnet/runtime/pull/44670), which the dotnet folks have confirmed with Apple does the right thing by happenstance (i.e. an IPI/memory barrier is needed to execute the syscall), but looks a little nonsensical by itself. However, since it's what Apple recommended to dotnet, I don't see much risk here, though I wouldn't be surprised if Apple added a proper syscall for this in the future (since FreeBSD has it now). Note that unlike the C++ spec, I have specified that `atomic_fence_heavy` does synchronize with `atomic_fence`. This matches the underlying system call. I suspect C++ chose to omit this for a hypothetical future architecture that has instruction support for doing this from userspace that would then not synchronize with ordinary barriers, but I think I would rather cross that bridge when we get there. I intend to use this in #60281, but it's an independently useful feature.
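To make the intended usage concrete, here is a minimal sketch (illustration only, not part of this patch) of the classic asymmetric handshake: the frequently-executed side pays only for the light fence, while the rare side pays for the heavy one. The `Flags` struct and its field names are made up for the example; only `Threads.atomic_fence_light`/`Threads.atomic_fence_heavy` come from this patch.

```julia
# Illustrative sketch only: a Dekker-style handshake between a hot loop and a
# rarely-executed coordinator. The struct and field names are hypothetical.
mutable struct Flags
    @atomic hot_active::Bool      # set by the hot side around its critical work
    @atomic stop_requested::Bool  # set (rarely) by the coordinator
end

const flags = Flags(false, false)

function hot_path_iteration()          # runs very often; must stay cheap
    @atomic :monotonic flags.hot_active = true
    Threads.atomic_fence_light()       # compiler-only barrier on the hot side
    stop = @atomic :monotonic flags.stop_requested
    if stop
        @atomic :monotonic flags.hot_active = false
        return false                   # coordinator asked us to stop
    end
    # ... do the actual hot work ...
    @atomic :monotonic flags.hot_active = false
    return true
end

function request_stop()                # runs rarely; may pay for a syscall/IPI
    @atomic :monotonic flags.stop_requested = true
    Threads.atomic_fence_heavy()       # pairs with the light fences above
    # Dekker-style guarantee: it cannot happen that this load misses `hot_active`
    # while a concurrent hot iteration also missed `stop_requested`.
    active = @atomic :monotonic flags.hot_active
    return active
end
```

This mirrors the `test_asymmetric_fence` test added below, just with named flags instead of per-iteration memory.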
[1] https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/p1202r5.pdf --- Compiler/src/tfuncs.jl | 4 +- base/asyncevent.jl | 2 +- base/atomics.jl | 26 +++++++- src/ast.c | 2 + src/intrinsics.cpp | 15 ++++- src/intrinsics.h | 2 +- src/jl_exported_funcs.inc | 1 + src/julia_internal.h | 4 +- src/runtime_intrinsics.c | 9 ++- src/signals-mach.c | 52 ++++++++++++++++ src/signals-unix.c | 125 ++++++++++++++++++++++++++++++++++++++ src/signals-win.c | 4 ++ test/intrinsics.jl | 12 ++-- test/threads_exec.jl | 48 +++++++++++++++ 14 files changed, 290 insertions(+), 16 deletions(-) diff --git a/Compiler/src/tfuncs.jl b/Compiler/src/tfuncs.jl index 3c23974c88920..2656c7bb1ee1a 100644 --- a/Compiler/src/tfuncs.jl +++ b/Compiler/src/tfuncs.jl @@ -716,7 +716,7 @@ end @nospecs function pointerset_tfunc(๐•ƒ::AbstractLattice, a, v, i, align) return a end -@nospecs function atomic_fence_tfunc(๐•ƒ::AbstractLattice, order) +@nospecs function atomic_fence_tfunc(๐•ƒ::AbstractLattice, order, syncscope) return Nothing end @nospecs function atomic_pointerref_tfunc(๐•ƒ::AbstractLattice, a, order) @@ -757,7 +757,7 @@ add_tfunc(add_ptr, 2, 2, pointerarith_tfunc, 1) add_tfunc(sub_ptr, 2, 2, pointerarith_tfunc, 1) add_tfunc(pointerref, 3, 3, pointerref_tfunc, 4) add_tfunc(pointerset, 4, 4, pointerset_tfunc, 5) -add_tfunc(atomic_fence, 1, 1, atomic_fence_tfunc, 4) +add_tfunc(atomic_fence, 2, 2, atomic_fence_tfunc, 4) add_tfunc(atomic_pointerref, 2, 2, atomic_pointerref_tfunc, 4) add_tfunc(atomic_pointerset, 3, 3, atomic_pointerset_tfunc, 5) add_tfunc(atomic_pointerswap, 3, 3, atomic_pointerswap_tfunc, 5) diff --git a/base/asyncevent.jl b/base/asyncevent.jl index a4a82b4aba120..68ae27049adc0 100644 --- a/base/asyncevent.jl +++ b/base/asyncevent.jl @@ -165,7 +165,7 @@ function _trywait(t::Union{Timer, AsyncCondition}) set = t.set if set # full barrier now for AsyncCondition - t isa Timer || Core.Intrinsics.atomic_fence(:acquire_release) + t isa Timer || Core.Intrinsics.atomic_fence(:acquire_release, :system) else if !isopen(t) set = t.set diff --git a/base/atomics.jl b/base/atomics.jl index 432c9120939ac..155cb38d6d03b 100644 --- a/base/atomics.jl +++ b/base/atomics.jl @@ -329,4 +329,28 @@ fences should not be necessary in most cases. For further details, see LLVM's `fence` instruction. """ -atomic_fence() = Core.Intrinsics.atomic_fence(:sequentially_consistent) +atomic_fence() = Core.Intrinsics.atomic_fence(:sequentially_consistent, :system) + +""" + Threads.atomic_fence_light() + +This is a read-optimized sequential-consistency memory fence. +On supported operating systems and architectures, this fence is cheaper +than `Threads.atomic_fence()`, but synchronizes only with +[`atomic_fence_heavy`](@ref) calls from other threads. +""" +atomic_fence_light() = Core.Intrinsics.atomic_fence(:sequentially_consistent, :singlethread) + +""" + Threads.atomic_fence_heavy() + +This is a write-optimized sequential-consistency memory fence. +This fence is significantly more expensive than `Threads.atomic_fence`. +It generally requires a system call and a full interprocessor interrupt +to all other processors in the system. It synchronizes with both +[`atomic_fence_light`](@ref) and [`atomic_fence`](@ref) calls from other threads. + +For further details, see the Linux `membarrier` syscall or the Windows +`FlushProcessWriteBuffers` API. 
+""" +atomic_fence_heavy() = ccall(:jl_membarrier, Cvoid, ()) diff --git a/src/ast.c b/src/ast.c index d6e3893751c9f..f513ef34b01eb 100644 --- a/src/ast.c +++ b/src/ast.c @@ -319,6 +319,8 @@ void jl_init_common_symbols(void) jl_atomic_sym = jl_symbol("atomic"); jl_not_atomic_sym = jl_symbol("not_atomic"); jl_unordered_sym = jl_symbol("unordered"); + jl_singlethread_sym = jl_symbol("singlethread"); + jl_system_sym = jl_symbol("system"); jl_monotonic_sym = jl_symbol("monotonic"); jl_acquire_sym = jl_symbol("acquire"); jl_release_sym = jl_symbol("release"); diff --git a/src/intrinsics.cpp b/src/intrinsics.cpp index ae25c3cc83ca5..4791d439e2ad0 100644 --- a/src/intrinsics.cpp +++ b/src/intrinsics.cpp @@ -915,6 +915,15 @@ static jl_cgval_t emit_pointerarith(jl_codectx_t &ctx, intrinsic f, static jl_cgval_t emit_atomicfence(jl_codectx_t &ctx, ArrayRef argv) { const jl_cgval_t &ord = argv[0]; + const jl_cgval_t &ssid_arg = argv[1]; + llvm::SyncScope::ID ssid = llvm::SyncScope::System; + if (!ssid_arg.constant || !jl_is_symbol(ssid_arg.constant) || + ((jl_sym_t*)ssid_arg.constant != jl_singlethread_sym && + (jl_sym_t*)ssid_arg.constant != jl_system_sym)) { + return emit_runtime_call(ctx, atomic_fence, argv, 2); + } + if ((jl_sym_t*)ssid_arg.constant == jl_singlethread_sym) + ssid = llvm::SyncScope::SingleThread; if (ord.constant && jl_is_symbol(ord.constant)) { enum jl_memory_order order = jl_get_atomic_order((jl_sym_t*)ord.constant, true, true); if (order == jl_memory_order_invalid) { @@ -922,10 +931,10 @@ static jl_cgval_t emit_atomicfence(jl_codectx_t &ctx, ArrayRef argv) return jl_cgval_t(); // unreachable } if (order > jl_memory_order_monotonic) - ctx.builder.CreateFence(get_llvm_atomic_order(order)); + ctx.builder.CreateFence(get_llvm_atomic_order(order), ssid); return ghostValue(ctx, jl_nothing_type); } - return emit_runtime_call(ctx, atomic_fence, argv, 1); + return emit_runtime_call(ctx, atomic_fence, argv, 2); } static jl_cgval_t emit_atomic_pointerref(jl_codectx_t &ctx, ArrayRef argv) @@ -1339,7 +1348,7 @@ static jl_cgval_t emit_intrinsic(jl_codectx_t &ctx, intrinsic f, jl_value_t **ar case atomic_fence: ++Emitted_atomic_fence; - assert(nargs == 1); + assert(nargs == 2); return emit_atomicfence(ctx, argv); case atomic_pointerref: ++Emitted_atomic_pointerref; diff --git a/src/intrinsics.h b/src/intrinsics.h index 5765e3e671bc6..4fd5630afa38f 100644 --- a/src/intrinsics.h +++ b/src/intrinsics.h @@ -95,7 +95,7 @@ ADD_I(pointerref, 3) \ ADD_I(pointerset, 4) \ /* pointer atomics */ \ - ADD_I(atomic_fence, 1) \ + ADD_I(atomic_fence, 2) \ ADD_I(atomic_pointerref, 2) \ ADD_I(atomic_pointerset, 3) \ ADD_I(atomic_pointerswap, 3) \ diff --git a/src/jl_exported_funcs.inc b/src/jl_exported_funcs.inc index d0cfb16062b7d..1a0ff14a82b55 100644 --- a/src/jl_exported_funcs.inc +++ b/src/jl_exported_funcs.inc @@ -506,6 +506,7 @@ XX(jl_vprintf) \ XX(jl_wakeup_thread) \ XX(jl_write_compiler_output) \ + XX(jl_membarrier) \ #define JL_RUNTIME_EXPORTED_FUNCS_WIN(XX) \ XX(jl_setjmp) \ diff --git a/src/julia_internal.h b/src/julia_internal.h index 94036981f950a..9a85d1086d694 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -1689,7 +1689,7 @@ STATIC_INLINE int is_valid_intrinsic_elptr(jl_value_t *ety) JL_DLLEXPORT jl_value_t *jl_bitcast(jl_value_t *ty, jl_value_t *v); JL_DLLEXPORT jl_value_t *jl_pointerref(jl_value_t *p, jl_value_t *i, jl_value_t *align); JL_DLLEXPORT jl_value_t *jl_pointerset(jl_value_t *p, jl_value_t *x, jl_value_t *align, jl_value_t *i); -JL_DLLEXPORT jl_value_t 
*jl_atomic_fence(jl_value_t *order); +JL_DLLEXPORT jl_value_t *jl_atomic_fence(jl_value_t *order, jl_value_t *syncscope); JL_DLLEXPORT jl_value_t *jl_atomic_pointerref(jl_value_t *p, jl_value_t *order); JL_DLLEXPORT jl_value_t *jl_atomic_pointerset(jl_value_t *p, jl_value_t *x, jl_value_t *order); JL_DLLEXPORT jl_value_t *jl_atomic_pointerswap(jl_value_t *p, jl_value_t *x, jl_value_t *order); @@ -2010,6 +2010,8 @@ JL_DLLEXPORT int jl_isabspath(const char *in) JL_NOTSAFEPOINT; XX(uninferred_sym) \ XX(unordered_sym) \ XX(unused_sym) \ + XX(singlethread_sym) \ + XX(system_sym) #define XX(name) extern JL_DLLEXPORT jl_sym_t *jl_##name; JL_COMMON_SYMBOLS(XX) diff --git a/src/runtime_intrinsics.c b/src/runtime_intrinsics.c index 31dd3e085033c..d03a97e93d14e 100644 --- a/src/runtime_intrinsics.c +++ b/src/runtime_intrinsics.c @@ -622,9 +622,16 @@ JL_DLLEXPORT jl_value_t *jl_atomic_pointerreplace(jl_value_t *p, jl_value_t *exp return result; } -JL_DLLEXPORT jl_value_t *jl_atomic_fence(jl_value_t *order_sym) +JL_DLLEXPORT jl_value_t *jl_atomic_fence(jl_value_t *order_sym, jl_value_t *syncscope_sym) { JL_TYPECHK(fence, symbol, order_sym); + JL_TYPECHK(fence, symbol, syncscope_sym); + if ((jl_sym_t*)syncscope_sym == jl_singlethread_sym) { + asm volatile ("" : : : "memory"); + return jl_nothing; + } else if ((jl_sym_t*)syncscope_sym != jl_system_sym) { + jl_error("atomic_fence: invalid syncscope"); + } enum jl_memory_order order = jl_get_atomic_order_checked((jl_sym_t*)order_sym, 1, 1); if (order > jl_memory_order_monotonic) jl_fence(); diff --git a/src/signals-mach.c b/src/signals-mach.c index 1ef3e9d23094a..e138ca01f90a0 100644 --- a/src/signals-mach.c +++ b/src/signals-mach.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -891,3 +892,54 @@ JL_DLLEXPORT void jl_profile_stop_timer(void) profile_all_tasks = 0; uv_mutex_unlock(&bt_data_prof_lock); } + +// The mprotect implementation in signals-unix.c does not work on macOS/aarch64, as mentioned. +// This implementation comes from dotnet, but is similarly dependent on undocumented behavior of the OS. +// Copyright (c) .NET Foundation and Contributors +// MIT LICENSE +JL_DLLEXPORT void jl_membarrier(void) { + mach_msg_type_number_t cThreads; + thread_act_t *pThreads; + kern_return_t machret = task_threads(mach_task_self(), &pThreads, &cThreads); + HANDLE_MACH_ERROR("task_threads()", machret); + + uintptr_t sp; + uintptr_t registerValues[128]; + + // Iterate through each of the threads in the list. 
+ for (mach_msg_type_number_t i = 0; i < cThreads; i++) + { + if (__builtin_available (macOS 10.14, iOS 12, tvOS 9, *)) + { + // Request the threads pointer values to force the thread to emit a memory barrier + size_t registers = 128; + machret = thread_get_register_pointer_values(pThreads[i], &sp, ®isters, registerValues); + } + else + { + // fallback implementation for older OS versions +#if defined(_CPU_X86_64_) + x86_thread_state64_t threadState; + mach_msg_type_number_t count = x86_THREAD_STATE64_COUNT; + machret = thread_get_state(pThreads[i], x86_THREAD_STATE64, (thread_state_t)&threadState, &count); +#elif defined(_CPU_AARCH64_) + arm_thread_state64_t threadState; + mach_msg_type_number_t count = ARM_THREAD_STATE64_COUNT; + machret = thread_get_state(pThreads[i], ARM_THREAD_STATE64, (thread_state_t)&threadState, &count); +#else + #error Unexpected architecture +#endif + } + + if (machret == KERN_INSUFFICIENT_BUFFER_SIZE) + { + HANDLE_MACH_ERROR("thread_get_register_pointer_values()", machret); + } + + machret = mach_port_deallocate(mach_task_self(), pThreads[i]); + HANDLE_MACH_ERROR("mach_port_deallocate()", machret); + } + // Deallocate the thread list now we're done with it. + machret = vm_deallocate(mach_task_self(), (vm_address_t)pThreads, cThreads * sizeof(thread_act_t)); + HANDLE_MACH_ERROR("vm_deallocate()", machret); +} diff --git a/src/signals-unix.c b/src/signals-unix.c index 16e70ef0f764e..12f49b42d5df6 100644 --- a/src/signals-unix.c +++ b/src/signals-unix.c @@ -1274,3 +1274,128 @@ JL_DLLEXPORT int jl_repl_raise_sigtstp(void) { return raise(SIGTSTP); } + +// Linux and FreeBSD have compatible membarrier support +#if defined(_OS_LINUX_) || defined(_OS_FREEBSD_) +#if defined(_OS_LINUX_) +# include +# if defined(__has_include) +# if __has_include() +# include +# define membarrier(...) syscall(__NR_membarrier, __VA_ARGS__) +# else +# if defined(__NR_membarrier) +enum membarrier_cmd { + MEMBARRIER_CMD_QUERY = 0, + MEMBARRIER_CMD_PRIVATE_EXPEDITED = (1 << 3), + MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED = (1 << 4), +}; +# define membarrier(...) syscall(__NR_membarrier, __VA_ARGS__) +# else +# warning "Missing linux kernel headers for membarrier syscall, support disabled" +# define membarrier(...) -ENOSYS +# endif +# endif +# else +# include +# endif +#elif defined(_OS_FREEBSD_) +# include +# if __FreeBSD_version >= 1401500 +# include +# else +# define MEMBARRIER_CMD_QUERY 0x00 +# define MEMBARRIER_CMD_PRIVATE_EXPEDITED 0x08 +# define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED 0x10 +# define membarrier(...) -ENOSYS +# endif +#endif + +// Implementation of the `mprotect` based membarrier fallback. +// This is a common fallback based on the observation that `mprotect` happens to +// issue the necessary memory barriers. However, there is no spec that +// guarantees this behavior, and indeed AArch64 macos does not. However, we +// only use it as a fallback here for older versions of Linux and FreeBSD where +// we know that it happens to work. 
+static pthread_mutex_t mprotect_barrier_lock = PTHREAD_MUTEX_INITIALIZER; +static _Atomic(uint64_t) *mprotect_barrier_page = NULL; +static void jl_init_mprotect_membarrier(void) +{ + int result = pthread_mutex_lock(&mprotect_barrier_lock); + assert(result == 0); + if (mprotect_barrier_page == NULL) { + size_t pagesize = jl_getpagesize(); + + mprotect_barrier_page = (_Atomic(uint64_t) *) + mmap(NULL, pagesize, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (mprotect_barrier_page == MAP_FAILED) { + jl_safe_printf("fatal: failed to allocate barrier page.\n"); + abort(); + } + result = mlock(mprotect_barrier_page, pagesize); + if (result != 0) { + jl_safe_printf("fatal: failed to mlock barrier page.\n"); + abort(); + } + } + result = pthread_mutex_unlock(&mprotect_barrier_lock); + assert(result == 0); + (void)result; +} + +static void jl_mprotect_membarrier(void) +{ + int result = pthread_mutex_lock(&mprotect_barrier_lock); + assert(result == 0); + size_t pagesize = jl_getpagesize(); + result = mprotect(mprotect_barrier_page, pagesize, PROT_NONE); + jl_atomic_fetch_add_relaxed(mprotect_barrier_page, 1); + assert(result == 0); + result = mprotect(mprotect_barrier_page, pagesize, PROT_READ | PROT_WRITE); + assert(result == 0); + result = pthread_mutex_unlock(&mprotect_barrier_lock); + assert(result == 0); + (void)result; +} + +// Implementation of `jl_membarrier` +enum membarrier_implementation { + MEMBARRIER_IMPLEMENTATION_UNKNOWN = 0, + MEMBARRIER_IMPLEMENTATION_SYS_MEMBARRIER = 1, + MEMBARRIER_IMPLEMENTATION_MPROTECT = 2 +}; + +static _Atomic(enum membarrier_implementation) membarrier_impl = MEMBARRIER_IMPLEMENTATION_UNKNOWN; + +static enum membarrier_implementation jl_init_membarrier(void) { + int ret = membarrier(MEMBARRIER_CMD_QUERY, 0); + int needed = MEMBARRIER_CMD_PRIVATE_EXPEDITED | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED; + if (ret > 0 && ((ret & needed) == needed)) { + // supported + if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0) == 0) { + // working + jl_atomic_store_relaxed(&membarrier_impl, MEMBARRIER_IMPLEMENTATION_SYS_MEMBARRIER); + return MEMBARRIER_IMPLEMENTATION_SYS_MEMBARRIER; + } + } + jl_init_mprotect_membarrier(); + jl_atomic_store_relaxed(&membarrier_impl, MEMBARRIER_IMPLEMENTATION_MPROTECT); + return MEMBARRIER_IMPLEMENTATION_MPROTECT; +} + +JL_DLLEXPORT void jl_membarrier(void) { + enum membarrier_implementation impl = jl_atomic_load_relaxed(&membarrier_impl); + if (impl == MEMBARRIER_IMPLEMENTATION_UNKNOWN) { + impl = jl_init_membarrier(); + } + if (impl == MEMBARRIER_IMPLEMENTATION_SYS_MEMBARRIER) { + int ret = membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0); + assert(ret); + (void)ret; + } else { + assert(impl == MEMBARRIER_IMPLEMENTATION_MPROTECT); + jl_mprotect_membarrier(); + } +} +#endif diff --git a/src/signals-win.c b/src/signals-win.c index 1702bc338b16f..e0968b35e8555 100644 --- a/src/signals-win.c +++ b/src/signals-win.c @@ -664,3 +664,7 @@ void jl_install_thread_signal_handler(jl_ptls_t ptls) have_backtrace_fiber = 1; } } + +JL_DLLEXPORT void jl_membarrier(void) { + FlushProcessWriteBuffers(); +} diff --git a/test/intrinsics.jl b/test/intrinsics.jl index 5e18c1fb3672a..03ba86f85676e 100644 --- a/test/intrinsics.jl +++ b/test/intrinsics.jl @@ -395,13 +395,13 @@ end end using Base.Experimental: @force_compile -@test_throws ConcurrencyViolationError("invalid atomic ordering") (@force_compile; Core.Intrinsics.atomic_fence(:u)) === nothing -@test_throws ConcurrencyViolationError("invalid atomic ordering") (@force_compile; 
Core.Intrinsics.atomic_fence(Symbol("u", "x"))) === nothing -@test_throws ConcurrencyViolationError("invalid atomic ordering") Core.Intrinsics.atomic_fence(Symbol("u", "x")) === nothing +@test_throws ConcurrencyViolationError("invalid atomic ordering") (@force_compile; Core.Intrinsics.atomic_fence(:u, :system)) === nothing +@test_throws ConcurrencyViolationError("invalid atomic ordering") (@force_compile; Core.Intrinsics.atomic_fence(Symbol("u", "x"), :system)) === nothing +@test_throws ConcurrencyViolationError("invalid atomic ordering") Core.Intrinsics.atomic_fence(Symbol("u", "x"), :system) === nothing for order in (:not_atomic, :monotonic, :acquire, :release, :acquire_release, :sequentially_consistent) - @test Core.Intrinsics.atomic_fence(order) === nothing - @test (order -> Core.Intrinsics.atomic_fence(order))(order) === nothing - @test Base.invokelatest(@eval () -> Core.Intrinsics.atomic_fence($(QuoteNode(order)))) === nothing + @test Core.Intrinsics.atomic_fence(order, :system) === nothing + @test (order -> Core.Intrinsics.atomic_fence(order, :system))(order) === nothing + @test Base.invokelatest(@eval () -> Core.Intrinsics.atomic_fence($(QuoteNode(order)), :system)) === nothing end @test Core.Intrinsics.atomic_pointerref(C_NULL, :sequentially_consistent) === nothing @test (@force_compile; Core.Intrinsics.atomic_pointerref(C_NULL, :sequentially_consistent)) === nothing diff --git a/test/threads_exec.jl b/test/threads_exec.jl index 2780888546964..d9590b76e78c1 100644 --- a/test/threads_exec.jl +++ b/test/threads_exec.jl @@ -464,6 +464,54 @@ function test_fence() end test_fence() +# Test asymmetric thread fences +const asymmetric_test_count = 200_000 +struct AsymmetricFenceTestData + x::AtomicMemory{Int} + y::AtomicMemory{Int} + read_x::AtomicMemory{Int} + read_y::AtomicMemory{Int} +end +function test_asymmetric_fence(data::AsymmetricFenceTestData, cond1, cond2, threadid, it) + if (threadid % 2) == 0 + @atomic :monotonic data.x[it] = 1 + Threads.atomic_fence_heavy() + @atomic :monotonic data.read_y[it] = @atomic :monotonic data.y[it] + wait(cond1) + notify(cond2) + else + @atomic :monotonic data.y[it] = 1 + Threads.atomic_fence_light() + @atomic :monotonic data.read_x[it] = data.x[it] + notify(cond1) + wait(cond2) + end +end +function test_asymmetric_fence(data, cond1, cond2, threadid) + for i = 1:asymmetric_test_count + test_asymmetric_fence(data, cond1, cond2, threadid, i) + end +end +function test_asymmetric_fence() + cond1 = Threads.Event(true) + cond2 = Threads.Event(true) + data = AsymmetricFenceTestData(AtomicMemory{Int}(undef, asymmetric_test_count), + AtomicMemory{Int}(undef, asymmetric_test_count), + AtomicMemory{Int}(undef, asymmetric_test_count), + AtomicMemory{Int}(undef, asymmetric_test_count)) + for i = 1:asymmetric_test_count + @atomic :monotonic data.x[i] = 0 + @atomic :monotonic data.y[i] = 0 + @atomic :monotonic data.read_x[i] = typemax(Int) + @atomic :monotonic data.read_y[i] = typemax(Int) + end + t1 = @Threads.spawn test_asymmetric_fence(data, cond1, cond2, 1) + t2 = @Threads.spawn test_asymmetric_fence(data, cond1, cond2, 2) + wait(t1); wait(t2) + @test !any((data.read_x .== 0) .& (data.read_y .== 0)) +end +test_asymmetric_fence() + # Test load / store with various types let atomictypes = (Int8, Int16, Int32, Int64, Int128, UInt8, UInt16, UInt32, UInt64, UInt128, From 6911fa0c3c3bbafa330feebbd021acf2fcefecd6 Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Fri, 5 Dec 2025 00:12:16 +0000 Subject: [PATCH 2/9] Address review --- NEWS.md | 4 ++ base/atomics.jl | 26 
+++++----- doc/src/base/multi-threading.md | 2 + src/runtime_intrinsics.c | 2 +- src/signals-mach.c | 24 +++------ src/signals-unix.c | 87 +++++++++++++++++---------------- test/threads_exec.jl | 18 ++++--- 7 files changed, 82 insertions(+), 81 deletions(-) diff --git a/NEWS.md b/NEWS.md index bd72e673728f0..571ad6f37ac80 100644 --- a/NEWS.md +++ b/NEWS.md @@ -21,6 +21,10 @@ Command-line option changes Multi-threading changes ----------------------- + - New functions `Threads.atomic_fence_heavy` and `Threads.atomic_fence_light` provide support for + asymmetric atomic fences, speeding up atomic synchronization where one side of the synchronization + runs significantly less often than the other ([#60311]). + Build system changes -------------------- diff --git a/base/atomics.jl b/base/atomics.jl index 155cb38d6d03b..21689e06ba1a1 100644 --- a/base/atomics.jl +++ b/base/atomics.jl @@ -10,7 +10,7 @@ export atomic_add!, atomic_sub!, atomic_and!, atomic_nand!, atomic_or!, atomic_xor!, atomic_max!, atomic_min!, - atomic_fence + atomic_fence, atomic_fence_light, atomic_fence_heavy """ Threads.Atomic{T} @@ -334,23 +334,23 @@ atomic_fence() = Core.Intrinsics.atomic_fence(:sequentially_consistent, :system) """ Threads.atomic_fence_light() -This is a read-optimized sequential-consistency memory fence. -On supported operating systems and architectures, this fence is cheaper -than `Threads.atomic_fence()`, but synchronizes only with -[`atomic_fence_heavy`](@ref) calls from other threads. +Insert the light side of an asymmetric sequential-consistency memory fence. +Asymmetric memory fences are useful in scenarios where one side of the +synchronization runs significantly less often than the other side. Use this +function on the side that runs often and [`atomic_fence_heavy`](@ref) on the +side that runs rarely. + +On supported operating systems and architectures this fence is cheaper than +`Threads.atomic_fence()`, but synchronizes only with [`atomic_fence_heavy`](@ref) +calls from other threads. """ atomic_fence_light() = Core.Intrinsics.atomic_fence(:sequentially_consistent, :singlethread) """ Threads.atomic_fence_heavy() -This is a write-optimized sequential-consistency memory fence. -This fence is significantly more expensive than `Threads.atomic_fence`. -It generally requires a system call and a full interprocessor interrupt -to all other processors in the system. It synchronizes with both -[`atomic_fence_light`](@ref) and [`atomic_fence`](@ref) calls from other threads. - -For further details, see the Linux `membarrier` syscall or the Windows -`FlushProcessWriteBuffers` API. +Insert the heavy side of an asymmetric sequential-consistency memory fence. +Use this function on the side that runs rarely. +See [`atomic_fence_light`](@ref) for more details. """ atomic_fence_heavy() = ccall(:jl_membarrier, Cvoid, ()) diff --git a/doc/src/base/multi-threading.md b/doc/src/base/multi-threading.md index 88dc2b7514a2a..0b7d545e54d08 100644 --- a/doc/src/base/multi-threading.md +++ b/doc/src/base/multi-threading.md @@ -50,6 +50,8 @@ Base.Threads.atomic_xor! Base.Threads.atomic_max! Base.Threads.atomic_min!
Base.Threads.atomic_fence +Base.Threads.atomic_fence_heavy +Base.Threads.atomic_fence_light ``` ## ccall using a libuv threadpool (Experimental) diff --git a/src/runtime_intrinsics.c b/src/runtime_intrinsics.c index d03a97e93d14e..c2abb705fd7c6 100644 --- a/src/runtime_intrinsics.c +++ b/src/runtime_intrinsics.c @@ -626,13 +626,13 @@ JL_DLLEXPORT jl_value_t *jl_atomic_fence(jl_value_t *order_sym, jl_value_t *sync { JL_TYPECHK(fence, symbol, order_sym); JL_TYPECHK(fence, symbol, syncscope_sym); + enum jl_memory_order order = jl_get_atomic_order_checked((jl_sym_t*)order_sym, 1, 1); if ((jl_sym_t*)syncscope_sym == jl_singlethread_sym) { asm volatile ("" : : : "memory"); return jl_nothing; } else if ((jl_sym_t*)syncscope_sym != jl_system_sym) { jl_error("atomic_fence: invalid syncscope"); } - enum jl_memory_order order = jl_get_atomic_order_checked((jl_sym_t*)order_sym, 1, 1); if (order > jl_memory_order_monotonic) jl_fence(); return jl_nothing; diff --git a/src/signals-mach.c b/src/signals-mach.c index e138ca01f90a0..55a430425dce5 100644 --- a/src/signals-mach.c +++ b/src/signals-mach.c @@ -898,22 +898,20 @@ JL_DLLEXPORT void jl_profile_stop_timer(void) // Copyright (c) .NET Foundation and Contributors // MIT LICENSE JL_DLLEXPORT void jl_membarrier(void) { - mach_msg_type_number_t cThreads; - thread_act_t *pThreads; - kern_return_t machret = task_threads(mach_task_self(), &pThreads, &cThreads); - HANDLE_MACH_ERROR("task_threads()", machret); - uintptr_t sp; uintptr_t registerValues[128]; + kern_return_t machret; // Iterate through each of the threads in the list. - for (mach_msg_type_number_t i = 0; i < cThreads; i++) - { + int nthreads = jl_atomic_load_acquire(&jl_n_threads); + for (int tid = 0; tid < nthreads; tid++) { + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; + thread_act_t thread = pthread_mach_thread_np(ptls2->system_id); if (__builtin_available (macOS 10.14, iOS 12, tvOS 9, *)) { // Request the threads pointer values to force the thread to emit a memory barrier size_t registers = 128; - machret = thread_get_register_pointer_values(pThreads[i], &sp, ®isters, registerValues); + machret = thread_get_register_pointer_values(thread, &sp, ®isters, registerValues); } else { @@ -921,11 +919,11 @@ JL_DLLEXPORT void jl_membarrier(void) { #if defined(_CPU_X86_64_) x86_thread_state64_t threadState; mach_msg_type_number_t count = x86_THREAD_STATE64_COUNT; - machret = thread_get_state(pThreads[i], x86_THREAD_STATE64, (thread_state_t)&threadState, &count); + machret = thread_get_state(thread, x86_THREAD_STATE64, (thread_state_t)&threadState, &count); #elif defined(_CPU_AARCH64_) arm_thread_state64_t threadState; mach_msg_type_number_t count = ARM_THREAD_STATE64_COUNT; - machret = thread_get_state(pThreads[i], ARM_THREAD_STATE64, (thread_state_t)&threadState, &count); + machret = thread_get_state(thread, ARM_THREAD_STATE64, (thread_state_t)&threadState, &count); #else #error Unexpected architecture #endif @@ -935,11 +933,5 @@ JL_DLLEXPORT void jl_membarrier(void) { { HANDLE_MACH_ERROR("thread_get_register_pointer_values()", machret); } - - machret = mach_port_deallocate(mach_task_self(), pThreads[i]); - HANDLE_MACH_ERROR("mach_port_deallocate()", machret); } - // Deallocate the thread list now we're done with it. 
- machret = vm_deallocate(mach_task_self(), (vm_address_t)pThreads, cThreads * sizeof(thread_act_t)); - HANDLE_MACH_ERROR("vm_deallocate()", machret); } diff --git a/src/signals-unix.c b/src/signals-unix.c index 12f49b42d5df6..e62f9d738d8ba 100644 --- a/src/signals-unix.c +++ b/src/signals-unix.c @@ -1275,48 +1275,15 @@ JL_DLLEXPORT int jl_repl_raise_sigtstp(void) return raise(SIGTSTP); } -// Linux and FreeBSD have compatible membarrier support -#if defined(_OS_LINUX_) || defined(_OS_FREEBSD_) -#if defined(_OS_LINUX_) -# include -# if defined(__has_include) -# if __has_include() -# include -# define membarrier(...) syscall(__NR_membarrier, __VA_ARGS__) -# else -# if defined(__NR_membarrier) -enum membarrier_cmd { - MEMBARRIER_CMD_QUERY = 0, - MEMBARRIER_CMD_PRIVATE_EXPEDITED = (1 << 3), - MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED = (1 << 4), -}; -# define membarrier(...) syscall(__NR_membarrier, __VA_ARGS__) -# else -# warning "Missing linux kernel headers for membarrier syscall, support disabled" -# define membarrier(...) -ENOSYS -# endif -# endif -# else -# include -# endif -#elif defined(_OS_FREEBSD_) -# include -# if __FreeBSD_version >= 1401500 -# include -# else -# define MEMBARRIER_CMD_QUERY 0x00 -# define MEMBARRIER_CMD_PRIVATE_EXPEDITED 0x08 -# define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED 0x10 -# define membarrier(...) -ENOSYS -# endif -#endif - +#if !defined(_OS_DARWIN_) // Implementation of the `mprotect` based membarrier fallback. // This is a common fallback based on the observation that `mprotect` happens to // issue the necessary memory barriers. However, there is no spec that -// guarantees this behavior, and indeed AArch64 macos does not. However, we -// only use it as a fallback here for older versions of Linux and FreeBSD where -// we know that it happens to work. +// guarantees this behavior, and indeed AArch64 Darwin does not (so we don't use it +// there). However, we only use it as a fallback here for older versions of +// Linux and FreeBSD where we know that it happens to work. We also use it as a +// fallback for unknown Unix systems under the assumption that it will work, +// but this is not guaranteed. 
static pthread_mutex_t mprotect_barrier_lock = PTHREAD_MUTEX_INITIALIZER; static _Atomic(uint64_t) *mprotect_barrier_page = NULL; static void jl_init_mprotect_membarrier(void) @@ -1335,7 +1302,7 @@ static void jl_init_mprotect_membarrier(void) } result = mlock(mprotect_barrier_page, pagesize); if (result != 0) { - jl_safe_printf("fatal: failed to mlock barrier page.\n"); + jl_safe_printf("fatal: failed to mlock barrier page (try increasing RLIMIT_MEMLOCK with `ulimit -l`).\n"); abort(); } } @@ -1349,15 +1316,43 @@ static void jl_mprotect_membarrier(void) int result = pthread_mutex_lock(&mprotect_barrier_lock); assert(result == 0); size_t pagesize = jl_getpagesize(); - result = mprotect(mprotect_barrier_page, pagesize, PROT_NONE); + result = mprotect(mprotect_barrier_page, pagesize, PROT_READ | PROT_WRITE); jl_atomic_fetch_add_relaxed(mprotect_barrier_page, 1); assert(result == 0); - result = mprotect(mprotect_barrier_page, pagesize, PROT_READ | PROT_WRITE); + result = mprotect(mprotect_barrier_page, pagesize, PROT_NONE); assert(result == 0); result = pthread_mutex_unlock(&mprotect_barrier_lock); assert(result == 0); (void)result; } +#endif + +// Linux and FreeBSD have compatible membarrier support +#if defined(_OS_LINUX_) || defined(_OS_FREEBSD_) +#if defined(_OS_LINUX_) +# include +# if defined(__NR_membarrier) +enum membarrier_cmd { + MEMBARRIER_CMD_QUERY = 0, + MEMBARRIER_CMD_PRIVATE_EXPEDITED = (1 << 3), + MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED = (1 << 4), +}; +# define membarrier(...) syscall(__NR_membarrier, __VA_ARGS__) +# else +# warning "Missing linux kernel headers for membarrier syscall, support disabled" +# define membarrier(...) (errno = ENOSYS, -1) +# endif +#elif defined(_OS_FREEBSD_) +# include +# if __FreeBSD_version >= 1401500 +# include +# else +# define MEMBARRIER_CMD_QUERY 0x00 +# define MEMBARRIER_CMD_PRIVATE_EXPEDITED 0x08 +# define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED 0x10 +# define membarrier(...) 
(errno = ENOSYS, -1) +# endif +#endif // Implementation of `jl_membarrier` enum membarrier_implementation { @@ -1391,11 +1386,17 @@ JL_DLLEXPORT void jl_membarrier(void) { } if (impl == MEMBARRIER_IMPLEMENTATION_SYS_MEMBARRIER) { int ret = membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0); - assert(ret); + assert(ret == 0); (void)ret; } else { assert(impl == MEMBARRIER_IMPLEMENTATION_MPROTECT); jl_mprotect_membarrier(); } } +#elif !defined(_OS_DARWIN_) +JL_DLLEXPORT void jl_membarrier(void) { + if (!mprotect_barrier_page) + jl_init_mprotect_membarrier(); + jl_mprotect_membarrier(); +} #endif diff --git a/test/threads_exec.jl b/test/threads_exec.jl index d9590b76e78c1..6fb4a9f2b7b89 100644 --- a/test/threads_exec.jl +++ b/test/threads_exec.jl @@ -465,8 +465,8 @@ end test_fence() # Test asymmetric thread fences -const asymmetric_test_count = 200_000 struct AsymmetricFenceTestData + n::Int x::AtomicMemory{Int} y::AtomicMemory{Int} read_x::AtomicMemory{Int} @@ -482,23 +482,25 @@ function test_asymmetric_fence(data::AsymmetricFenceTestData, cond1, cond2, thre else @atomic :monotonic data.y[it] = 1 Threads.atomic_fence_light() - @atomic :monotonic data.read_x[it] = data.x[it] + @atomic :monotonic data.read_x[it] = @atomic :monotonic data.x[it] notify(cond1) wait(cond2) end end -function test_asymmetric_fence(data, cond1, cond2, threadid) - for i = 1:asymmetric_test_count +function test_asymmetric_fence(data::AsymmetricFenceTestData, cond1, cond2, threadid) + for i = 1:data.n test_asymmetric_fence(data, cond1, cond2, threadid, i) end end function test_asymmetric_fence() + asymmetric_test_count = 200_000 cond1 = Threads.Event(true) cond2 = Threads.Event(true) - data = AsymmetricFenceTestData(AtomicMemory{Int}(undef, asymmetric_test_count), - AtomicMemory{Int}(undef, asymmetric_test_count), - AtomicMemory{Int}(undef, asymmetric_test_count), - AtomicMemory{Int}(undef, asymmetric_test_count)) + data = AsymmetricFenceTestData(asymmetric_test_count, + AtomicMemory{Int}(undef, asymmetric_test_count), + AtomicMemory{Int}(undef, asymmetric_test_count), + AtomicMemory{Int}(undef, asymmetric_test_count), + AtomicMemory{Int}(undef, asymmetric_test_count)) for i = 1:asymmetric_test_count @atomic :monotonic data.x[i] = 0 @atomic :monotonic data.y[i] = 0 From 4900004770605aaa86fe7742b51baf81a650aa87 Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Fri, 28 Nov 2025 20:36:23 +0000 Subject: [PATCH 3/9] Very WIP: Architecture for robust cancellation This commit is a first sketch for what I would like to do for robust cancellation (i.e. "Making ^C just work"). At this point it's more of a sketch than a real PR, but I think I've done enough of the design for a design discussion. The first thing I should say is that the goal of this PR is very narrowly to make ^C work well. As part of that, we're taking a bit of a step towards structured concurrency, but I am not intending this PR to be a full implementation of that. Given that some of this has been beaten to death in previous issues, I will also not do my usual motivation overview, instead jumping straight into the implementation. As I said, the motivation is just to make ^C work reliably at this point. Broadly, when we're trying to cancel a task, it'll be in one of two categories: 1. Waiting for some other operation to complete (e.g. an IO operation, another task, an external event, etc.). Here, the actual cancellation itself is not so difficult (after all the task is not running, but suspended in a somewhat well-defined place).
However, robust cancellation requires us to potentially propagate the cancellation signal down the wait tree, since the operation we actually want to cancel may not be the root task, but may instead be some operation being performed by the task we're waiting on (and we'd prefer not to leak those operations and have rogue tasks going around performing potentially side-effecting operations). 2. Currently running and doing some computation. The core problem is not really one of propagation (after all the long-running computation is probably what we're wanting to cancel), but rather how to do the cancellation without state corruption. A lot of the crashiness of our existing ^C implementation is just that we would simply inject an exception in places that are not expecting to handle it. For a full solution to the problem, we need to have an answer for both of these points. I will begin with the second, since the first builds upon it. This PR introduces the concept of a `cancellation request` and a `cancellation point`. Each task has a `cancellation_request` field that can be set externally (e.g. by ^C). Any task performing computation should regularly check this field and abort its computation if a cancellation request is pending. For this purpose, the PR provides the `@cancel_check` macro. This macro turns a pending cancellation request into a well-modeled exception. Package authors should insert a call to the macro into any long-running loops. However, there is of course some overhead to the check and it is therefore inappropriate for tight inner loops. We attempt to address this with compiler support. Note that this part is currently incompletely implemented, so the following describes the design rather than the current state of the PR. Consider the cancel_check macro: ``` macro cancel_check() quote local req = Core.cancellation_point!() if req !== nothing throw(conform_cancellation_request(req)) end end end ``` where `cancellation_point!` is a new intrinsic that defines a cancellation point. The compiler is semantically permitted to extend the cancellation point across any following effect_free calls (note that for transitivity reasons, the effect is not exactly the same, but it is morally equivalent). Upon passing a `cancellation_point!`, the system will set the current task's `reset_ctx` to this cancellation point. If a cancellation request occurs before the `reset_ctx` is cleared, the task's execution will be reset to the nearest cancellation point. I proposed this mechanism in #52291. Additionally, the `reset_ctx` can in principle be used to establish scoped cancellation handlers for external C libraries as well, although I suspect that there are not many C libraries that are actually reset-safe in the required manner (since allocation is not). Note that `cancellation_point!` is also intended to be a yield point in order to facilitate the ^C mechanism described below. However, this is not currently implemented. Turning our attention now to the first of the two cases mentioned above, we tweak the task's existing `queue` reference to become a generic (atomic) "waitee" reference. The queue is required to be obtainable from this object via the new `waitqueue` generic function. To cancel a `waiter` waiting for a waitable `waitee` object, we 1. Set the waiter's cancellation request 2. Load the `waitee` and call a new generic function `cancel_wait!`, which shall do whatever synchronization and internal bookkeeping is required to remove the task from the wait-queue and then resume the task. 3.
The `waiter` resumes in the wait code. It may now decide how and whether to propagate the cancellation to the object it was just waiting on. Note that this may involve re-queuing a wait (to wait for the cancellation of `waitee` to complete). The idea here is that this provides a well-defined context for cancellation-propagation logic to run. I wanted to avoid having any cancellation propagation logic run in parallel with actual wait code. How the cancellation propagates is a bit of a policy question and not one that I fully intend to address in this PR. My plan is to implement a basic state machine that works well for ^C (by requesting safe cancellation immediately and then requesting increasingly unsafe modes of cancellation upon timeout or repeated ^C), but I anticipate that external libraries will want to create their own cancellation request state machines, which the system supports. The implementation is incomplete, so I will not describe it here yet. One may note that there are a significant number of additional fully dynamic dispatches in this scheme (at least `waitqueue` and `cancel_wait!`, and possibly more in the future). However, note that these dynamic dispatches are confined to the cancellation path, which is not throughput-sensitive (but is latency-sensitive). The handling of ^C is delegated to a dedicated task that then gets notified from the signal handler when a SIGINT is received (similar to the existing profile listener task). There is a little bit of an additional wrinkle in that we need some logic to kick out a computational task to its nearest cancellation point if we do not have any idle threads. This logic is not yet implemented. ``` julia> sleep(1000) ^CERROR: CancellationRequest: Safe Cancellation (CANCEL_REQUEST_SAFE) Stacktrace: [1] macro expansion @ ./condition.jl:134 [inlined] [2] _trywait(t::Timer) @ Base ./asyncevent.jl:195 [3] wait @ ./asyncevent.jl:204 [inlined] [4] sleep(sec::Int64) @ Base ./asyncevent.jl:322 [5] top-level scope @ REPL[1]:1 julia> function find_collatz_counterexample() i = 1 while true j = i while true @Base.cancel_check j = collatz(j) j == 1 && break j == i && error("$j is a collatz counterexample") end i += 1 end end find_collatz_counterexample (generic function with 1 method) julia> find_collatz_counterexample() ^CERROR: CancellationRequest: Safe Cancellation (CANCEL_REQUEST_SAFE) Stacktrace: [1] macro expansion @ ./condition.jl:134 [inlined] [2] find_collatz_counterexample() @ Main ./REPL[2]:6 [3] top-level scope @ REPL[3]:1 julia> wait(@async sleep(100)) ^CERROR: TaskFailedException Stacktrace: [1] wait(t::Task; throw::Bool) @ Base ./task.jl:367 [2] wait(t::Task) @ Base ./task.jl:360 [3] top-level scope @ REPL[4]:0 [4] macro expansion @ task.jl:729 [inlined] nested task error: CancellationRequest: Safe Cancellation (CANCEL_REQUEST_SAFE) Stacktrace: [1] macro expansion @ ./condition.jl:134 [inlined] [2] _trywait(t::Timer) @ Base ./asyncevent.jl:195 [3] wait @ ./asyncevent.jl:204 [inlined] [4] sleep @ ./asyncevent.jl:322 [inlined] [5] (::var"#2#3")() @ Main ./REPL[4]:1 julia> @sync begin @async sleep(100) @async find_collatz_counterexample() end ^CERROR: nested task error: CancellationRequest: Safe Cancellation (CANCEL_REQUEST_SAFE) Stacktrace: [1] macro expansion @ ./task.jl:1234 [inlined] [2] _trywait(t::Timer) @ Base ~/julia-cancel/usr/share/julia/base/asyncevent.jl:195 [3] wait @ ./asyncevent.jl:203 [inlined] [4] sleep @ ./asyncevent.jl:321 [inlined] [5] (::var"#45#46")() @ Main ./REPL[26]:3 ...and 1 more exception.
Stacktrace: [1] sync_cancel!(c::Channel{Any}, t::Task, cr::Any, c_ex::CompositeException) @ Base ~/julia-cancel/usr/share/julia/base/task.jl:1454 [2] sync_end(c::Channel{Any}) @ Base ~/julia-cancel/usr/share/julia/base/task.jl:608 [3] macro expansion @ ./task.jl:663 [inlined] [4] (::var"#43#44")() @ Main ./REPL[5] ``` As noted above, the `@Base.cancel_check` is not intended to be required in the inner loop. Rather, the compiler is expected to extend the cancelation point from the start of the loop to the entire function. However, this is not yet implemented. --- base/Base.jl | 30 +++++ base/asyncevent.jl | 1 + base/condition.jl | 37 +++++- base/linked_list.jl | 44 +++++-- base/task.jl | 215 ++++++++++++++++++++++++++++++++-- src/builtin_proto.h | 1 + src/builtins.c | 10 ++ src/codegen.cpp | 56 +++++++++ src/jltypes.c | 14 ++- src/julia_locks.h | 1 + src/julia_threads.h | 13 +- src/llvm-late-gc-lowering.cpp | 2 +- src/llvm-pass-helpers.cpp | 3 +- src/llvm-pass-helpers.h | 1 + src/signal-handling.c | 18 +++ src/signals-unix.c | 17 ++- src/task.c | 2 + stdlib/REPL/src/REPL.jl | 4 + 18 files changed, 425 insertions(+), 44 deletions(-) diff --git a/base/Base.jl b/base/Base.jl index 57d9915239fdf..eeb047125ce34 100644 --- a/base/Base.jl +++ b/base/Base.jl @@ -371,6 +371,35 @@ function start_profile_listener() ccall(:jl_set_peek_cond, Cvoid, (Ptr{Cvoid},), cond.handle) end +function sigint_listener(cond::AsyncCondition) + while _trywait(cond) + # The SIGINT handler should have set a cancellation request on the roottask + cr = @atomic :acquire roottask.cancellation_request + cr === nothing && continue + cancel!(roottask, cr) + end + nothing +end + +function start_sigint_listener() + cond = AsyncCondition() + uv_unref(cond.handle) + t = errormonitor(Threads.@spawn(sigint_listener(cond))) + atexit() do + # destroy this callback when exiting + ccall(:jl_set_sigint_cond, Cvoid, (Ptr{Cvoid},), C_NULL) + # this will prompt any ongoing or pending event to flush also + close(cond) + # error-propagation is not needed, since the errormonitor will handle printing that better + t === current_task() || _wait(t) + end + finalizer(cond) do c + # if something goes south, still make sure we aren't keeping a reference in C to this + ccall(:jl_set_sigint_cond, Cvoid, (Ptr{Cvoid},), C_NULL) + end + ccall(:jl_set_sigint_cond, Cvoid, (Ptr{Cvoid},), cond.handle) +end + function __init__() # Base library init global _atexit_hooks_finished = false @@ -394,6 +423,7 @@ function __init__() # triggering a profile via signals is not implemented on windows start_profile_listener() end + start_sigint_listener() _require_world_age[] = get_world_counter() # Prevent spawned Julia process from getting stuck waiting on Tracy to connect. 
delete!(ENV, "JULIA_WAIT_FOR_TRACY") diff --git a/base/asyncevent.jl b/base/asyncevent.jl index 68ae27049adc0..fa9bb520cc855 100644 --- a/base/asyncevent.jl +++ b/base/asyncevent.jl @@ -192,6 +192,7 @@ function _trywait(t::Union{Timer, AsyncCondition}) unlock(t.cond) unpreserve_handle(t) end + @cancel_check() end iolock_end() end diff --git a/base/condition.jl b/base/condition.jl index fd771c9be346a..fff9943966268 100644 --- a/base/condition.jl +++ b/base/condition.jl @@ -69,6 +69,8 @@ struct GenericCondition{L<:AbstractLock} GenericCondition(l::AbstractLock) = new{typeof(l)}(IntrusiveLinkedList{Task}(), l) end +waitqueue(c::GenericCondition) = ILLRef(c.waitq, c) + show(io::IO, c::GenericCondition) = print(io, GenericCondition, "(", c.lock, ")") assert_havelock(c::GenericCondition) = assert_havelock(c.lock) @@ -84,9 +86,9 @@ function _wait2(c::GenericCondition, waiter::Task, first::Bool=false) ct = current_task() assert_havelock(c) if first - pushfirst!(c.waitq, waiter) + pushfirst!(waitqueue(c), waiter) else - push!(c.waitq, waiter) + push!(waitqueue(c), waiter) end # since _wait2 is similar to schedule, we should observe the sticky bit now if waiter.sticky && Threads.threadid(waiter) == 0 && !GC.in_finalizer() @@ -125,6 +127,15 @@ proceeding. """ function wait end +macro cancel_check() + quote + local req = Core.cancellation_point!() + if req !== nothing + throw(conform_cancellation_request(req)) + end + end +end + """ wait(c::GenericCondition; first::Bool=false) @@ -133,18 +144,32 @@ Wait for [`notify`](@ref) on `c` and return the `val` parameter passed to `notif If the keyword `first` is set to `true`, the waiter will be put _first_ in line to wake up on `notify`. Otherwise, `wait` has first-in-first-out (FIFO) behavior. """ -function wait(c::GenericCondition; first::Bool=false) +function wait(c::GenericCondition; first::Bool=false, cancel_check::Bool=true) ct = current_task() _wait2(c, ct, first) token = unlockall(c.lock) try return wait() catch - q = ct.queue; q === nothing || Base.list_deletefirst!(q::IntrusiveLinkedList{Task}, ct) + q = ct.queue; q === c && Base.list_deletefirst!(waitqueue(c), ct) rethrow() finally relockall(c.lock, token) end + cancel_check && @cancel_check() +end + +function cancel_wait!(c::GenericCondition, t::Task) + @assert (@atomic :monotonic t.cancellation_request) !== nothing + lock(c) + if t.queue !== c + unlock(c) + return false + end + Base.list_deletefirst!(waitqueue(c), t) + enq_work(t) + unlock(c) + return true end """ @@ -160,8 +185,8 @@ Return the count of tasks woken up. Return 0 if no tasks are waiting on `conditi function notify(c::GenericCondition, @nospecialize(arg), all, error) assert_havelock(c) cnt = 0 - while !isempty(c.waitq) - t = popfirst!(c.waitq) + while !isempty(waitqueue(c)) + t = popfirst!(waitqueue(c)) schedule(t, arg, error=error) cnt += 1 all || break diff --git a/base/linked_list.jl b/base/linked_list.jl index c477dc56bdb2b..cf3be4115f73d 100644 --- a/base/linked_list.jl +++ b/base/linked_list.jl @@ -1,12 +1,18 @@ # This file is a part of Julia. 
License is MIT: https://julialang.org/license mutable struct IntrusiveLinkedList{T} - # Invasive list requires that T have a field `.next >: U{T, Nothing}` and `.queue >: U{ILL{T}, Nothing}` + # Invasive list requires that T have a field `.next >: U{T, Nothing}` and `.queue::Any` head::Union{T, Nothing} tail::Union{T, Nothing} IntrusiveLinkedList{T}() where {T} = new{T}(nothing, nothing) end +struct ILLRef{T} + list::IntrusiveLinkedList{T} + waitee::Any # Invariant: waitqueue(waitee) === list +end +waitqueue(list::IntrusiveLinkedList{T}) where {T} = ILLRef(list, list) + #const list_append!! = append! #const list_deletefirst! = delete! @@ -49,9 +55,13 @@ function list_append!!(q::IntrusiveLinkedList{T}, q2::IntrusiveLinkedList{T}) wh return q end -function push!(q::IntrusiveLinkedList{T}, val::T) where T +isempty(qr::ILLRef{T}) where T = isempty(qr.list) +length(qr::ILLRef{T}) where T = length(qr.list) + +function push!(qr::ILLRef{T}, val::T) where T val.queue === nothing || error("val already in a list") - val.queue = q + val.queue = qr.waitee + q = qr.list tail = q.tail if tail === nothing q.head = q.tail = val @@ -62,9 +72,10 @@ function push!(q::IntrusiveLinkedList{T}, val::T) where T return q end -function pushfirst!(q::IntrusiveLinkedList{T}, val::T) where T +function pushfirst!(qr::ILLRef{T}, val::T) where T val.queue === nothing || error("val already in a list") - val.queue = q + val.queue = qr.waitee + q = qr.list head = q.head if head === nothing q.head = q.tail = val @@ -75,21 +86,23 @@ function pushfirst!(q::IntrusiveLinkedList{T}, val::T) where T return q end -function pop!(q::IntrusiveLinkedList{T}) where {T} - val = q.tail::T +function pop!(qr::ILLRef{T}) where {T} + val = qr.list.tail::T list_deletefirst!(q, val) # expensive! return val end -function popfirst!(q::IntrusiveLinkedList{T}) where {T} - val = q.head::T - list_deletefirst!(q, val) # cheap +function popfirst!(qr::ILLRef{T}) where {T} + val = qr.list.head::T + list_deletefirst!(qr, val) # cheap return val end # this function assumes `val` is found in `q` -function list_deletefirst!(q::IntrusiveLinkedList{T}, val::T) where T - val.queue === q || return +function list_deletefirst!(qr::ILLRef{T}, val::T) where T +# (val.queue === qr.waitee || +# val.queue === qr.list) || throw(ConcurrencyViolationError("attempt to delete from wrong list")) + q = qr.list head = q.head::T if head === val if q.tail::T === val @@ -115,6 +128,13 @@ function list_deletefirst!(q::IntrusiveLinkedList{T}, val::T) where T return q end +# TODO: Delete this compatibility wrapper +list_deletefirst!(q::IntrusiveLinkedList{T}, val::T) where T = list_deletefirst!(ILLRef(q, q), val) +push!(q::IntrusiveLinkedList{T}, val::T) where T = push!(ILLRef(q, q), val) +pushfirst!(q::IntrusiveLinkedList{T}, val::T) where T = pushfirst!(ILLRef(q, q), val) +pop!(q::IntrusiveLinkedList{T}) where T = pop!(ILLRef(q, q)) +popfirst!(q::IntrusiveLinkedList{T}) where T = popfirst!(ILLRef(q, q)) + #function list_deletefirst!(q::Array{T}, val::T) where T # i = findfirst(isequal(val), q) # i === nothing || deleteat!(q, i) diff --git a/base/task.jl b/base/task.jl index 244a8f70a768a..077d571a5fb6f 100644 --- a/base/task.jl +++ b/base/task.jl @@ -302,14 +302,14 @@ function task_local_storage(body::Function, key, val) end # just wait for a task to be done, no error propagation -function _wait(t::Task) +function _wait(t::Task; expected_cancellation = nothing) t === current_task() && Core.throw(ConcurrencyViolationError("deadlock detected: cannot wait on current task")) if 
!istaskdone(t) donenotify = t.donenotify::ThreadSynchronizer lock(donenotify) try - while !istaskdone(t) - wait(donenotify) + while !istaskdone(t) && cancellation_request() === expected_cancellation + wait(donenotify; cancel_check=false) end finally unlock(donenotify) @@ -359,12 +359,28 @@ Throws a `ConcurrencyViolationError` if `t` is the currently running task, to pr """ function wait(t::Task; throw=true) _wait(t) + cr = cancellation_request() + if cr !== nothing + propagate_cancellation!(t, cr) + end if throw && istaskfailed(t) Core.throw(TaskFailedException(t)) end nothing end +""" + wait_nocancel(t::Task) + +Like `wait`, but do not propagate cancellation of this task to the waited-on task. +""" +function wait_nocancel(t::Task; throw=true) + _wait(t) + if throw && istaskfailed(t) + Core.throw(TaskFailedException(t)) + end +end + # Wait multiple tasks """ @@ -587,6 +603,10 @@ function sync_end(c::Channel{Any}) r = take!(c) if isa(r, Task) _wait(r) + cr = cancellation_request() + if cr !== nothing + return sync_cancel!(c, r, cr, @isdefined(c_ex) ? c_ex : CompositeException()) + end if istaskfailed(r) if !@isdefined(c_ex) c_ex = CompositeException() @@ -899,12 +919,13 @@ mutable struct IntrusiveLinkedListSynchronized{T} lock::Threads.SpinLock IntrusiveLinkedListSynchronized{T}() where {T} = new(IntrusiveLinkedList{T}(), Threads.SpinLock()) end -isempty(W::IntrusiveLinkedListSynchronized) = isempty(W.queue) -length(W::IntrusiveLinkedListSynchronized) = length(W.queue) +waitqueue(l::IntrusiveLinkedListSynchronized) = ILLRef(l.queue, l) +isempty(W::IntrusiveLinkedListSynchronized) = isempty(waitqueue(W)) +length(W::IntrusiveLinkedListSynchronized) = length(waitqueue(W)) function push!(W::IntrusiveLinkedListSynchronized{T}, t::T) where T lock(W.lock) try - push!(W.queue, t) + push!(waitqueue(W), t) finally unlock(W.lock) end @@ -913,7 +934,7 @@ end function pushfirst!(W::IntrusiveLinkedListSynchronized{T}, t::T) where T lock(W.lock) try - pushfirst!(W.queue, t) + pushfirst!(waitqueue(W), t) finally unlock(W.lock) end @@ -922,7 +943,7 @@ end function pop!(W::IntrusiveLinkedListSynchronized) lock(W.lock) try - return pop!(W.queue) + return pop!(waitqueue(W)) finally unlock(W.lock) end @@ -930,7 +951,7 @@ end function popfirst!(W::IntrusiveLinkedListSynchronized) lock(W.lock) try - return popfirst!(W.queue) + return popfirst!(waitqueue(W)) finally unlock(W.lock) end @@ -938,7 +959,7 @@ end function list_deletefirst!(W::IntrusiveLinkedListSynchronized{T}, t::T) where T lock(W.lock) try - list_deletefirst!(W.queue, t) + list_deletefirst!(waitqueue(W), t) finally unlock(W.lock) end @@ -953,7 +974,10 @@ workqueue_for(tid::Int) = Workqueues[tid] function enq_work(t::Task) (t._state === task_state_runnable && t.queue === nothing) || error("schedule: Task not runnable") + _enq_work(t) +end +function _enq_work(t::Task) # Sticky tasks go into their thread's work queue. if t.sticky tid = Threads.threadid(t) @@ -1272,3 +1296,174 @@ function maybe_record_enqueued!(t::Task) end return t end + +## Cancellation + +struct CancellationRequest + request::UInt8 +end + +""" + CANCEL_REQUEST_SAFE + +Request safe cancelation of the current task. If the task is waiting for any +other resources, it will request safe cancellation of any such resources and +wait for the cancellation of such resources to be completed. + +As a result, if either the task itself or any of its dependent resources are +currently unable to process cancelation, the request may hang and a more +aggressive cancelation method may be required. 
However, in general _SAFE +should be tried first. +""" +const CANCEL_REQUEST_SAFE = CancellationRequest(0x0) + +""" + CANCEL_REQUEST_ACK + +Set by the task itself to indicate that a (safe) cancellation request was +received and acknowledged, but that there are dependent tasks for whom +cancelation is still pending. +""" +const CANCEL_REQUEST_ACK = CancellationRequest(0x1) + +""" + CANCEL_REQUEST_QUERY + +Request that the system create an asynchronous report of why the task is currently +not able to be canceled. The report will be provided in the ->cancelation_request +field of the current task (as long as this field is still CANCEL_REQUEST_QUERY). + +N.B.: Transition to CANCEL_REQUEST_QUERY is only allowed from CANCEL_REQUEST_ACK. + Once the waiting task has read the cancelation report, it may set the cancelation + request back to CANCEL_REQUEST_ACK. +""" +const CANCEL_REQUEST_QUERY = CancellationRequest(0x2) + +""" + CANCEL_REQUEST_ABANDON_EXTERNAL + +Request a cancelation that will cease waiting for any external resources (e.g. I/O objects) +without going through a safe cancelation procedure for such resources. However, the +task will wait for any internal computational tasks to complete cancelation. + +This is a middleground between CANCEL_REQUEST_SAFE and CANCEL_REQUEST_ABANDON_ALL. As external +I/O is often engineered for robustness in case of sudden disapperance of peers +""" +const CANCEL_REQUEST_ABANDON_EXTERNAL = CancellationRequest(0x3) + +""" + CANCEL_REQUEST_ABANDON_ALL + +Request a cancelation that will cease waiting for all external resources and all unacknowledged +internal tasks. Such tasks will be frozen and become unschedulable in the future. + +!!! warning + If any canceled task has acquired locks or other resources that are contested, this method of + cancelation may leak such resources and create deadlocks in future code. It is intended as a + last-resort method to recover a system, but the necessity of this operation should in general + be considered a bug (e.g. due to insufficient cancellation points in computationally-heavy code). +""" +const CANCEL_REQUEST_ABANDON_ALL = CancellationRequest(0x4) + +function Base.showerror(io::IO, cr::CancellationRequest) + print(io, "CancellationRequest: ") + if cr === CANCEL_REQUEST_SAFE + print(io, "Safe Cancellation (CANCEL_REQUEST_SAFE)") + else + print(io, "Unknown ($(cr.request))") + end +end + +function conform_cancellation_request(@nospecialize(cr)) + if isa(cr, UInt8) + return CancellationRequest(cr) + end + return cr +end + +""" + cancellation_request() + +Returns the cancellation request for the current task or `nothing` if no +cancellation has been requested. If a cancellation request is present, it is +loaded with acquire semantics. +""" +function cancellation_request() + ct = current_task() + req = @atomic :monotonic ct.cancellation_request + req === nothing && return req + cr = @atomic :acquire ct.cancellation_request + return conform_cancellation_request(cr) +end + +""" + Core.cancellation_point!() + +Like [`cancellation_request`](@ref), but additionally gives the optimizer license +to establish this point as a cancellation reset point. If safe to do, the runtime +will attempt to unwind execution to the nearest preceeding cancellation point +when a cancellation is requested. +""" +Core.cancellation_point! 
+ +function cancel!(t::Task, crequest=CANCEL_REQUEST_SAFE) + @atomic :release t.cancellation_request = crequest + while !istaskdone(t) + waitee = t.queue + waitee === nothing && (yield(); continue) + invokelatest(cancel_wait!, waitee, t) && break + end +end + +""" + Base.reset_cancellation!() + +Resets the cancellation status of the current task. +This should only be used from the root task after normal operation has been +resumed (e.g. by returning control to the user). +""" +function reset_cancellation!() + ct = current_task() + @assert ct === roottask + @atomic :release ct.cancellation_request = nothing +end + +function propagate_cancellation!(t::Task, crequest) + if crequest != CANCEL_REQUEST_SAFE + error("Not yet supported") + end + cancel!(t, crequest) + _wait(t; expected_cancellation=crequest) +end + +@noinline function sync_cancel!(c::Channel{Any}, t::Task, @nospecialize(cr), c_ex::CompositeException) + if cr !== CANCEL_REQUEST_SAFE + error("Not yet supported") + end + waitees = Any[t] + cancel!(t, cr) + while isready(c) + r = take!(c) + cancel!(r, cr) + push!(waitees, r) + end + close(c) + for r in waitees + if isa(r, Task) + _wait(r; expected_cancellation=cr) + if istaskfailed(r) + push!(c_ex, TaskFailedException(r)) + end + else + try + wait(r) + catch e + push!(c_ex, e) + end + end + end + if !isempty(c_ex) + throw(c_ex) + end + return nothing +end diff --git a/src/builtin_proto.h b/src/builtin_proto.h index ff634149a06f6..ee0d8fca8cd8a 100644 --- a/src/builtin_proto.h +++ b/src/builtin_proto.h @@ -76,6 +76,7 @@ extern "C" { XX(tuple,"tuple") \ XX(typeassert,"typeassert") \ XX(typeof,"typeof") \ + XX(cancellation_point,"cancellation_point!") #define DECLARE_BUILTIN(cname,jlname) \ JL_CALLABLE(jl_f_##cname); diff --git a/src/builtins.c b/src/builtins.c index 7b29580c76086..80ca3f3c23a1e 100644 --- a/src/builtins.c +++ b/src/builtins.c @@ -2496,6 +2496,16 @@ JL_CALLABLE(jl_f_intrinsic_call) abort(); } +JL_CALLABLE(jl_f_cancellation_point) +{ + JL_NARGS(cancellation_point, 0, 0); + jl_task_t *ct = jl_current_task; + jl_value_t *cr = jl_atomic_load_relaxed(&ct->cancellation_request); + if (cr == NULL || cr == jl_nothing) + return jl_nothing; + return jl_atomic_load_acquire(&ct->cancellation_request); +} + JL_DLLEXPORT const char *jl_intrinsic_name(int f) { switch ((enum intrinsic)f) { diff --git a/src/codegen.cpp b/src/codegen.cpp index ed427f999e8e3..c193da2c436d6 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -1197,6 +1197,17 @@ static const auto jl_write_barrier_func = new JuliaFunction<>{ }, }; +static const auto jl_cancellation_point_func = new JuliaFunction<>{ + "julia.cancellation_point", + [](LLVMContext &C) { + return FunctionType::get(getInt32Ty(C), {}, false); + }, + [](LLVMContext &C) { return AttributeList::get(C, + Attributes(C, {Attribute::ReturnsTwice}), + AttributeSet(), + None); } +}; + static const auto jlisa_func = new JuliaFunction<>{ XSTR(jl_isa), [](LLVMContext &C) { @@ -4943,6 +4954,51 @@ static bool emit_builtin_call(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f, return true; } + else if (f == BUILTIN(cancellation_point) && nargs == 0) { + // Emit the cancellation point intrinsic call first + ctx.builder.CreateCall(prepare_call(jl_cancellation_point_func)); + + // Now do the same as the runtime version: + // 1. 
Load cancellation_request with relaxed ordering for fast path check + Value *ct = get_current_task(ctx); + Value *cr_ptr = emit_ptrgep(ctx, ct, offsetof(jl_task_t, cancellation_request), "cancellation_request"); + jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_gcframe); + LoadInst *cr_relaxed = ctx.builder.CreateAlignedLoad(ctx.types().T_prjlvalue, cr_ptr, ctx.types().alignof_ptr); + cr_relaxed->setOrdering(AtomicOrdering::Monotonic); + ai.decorateInst(cr_relaxed); + + // 2. Check if cr == NULL || cr == jl_nothing + Value *is_null = ctx.builder.CreateIsNull(cr_relaxed); + Value *nothing_val = literal_pointer_val(ctx, jl_nothing); + Value *is_nothing = ctx.builder.CreateICmpEQ(decay_derived(ctx, cr_relaxed), decay_derived(ctx, nothing_val)); + Value *no_cancel = ctx.builder.CreateOr(is_null, is_nothing); + + // Save current basic block before branching + BasicBlock *currBB = ctx.builder.GetInsertBlock(); + + // Create basic blocks for the branch + BasicBlock *has_cancel_bb = BasicBlock::Create(ctx.builder.getContext(), "has_cancellation", ctx.f); + BasicBlock *merge_bb = BasicBlock::Create(ctx.builder.getContext(), "cancellation_merge", ctx.f); + + ctx.builder.CreateCondBr(no_cancel, merge_bb, has_cancel_bb); + + // In the has_cancel case, do an acquire load + ctx.builder.SetInsertPoint(has_cancel_bb); + LoadInst *cr_acquire = ctx.builder.CreateAlignedLoad(ctx.types().T_prjlvalue, cr_ptr, ctx.types().alignof_ptr); + cr_acquire->setOrdering(AtomicOrdering::Acquire); + ai.decorateInst(cr_acquire); + ctx.builder.CreateBr(merge_bb); + + // Merge the results + ctx.builder.SetInsertPoint(merge_bb); + PHINode *result = ctx.builder.CreatePHI(ctx.types().T_prjlvalue, 2); + result->addIncoming(nothing_val, currBB); + result->addIncoming(cr_acquire, has_cancel_bb); + + *ret = mark_julia_type(ctx, result, /*boxed*/ true, rt); + return true; + } + return false; } diff --git a/src/jltypes.c b/src/jltypes.c index db75be1c9db0a..bb5a0beb98494 100644 --- a/src/jltypes.c +++ b/src/jltypes.c @@ -3767,7 +3767,7 @@ void jl_init_types(void) JL_GC_DISABLED NULL, jl_any_type, jl_emptysvec, - jl_perm_symsvec(27, + jl_perm_symsvec(28, "next", "queue", "storage", @@ -3794,8 +3794,9 @@ void jl_init_types(void) JL_GC_DISABLED "first_enqueued_at", "last_started_running_at", "running_time_ns", - "finished_at"), - jl_svec(27, + "finished_at", + "cancellation_request"), + jl_svec(28, jl_any_type, jl_any_type, jl_any_type, @@ -3822,16 +3823,17 @@ void jl_init_types(void) JL_GC_DISABLED jl_uint64_type, jl_uint64_type, jl_uint64_type, - jl_uint64_type), + jl_uint64_type, + jl_any_type), jl_emptysvec, 0, 1, 6); XX(task); jl_value_t *listt = jl_new_struct(jl_uniontype_type, jl_task_type, jl_nothing_type); jl_svecset(jl_task_type->types, 0, listt); // Set field 20 (metrics_enabled) as const - // Set fields 8 (_state) and 24-27 (metric counters) as atomic + // Set fields 8 (_state) and 24-27 (metric counters), 28 (cancellation_request) as atomic const static uint32_t task_constfields[1] = { 0b00000000000010000000000000000000 }; - const static uint32_t task_atomicfields[1] = { 0b00000111100000000000000010000000 }; + const static uint32_t task_atomicfields[1] = { 0b00001111100000000000000010000000 }; jl_task_type->name->constfields = task_constfields; jl_task_type->name->atomicfields = task_atomicfields; diff --git a/src/julia_locks.h b/src/julia_locks.h index a4b5fd96b8fb4..4fe75543ad6b9 100644 --- a/src/julia_locks.h +++ b/src/julia_locks.h @@ -104,6 +104,7 @@ static inline void jl_mutex_init(jl_mutex_t *lock, 
const char *name) JL_NOTSAFEP #define JL_LOCK(m) jl_mutex_lock(m) #define JL_UNLOCK(m) jl_mutex_unlock(m) #define JL_LOCK_NOGC(m) jl_mutex_lock_nogc(m) +#define JL_TRYLOCK_NOGC(m) jl_mutex_trylock_nogc(m) #define JL_UNLOCK_NOGC(m) jl_mutex_unlock_nogc(m) JL_DLLEXPORT void jl_lock_value(jl_mutex_t *v) JL_NOTSAFEPOINT; diff --git a/src/julia_threads.h b/src/julia_threads.h index 364931e43d2e9..df6a095baf4ec 100644 --- a/src/julia_threads.h +++ b/src/julia_threads.h @@ -225,8 +225,11 @@ typedef struct _jl_handler_t jl_handler_t; typedef struct _jl_task_t { JL_DATA_TYPE - jl_value_t *next; // invasive linked list for scheduler - jl_value_t *queue; // invasive linked list for scheduler + // This invasive linked list is used by the scheduler. The fields are protected + // by the lock of the parent object of the containing queue. + jl_value_t *next; + jl_value_t *queue; + jl_value_t *tls; jl_value_t *donenotify; jl_value_t *result; @@ -252,6 +255,9 @@ typedef struct _jl_task_t { // timestamp this task finished (i.e. entered state DONE or FAILED). _Atomic(uint64_t) finished_at; + // Cancellation request - can be an arbitary julia value, but the runtime recognizes + // CANCEL_REQUEST_ enum values. + _Atomic(jl_value_t *) cancellation_request; // hidden state: // id of owning thread - does not need to be defined until the task runs @@ -280,6 +286,9 @@ typedef struct _jl_task_t { jl_handler_t *eh; // saved thread state jl_ucontext_t ctx; // pointer into stkbuf, if suspended + // current reset point for cancellation. Technically, we only need volatile + // here, but _Atomic makes the intent clearer. + volatile _jl_ucontext_t *reset_ctx; } jl_task_t; JL_DLLEXPORT void *jl_get_ptls_states(void); diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp index ae1351ae41ca1..90334e60251da 100644 --- a/src/llvm-late-gc-lowering.cpp +++ b/src/llvm-late-gc-lowering.cpp @@ -2068,7 +2068,7 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S, bool *CFGModified) { } Value *callee = CI->getCalledOperand(); if (callee && (callee == gc_flush_func || callee == gc_preserve_begin_func - || callee == gc_preserve_end_func)) { + || callee == gc_preserve_end_func || callee == cancel_point_func)) { /* No replacement */ } else if (pointer_from_objref_func != nullptr && callee == pointer_from_objref_func) { auto *obj = CI->getOperand(0); diff --git a/src/llvm-pass-helpers.cpp b/src/llvm-pass-helpers.cpp index a6a16a7f4956c..ed288328ea908 100644 --- a/src/llvm-pass-helpers.cpp +++ b/src/llvm-pass-helpers.cpp @@ -32,7 +32,7 @@ JuliaPassContext::JuliaPassContext() gc_preserve_begin_func(nullptr), gc_preserve_end_func(nullptr), pointer_from_objref_func(nullptr), gc_loaded_func(nullptr), alloc_obj_func(nullptr), typeof_func(nullptr), write_barrier_func(nullptr), pop_handler_noexcept_func(nullptr), - call_func(nullptr), call2_func(nullptr), call3_func(nullptr), module(nullptr) + call_func(nullptr), call2_func(nullptr), call3_func(nullptr), cancel_point_func(nullptr), module(nullptr) { } @@ -61,6 +61,7 @@ void JuliaPassContext::initFunctions(Module &M) call_func = M.getFunction("julia.call"); call2_func = M.getFunction("julia.call2"); call3_func = M.getFunction("julia.call3"); + cancel_point_func = M.getFunction("julia.cancellation_point"); } void JuliaPassContext::initAll(Module &M) diff --git a/src/llvm-pass-helpers.h b/src/llvm-pass-helpers.h index d79470818c287..d9c96e6451595 100644 --- a/src/llvm-pass-helpers.h +++ b/src/llvm-pass-helpers.h @@ -64,6 +64,7 @@ struct JuliaPassContext { llvm::Function 
*call_func; llvm::Function *call2_func; llvm::Function *call3_func; + llvm::Function *cancel_point_func; // Creates a pass context. Type and function pointers // are set to `nullptr`. Metadata nodes are initialized. diff --git a/src/signal-handling.c b/src/signal-handling.c index 1da687654dd81..39eb3ceedd53b 100644 --- a/src/signal-handling.c +++ b/src/signal-handling.c @@ -441,6 +441,24 @@ static void jl_check_profile_autostop(void) } } +jl_mutex_t sigint_cond_lock; +static uv_async_t *sigint_cond_loc; +JL_DLLEXPORT void jl_set_sigint_cond(uv_async_t *cond) +{ + JL_LOCK_NOGC(&sigint_cond_lock); + sigint_cond_loc = cond; + JL_UNLOCK_NOGC(&sigint_cond_lock); +} + +static void deliver_sigint_notification(void) +{ + if (JL_TRYLOCK_NOGC(&sigint_cond_lock)) { + if (sigint_cond_loc != NULL) + uv_async_send(sigint_cond_loc); + JL_UNLOCK_NOGC(&sigint_cond_lock); + } +} + static void stack_overflow_warning(void) { jl_safe_printf("Warning: detected a stack overflow; program state may be corrupted, so further execution might be unreliable.\n"); diff --git a/src/signals-unix.c b/src/signals-unix.c index e62f9d738d8ba..ec34d97cd89bc 100644 --- a/src/signals-unix.c +++ b/src/signals-unix.c @@ -418,7 +418,6 @@ JL_NO_ASAN static void segv_handler(int sig, siginfo_t *info, void *context) } else if (jl_safepoint_consume_sigint()) { jl_clear_force_sigint(); - jl_throw_in_ctx(ct, jl_interrupt_exception, sig, context); } return; } @@ -989,14 +988,20 @@ static void *signal_listener(void *arg) #endif if (sig == SIGINT) { - if (jl_ignore_sigint()) { - continue; - } - else if (exit_on_sigint) { + if (exit_on_sigint) { critical = 1; } else { - jl_try_deliver_sigint(); + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; + // Set the cancellation request, then notify the sigint listener + // that we want to cancel - if the task is not currently running, + // the sigint listener will take care of safely moving us through + // the cancellation state machine. + // TODO: If there is only one thread, we may need to ask the currently + // running task to yield, so that the sigint listener can run. 
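+                // N.B.: the raw value 0x00 stored below corresponds to
+                // Base.CANCEL_REQUEST_SAFE (CancellationRequest(0x0)) on the Julia side;
+                // the listener recovers it via conform_cancellation_request.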
+ jl_atomic_store_release(&ptls2->root_task->cancellation_request, + jl_box_uint8(0x00)); + deliver_sigint_notification(); continue; } } diff --git a/src/task.c b/src/task.c index 18d21b2343053..68ede8ff55e90 100644 --- a/src/task.c +++ b/src/task.c @@ -1151,6 +1151,7 @@ JL_DLLEXPORT jl_task_t *jl_new_task(jl_value_t *start, jl_value_t *completion_fu jl_atomic_store_relaxed(&t->running_time_ns, 0); jl_atomic_store_relaxed(&t->finished_at, 0); jl_timing_task_init(t); + jl_atomic_store_relaxed(&t->cancellation_request, jl_nothing); if (t->ctx.copy_stack) t->ctx.copy_ctx = NULL; @@ -1603,6 +1604,7 @@ jl_task_t *jl_init_root_task(jl_ptls_t ptls, void *stack_lo, void *stack_hi) jl_atomic_store_relaxed(&ct->first_enqueued_at, 0); jl_atomic_store_relaxed(&ct->last_started_running_at, 0); } + jl_atomic_store_relaxed(&ct->cancellation_request, jl_nothing); ptls->root_task = ct; jl_atomic_store_relaxed(&ptls->current_task, ct); JL_GC_PROMISE_ROOTED(ct); diff --git a/stdlib/REPL/src/REPL.jl b/stdlib/REPL/src/REPL.jl index 0c31315e9bea1..2eb8168b47d04 100644 --- a/stdlib/REPL/src/REPL.jl +++ b/stdlib/REPL/src/REPL.jl @@ -451,6 +451,10 @@ function repl_backend_loop(backend::REPLBackend, get_module::Function) while true tls = task_local_storage() tls[:SOURCE_PATH] = nothing + # TODO: Support cancellation scopes for non-root tasks + if current_task() === Base.roottask + Base.reset_cancellation!() + end ast_or_func, show_value = take!(backend.repl_channel) if show_value == -1 # exit flag From 29f503b43ff97c35bd927caf20a34ada774dae4f Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Wed, 3 Dec 2025 01:09:34 +0000 Subject: [PATCH 4/9] WIP --- Compiler/src/effects.jl | 53 ++++++++++++----- Compiler/src/ssair/show.jl | 4 +- base/asyncevent.jl | 10 +++- base/condition.jl | 36 +++++++----- base/experimental.jl | 2 +- base/linked_list.jl | 14 ++++- base/task.jl | 117 ++++++++++++++++++++++++++++++++----- src/codegen.cpp | 2 +- src/signal-handling.c | 7 ++- src/task.c | 24 ++++++++ test/cancellation.jl | 78 +++++++++++++++++++++++++ 11 files changed, 297 insertions(+), 50 deletions(-) create mode 100644 test/cancellation.jl diff --git a/Compiler/src/effects.jl b/Compiler/src/effects.jl index 3db778943f10a..4879a9981524a 100644 --- a/Compiler/src/effects.jl +++ b/Compiler/src/effects.jl @@ -11,28 +11,32 @@ The output represents the state of different effect properties in the following - `+e` (green): `ALWAYS_TRUE` - `-e` (red): `ALWAYS_FALSE` - `?e` (yellow): `EFFECT_FREE_IF_INACCESSIBLEMEMONLY` -3. `nothrow` (`n`): +3. `reset_safe` (`re`): + - `+re` (green): `ALWAYS_TRUE` + - `-re` (red): `ALWAYS_FALSE` + - `?re` (yellow): `RESET_SAFE_IF_INACCESSIBLEMEMONLY` +4. `nothrow` (`n`): - `+n` (green): `true` - `-n` (red): `false` -4. `terminates` (`t`): +5. `terminates` (`t`): - `+t` (green): `true` - `-t` (red): `false` -5. `notaskstate` (`s`): +6. `notaskstate` (`s`): - `+s` (green): `true` - `-s` (red): `false` -6. `inaccessiblememonly` (`m`): +7. `inaccessiblememonly` (`m`): - `+m` (green): `ALWAYS_TRUE` - `-m` (red): `ALWAYS_FALSE` - `?m` (yellow): `INACCESSIBLEMEM_OR_ARGMEMONLY` -7. `noub` (`u`): +8. `noub` (`u`): - `+u` (green): `true` - `-u` (red): `false` - `?u` (yellow): `NOUB_IF_NOINBOUNDS` -8. `:nonoverlayed` (`o`): +9. `:nonoverlayed` (`o`): - `+o` (green): `ALWAYS_TRUE` - `-o` (red): `ALWAYS_FALSE` - `?o` (yellow): `CONSISTENT_OVERLAY` -9. `:nortcall` (`r`): +10. `:nortcall` (`r`): - `+r` (green): `true` - `-r` (red): `false` """ @@ -62,6 +66,10 @@ following meanings: will not be refined anyway. 
* `EFFECT_FREE_IF_INACCESSIBLEMEMONLY`: the `:effect-free`-ness of this method can later be refined to `ALWAYS_TRUE` in a case when `:inaccessiblememonly` is proven. +- `reset_safe::UInt8` + * The execution of this function may be interrupted and reset to an earlier cancellation + point at any point in the function. The interpretation is similar to `:effect_free`, + but has different guarantees. - `nothrow::Bool`: this method is guaranteed to not throw an exception. If the execution of this method may raise `MethodError`s and similar exceptions, then the method is not considered as `:nothrow`. @@ -119,6 +127,7 @@ $(effects_key_string) struct Effects consistent::UInt8 effect_free::UInt8 + reset_safe::UInt8 nothrow::Bool terminates::Bool notaskstate::Bool @@ -129,6 +138,7 @@ struct Effects function Effects( consistent::UInt8, effect_free::UInt8, + reset_safe::UInt8, nothrow::Bool, terminates::Bool, notaskstate::Bool, @@ -139,6 +149,7 @@ struct Effects return new( consistent, effect_free, + reset_safe, nothrow, terminates, notaskstate, @@ -175,14 +186,18 @@ const NOUB_IF_NOINBOUNDS = 0x01 << 1 # :nonoverlayed bits const CONSISTENT_OVERLAY = 0x01 << 1 -const EFFECTS_TOTAL = Effects(ALWAYS_TRUE, ALWAYS_TRUE, true, true, true, ALWAYS_TRUE, ALWAYS_TRUE, ALWAYS_TRUE, true) -const EFFECTS_THROWS = Effects(ALWAYS_TRUE, ALWAYS_TRUE, false, true, true, ALWAYS_TRUE, ALWAYS_TRUE, ALWAYS_TRUE, true) -const EFFECTS_UNKNOWN = Effects(ALWAYS_FALSE, ALWAYS_FALSE, false, false, false, ALWAYS_FALSE, ALWAYS_FALSE, ALWAYS_TRUE, false) # unknown mostly, but it's not overlayed at least (e.g. it's not a call) +# :reset_safe bits +const RESET_SAFE_IF_INACCESSIBLEMEMONLY = 0x01 << 1 -function Effects(effects::Effects=Effects( - ALWAYS_FALSE, ALWAYS_FALSE, false, false, false, ALWAYS_FALSE, ALWAYS_FALSE, ALWAYS_FALSE, false); +const EFFECTS_TOTAL = Effects(ALWAYS_TRUE, ALWAYS_TRUE, ALWAYS_TRUE, true, true, true, ALWAYS_TRUE, ALWAYS_TRUE, ALWAYS_TRUE, true) +const EFFECTS_THROWS = Effects(ALWAYS_TRUE, ALWAYS_TRUE, ALWAYS_TRUE, false, true, true, ALWAYS_TRUE, ALWAYS_TRUE, ALWAYS_TRUE, true) +const EFFECTS_UNKNOWN = Effects(ALWAYS_FALSE, ALWAYS_FALSE, ALWAYS_FALSE, false, false, false, ALWAYS_FALSE, ALWAYS_FALSE, ALWAYS_TRUE, false) # unknown mostly, but it's not overlayed at least (e.g. 
it's not a call) +const EFFECTS_MINIMAL = Effects(ALWAYS_FALSE, ALWAYS_FALSE, ALWAYS_FALSE, false, false, false, ALWAYS_FALSE, ALWAYS_FALSE, ALWAYS_FALSE, false) + +function Effects(effects::Effects=EFFECTS_MINIMAL; consistent::UInt8 = effects.consistent, effect_free::UInt8 = effects.effect_free, + reset_safe::UInt8 = effects.reset_safe, nothrow::Bool = effects.nothrow, terminates::Bool = effects.terminates, notaskstate::Bool = effects.notaskstate, @@ -193,6 +208,7 @@ function Effects(effects::Effects=Effects( return Effects( consistent, effect_free, + reset_safe, nothrow, terminates, notaskstate, @@ -225,6 +241,14 @@ function is_better_effects(new::Effects, old::Effects) elseif new.effect_free != old.effect_free return false end + if new.reset_safe == ALWAYS_TRUE + any_improved |= old.reset_safe != ALWAYS_TRUE + elseif new.reset_safe == RESET_SAFE_IF_INACCESSIBLEMEMONLY + old.reset_safe == ALWAYS_TRUE && return false + any_improved |= old.reset_safe != RESET_SAFE_IF_INACCESSIBLEMEMONLY + elseif new.reset_safe != old.reset_safe + return false + end if new.nothrow any_improved |= !old.nothrow elseif new.nothrow != old.nothrow @@ -276,6 +300,7 @@ function merge_effects(old::Effects, new::Effects) return Effects( merge_effectbits(old.consistent, new.consistent), merge_effectbits(old.effect_free, new.effect_free), + merge_effectbits(old.reset_safe, new.reset_safe), merge_effectbits(old.nothrow, new.nothrow), merge_effectbits(old.terminates, new.terminates), merge_effectbits(old.notaskstate, new.notaskstate), @@ -345,13 +370,15 @@ function encode_effects(e::Effects) ((e.inaccessiblememonly % UInt32) << 8) | ((e.noub % UInt32) << 10) | ((e.nonoverlayed % UInt32) << 12) | - ((e.nortcall % UInt32) << 14) + ((e.nortcall % UInt32) << 14) | + ((e.reset_safe % UInt32) << 15) end function decode_effects(e::UInt32) return Effects( UInt8((e >> 0) & 0x07), UInt8((e >> 3) & 0x03), + UInt8((e >> 15) & 0x03), Bool((e >> 5) & 0x01), Bool((e >> 6) & 0x01), Bool((e >> 7) & 0x01), diff --git a/Compiler/src/ssair/show.jl b/Compiler/src/ssair/show.jl index 2947a381be959..e8e732ec73af6 100644 --- a/Compiler/src/ssair/show.jl +++ b/Compiler/src/ssair/show.jl @@ -1062,7 +1062,7 @@ function show_ir(io::IO, compact::IncrementalCompact, config::IRShowConfig=defau finish_show_ir(io, uncompacted_cfg, config) end -function effectbits_letter(effects::Effects, name::Symbol, suffix::Char) +function effectbits_letter(effects::Effects, name::Symbol, suffix::Union{Char, String}) ft = fieldtype(Effects, name) if ft === UInt8 prefix = getfield(effects, name) === ALWAYS_TRUE ? 
'+' : @@ -1094,6 +1094,8 @@ function Base.show(io::IO, e::Effects) print(io, ',') printstyled(io, effectbits_letter(e, :effect_free, 'e'); color=effectbits_color(e, :effect_free)) print(io, ',') + printstyled(io, effectbits_letter(e, :reset_safe, "re"); color=effectbits_color(e, :reset_safe)) + print(io, ',') printstyled(io, effectbits_letter(e, :nothrow, 'n'); color=effectbits_color(e, :nothrow)) print(io, ',') printstyled(io, effectbits_letter(e, :terminates, 't'); color=effectbits_color(e, :terminates)) diff --git a/base/asyncevent.jl b/base/asyncevent.jl index fa9bb520cc855..6ac8e79b63ff8 100644 --- a/base/asyncevent.jl +++ b/base/asyncevent.jl @@ -183,7 +183,7 @@ function _trywait(t::Union{Timer, AsyncCondition}) set = t.set if !set && t.handle != C_NULL # wait for set or handle, but not the isopen flag iolock_end() - set = wait(t.cond) + set = wait(t.cond; waitee=t) unlock(t.cond) iolock_begin() lock(t.cond) @@ -192,7 +192,6 @@ function _trywait(t::Union{Timer, AsyncCondition}) unlock(t.cond) unpreserve_handle(t) end - @cancel_check() end iolock_end() end @@ -200,8 +199,13 @@ function _trywait(t::Union{Timer, AsyncCondition}) return set end +cancel_wait!(t::Union{Timer, AsyncCondition}, task::Task) = + cancel_wait!(t.cond, task, false; waitee=t) + function wait(t::Union{Timer, AsyncCondition}) - _trywait(t) || throw(EOFError()) + ok = _trywait(t) + @cancel_check + ok || throw(EOFError()) nothing end diff --git a/base/condition.jl b/base/condition.jl index fff9943966268..b4aead93faba3 100644 --- a/base/condition.jl +++ b/base/condition.jl @@ -82,13 +82,13 @@ islocked(c::GenericCondition) = islocked(c.lock) lock(f, c::GenericCondition) = lock(f, c.lock) # have waiter wait for c -function _wait2(c::GenericCondition, waiter::Task, first::Bool=false) +function _wait2(c::GenericCondition, waiter::Task, waitee=c, first::Bool=false) ct = current_task() assert_havelock(c) if first - pushfirst!(waitqueue(c), waiter) + pushfirst!(ILLRef(waitqueue(c), waitee), waiter) else - push!(waitqueue(c), waiter) + push!(ILLRef(waitqueue(c), waitee), waiter) end # since _wait2 is similar to schedule, we should observe the sticky bit now if waiter.sticky && Threads.threadid(waiter) == 0 && !GC.in_finalizer() @@ -130,9 +130,7 @@ function wait end macro cancel_check() quote local req = Core.cancellation_point!() - if req !== nothing - throw(conform_cancellation_request(req)) - end + req !== nothing && handle_cancellation!(req) end end @@ -144,9 +142,9 @@ Wait for [`notify`](@ref) on `c` and return the `val` parameter passed to `notif If the keyword `first` is set to `true`, the waiter will be put _first_ in line to wake up on `notify`. Otherwise, `wait` has first-in-first-out (FIFO) behavior. 
""" -function wait(c::GenericCondition; first::Bool=false, cancel_check::Bool=true) +function wait(c::GenericCondition; first::Bool=false, waitee=c) ct = current_task() - _wait2(c, ct, first) + _wait2(c, ct, waitee, first) token = unlockall(c.lock) try return wait() @@ -156,18 +154,30 @@ function wait(c::GenericCondition; first::Bool=false, cancel_check::Bool=true) finally relockall(c.lock, token) end - cancel_check && @cancel_check() end -function cancel_wait!(c::GenericCondition, t::Task) +function cancel_wait!(c::GenericCondition, t::Task; waitee = c) @assert (@atomic :monotonic t.cancellation_request) !== nothing lock(c) - if t.queue !== c + if t.queue !== waitee unlock(c) return false end - Base.list_deletefirst!(waitqueue(c), t) - enq_work(t) + Base.list_deletefirst!(ILLRef(waitqueue(c), waitee), t) + schedule(t, conform_cancellation_request(t.cancellation_request), error=true) + unlock(c) + return true +end + +function cancel_wait!(c::GenericCondition, t::Task, @nospecialize(val); waitee=c) + @assert (@atomic :monotonic t.cancellation_request) !== nothing + lock(c) + if t.queue !== waitee + unlock(c) + return false + end + Base.list_deletefirst!(ILLRef(waitqueue(c), waitee), t) + schedule(t, val) unlock(c) return true end diff --git a/base/experimental.jl b/base/experimental.jl index 2deb3bc76af6c..f722baf16dc21 100644 --- a/base/experimental.jl +++ b/base/experimental.jl @@ -646,7 +646,7 @@ millisecond. """ function wait_with_timeout(c::GenericCondition; first::Bool=false, timeout::Real=0.0) ct = current_task() - Base._wait2(c, ct, first) + Base._wait2(c, ct, c, first) token = Base.unlockall(c.lock) timer::Union{Timer, Nothing} = nothing diff --git a/base/linked_list.jl b/base/linked_list.jl index cf3be4115f73d..9df72622e7957 100644 --- a/base/linked_list.jl +++ b/base/linked_list.jl @@ -9,8 +9,9 @@ end struct ILLRef{T} list::IntrusiveLinkedList{T} - waitee::Any # Invariant: waitqueue(waitee) === list + waitee::Any # Invariant: waitqueue(waitee).list === list end +ILLRef(ref::ILLRef, @nospecialize(waitee)) = typeof(ref)(ref.list, waitee) waitqueue(list::IntrusiveLinkedList{T}) where {T} = ILLRef(list, list) #const list_append!! = append! @@ -128,6 +129,17 @@ function list_deletefirst!(qr::ILLRef{T}, val::T) where T return q end +function in(val::T, list::IntrusiveLinkedList{T}) where T + head = list.head + while head !== nothing + if val === head + return true + end + head = head.next + end + return false +end + # TODO: Delete this compatibility wrapper list_deletefirst!(q::IntrusiveLinkedList{T}, val::T) where T = list_deletefirst!(ILLRef(q, q), val) push!(q::IntrusiveLinkedList{T}, val::T) where T = push!(ILLRef(q, q), val) diff --git a/base/task.jl b/base/task.jl index 077d571a5fb6f..ae1f9f4b3ff74 100644 --- a/base/task.jl +++ b/base/task.jl @@ -145,9 +145,11 @@ end # task states -const task_state_runnable = UInt8(0) -const task_state_done = UInt8(1) -const task_state_failed = UInt8(2) +const task_state_runnable = UInt8(0) +const task_state_done = UInt8(1) +const task_state_failed = UInt8(2) +# like _failed, but allows schedule to succeed +const task_state_cancelled = UInt8(3) @inline function getproperty(t::Task, field::Symbol) if field === :state @@ -159,6 +161,8 @@ const task_state_failed = UInt8(2) return :done elseif st === task_state_failed return :failed + elseif st === task_state_cancelled + return :cancelled else @assert false end @@ -250,7 +254,10 @@ true !!! compat "Julia 1.3" This function requires at least Julia 1.3. 
""" -istaskfailed(t::Task) = ((@atomic :acquire t._state) === task_state_failed) +function istaskfailed(t::Task) + state = (@atomic :acquire t._state) + return state === task_state_failed || state === task_state_cancelled +end Threads.threadid(t::Task) = Int(ccall(:jl_get_task_tid, Int16, (Any,), t)+1) function Threads.threadpool(t::Task) @@ -309,7 +316,7 @@ function _wait(t::Task; expected_cancellation = nothing) lock(donenotify) try while !istaskdone(t) && cancellation_request() === expected_cancellation - wait(donenotify; cancel_check=false) + wait(donenotify; waitee=t) end finally unlock(donenotify) @@ -317,6 +324,7 @@ function _wait(t::Task; expected_cancellation = nothing) end nothing end +cancel_wait!(waitee::Task, waiter::Task) = cancel_wait!(waitee.donenotify, waiter, nothing; waitee) # have `waiter` wait for `t` function _wait2(t::Task, waiter::Task) @@ -973,7 +981,17 @@ const Workqueue = Workqueues[1] # default work queue is thread 1 // TODO: deprec workqueue_for(tid::Int) = Workqueues[tid] function enq_work(t::Task) - (t._state === task_state_runnable && t.queue === nothing) || error("schedule: Task not runnable") + state = t._state + if state === task_state_cancelled + # When canelled, we allow `enq_work`, but simply transition to failed state. + # All other task cleanup is already done. + state = (@atomicreplace t._state task_state_cancelled => task_state_failed).old + # Catch double `schedule` calls on cancelled tasks. + state === task_state_cancelled && return + end + if !(state === task_state_runnable && t.queue === nothing) + error("schedule: Task not runnable") + end _enq_work(t) end @@ -1066,18 +1084,19 @@ true """ function schedule(t::Task, @nospecialize(arg); error=false) # schedule a task to be (re)started with the given value or exception - t._state === task_state_runnable || Base.error("schedule: Task not runnable") - if error - q = t.queue; q === nothing || list_deletefirst!(q::IntrusiveLinkedList{Task}, t) - setfield!(t, :result, arg) - setfield!(t, :_isexception, true) - else - t.queue === nothing || Base.error("schedule: Task not runnable") - setfield!(t, :result, arg) + state = t._state + if t._state === task_state_runnable + if error + q = t.queue; q === nothing || list_deletefirst!(q::IntrusiveLinkedList{Task}, t) + setfield!(t, :result, arg) + setfield!(t, :_isexception, true) + else + t.queue === nothing || Base.error("schedule: Task not runnable") + setfield!(t, :result, arg) + end end # [task] created -scheduled-> wait_time - maybe_record_enqueued!(t) - enq_work(t) + schedule(t) return t end @@ -1365,6 +1384,17 @@ internal tasks. Such tasks will be frozen and become unschedulable in the future """ const CANCEL_REQUEST_ABANDON_ALL = CancellationRequest(0x4) +""" + CANCEL_REQUEST_YIELD + +Request that the task yield to the scheduler at the next cancellation point to +allow another task to run its cancellation propagation logic. The cancelled task +itself will reset to ordinary operation before yielding, but may of course be +canceled by said other task before it resumes operation. 
+""" +const CANCEL_REQUEST_YIELD = CancellationRequest(0x5) + + function Base.showerror(io::IO, cr::CancellationRequest) print(io, "CancellationRequest: ") if cr === CANCEL_REQUEST_SAFE @@ -1381,6 +1411,18 @@ function conform_cancellation_request(@nospecialize(cr)) return cr end +# This is the slow path of @cancel_check +@noinline function handle_cancellation!(@nospecialize(_req)) + req = conform_cancellation_request(_req) + if req === CANCEL_REQUEST_YIELD + @atomicreplace :sequentially_consistent :monotonic current_task().cancellation_request _req => nothing + yield() + req = cancellation_request() + end + req === nothing && return + throw(req) +end + """ cancellation_request() @@ -1407,12 +1449,55 @@ when a cancellation is requested. Core.cancellation_point! function cancel!(t::Task, crequest=CANCEL_REQUEST_SAFE) + # TODO: Raise task priority @atomic :release t.cancellation_request = crequest + # TODO: SYS_membarrier() ? + # Special case: If the task hasn't started yet at this point, we want to set + # it up to cancel any waits, but we need to be a bit careful with concurrent + # starts of the task. + if !istaskstarted(t) + t.result = crequest + t._isexception = true + if (@atomicreplace :sequentially_consistent :monotonic t._state task_state_runnable => task_state_cancelled).success + lock(t.donenotify) + notify(t.donenotify) + unlock(t.donenotify) + end + return + end while !istaskdone(t) waitee = t.queue waitee === nothing && (yield(); continue) invokelatest(cancel_wait!, waitee, t) && break end + if t.sticky + # If this task is sticky, it won't be able to run if the task currently + # running on its thread is blocking. Use the cancellation mechanism to + # try and pre-empt that task. + # N.B.: This is a best-effort attempt; the task we end up with may get + # descheduled before we get around to cancelling it. However, that's + # fine - it's not a correctness issue to deschedule the task. The + # important thing is that the thread re-enter the scheduler to pick up + # our cancelled task. + # In the future, we may want to use the same mechanism for more general + # pre-emption, but this helps avoid situations where tasks that have + # cancellation points, but no yield points become uncancellable. + tid = Threads.threadid(t) + if tid != 0 + ccall(:jl_preempt_thread_task, Cvoid, (Int16,), (tid - 1) % Int16) + end + end +end + +function cancel_wait!(q::StickyWorkqueue, t::Task) + # Tasks in the workqueue are runnable - we do not cancel the wait, + # but we do need to check whether it's in there + lock(q.lock) + try + return (t in q.queue) + finally + unlock(q.lock) + end end """ diff --git a/src/codegen.cpp b/src/codegen.cpp index c193da2c436d6..eb3a7912ee081 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -4969,7 +4969,7 @@ static bool emit_builtin_call(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f, // 2. 
Check if cr == NULL || cr == jl_nothing Value *is_null = ctx.builder.CreateIsNull(cr_relaxed); - Value *nothing_val = literal_pointer_val(ctx, jl_nothing); + Value *nothing_val = track_pjlvalue(ctx, literal_pointer_val(ctx, jl_nothing)); Value *is_nothing = ctx.builder.CreateICmpEQ(decay_derived(ctx, cr_relaxed), decay_derived(ctx, nothing_val)); Value *no_cancel = ctx.builder.CreateOr(is_null, is_nothing); diff --git a/src/signal-handling.c b/src/signal-handling.c index 39eb3ceedd53b..f555bd475755d 100644 --- a/src/signal-handling.c +++ b/src/signal-handling.c @@ -453,8 +453,13 @@ JL_DLLEXPORT void jl_set_sigint_cond(uv_async_t *cond) static void deliver_sigint_notification(void) { if (JL_TRYLOCK_NOGC(&sigint_cond_lock)) { - if (sigint_cond_loc != NULL) + if (sigint_cond_loc != NULL) { uv_async_send(sigint_cond_loc); + // IO only runs on one thread, which may currently be busy - try + // to preempt it, so that the IO loop has a chance to run and deliver + // this notification. + jl_preempt_thread_task(jl_atomic_load_relaxed(&io_loop_tid)); + } JL_UNLOCK_NOGC(&sigint_cond_lock); } } diff --git a/src/task.c b/src/task.c index 68ede8ff55e90..dbf98d3450a6e 100644 --- a/src/task.c +++ b/src/task.c @@ -1210,6 +1210,17 @@ void jl_init_tasks(void) JL_GC_DISABLED static void NOINLINE JL_NORETURN _start_task(void); #endif +static void NOINLINE _handle_start_task_cancellation(jl_value_t *creq) +{ + jl_value_t *cancel_handler = jl_get_global(jl_base_module, jl_symbol("handle_cancellation!")); + if (!cancel_handler) { + jl_safe_printf("Task cancellation requested but Base.handle_cancellation! is not defined\n"); + jl_exit(1); + } + jl_value_t *fargs[2] = { cancel_handler, creq }; + jl_apply(fargs, 2); +} + static void NOINLINE JL_NORETURN JL_NO_ASAN start_task(void) { CFI_NORETURN @@ -1270,6 +1281,12 @@ CFI_NORETURN jl_sigint_safepoint(ptls); } JL_TIMING(ROOT, ROOT); + for (;;) { + jl_value_t *creq = jl_atomic_load_relaxed(&ct->cancellation_request); + if (creq == jl_nothing) + break; + _handle_start_task_cancellation(creq); + } res = jl_apply(&ct->start, 1); } JL_CATCH { @@ -1663,6 +1680,13 @@ JL_DLLEXPORT int8_t jl_get_task_threadpoolid(jl_task_t *t) return t->threadpoolid; } +JL_DLLEXPORT void jl_preempt_thread_task(int16_t tid) +{ + jl_task_t *task = jl_atomic_load_relaxed(&jl_all_tls_states[tid]->current_task); + jl_value_t *expected = jl_nothing; + // If the task is already being cancelled, that's good enough for preemption + jl_atomic_cmpswap(&task->cancellation_request, &expected, jl_box_uint8(0x5)); +} #ifdef _OS_WINDOWS_ #if defined(_CPU_X86_) diff --git a/test/cancellation.jl b/test/cancellation.jl new file mode 100644 index 0000000000000..baa9f6e7afa46 --- /dev/null +++ b/test/cancellation.jl @@ -0,0 +1,78 @@ +const collatz_code = quote + collatz(n) = (n & 1) == 1 ? 
(3n + 1) : (nรท2) + function find_collatz_counterexample() + i = 1 + while true + j = i + while true + @Base.cancel_check + j = collatz(j) + j == 1 && break + j == i && error("$j is a collatz counterexample") + end + i += 1 + end + end + function find_collatz_counterexample_inner() + i = 1 + while true + j = i + while true + j = collatz(j) + j == 1 && break + j == i && return j + end + i += 1 + end + end + function find_collatz_counterexample2() + @Base.cancel_check + return find_collatz_counterexample_inner() + end +end +eval(collatz_code) + +@testset "cancellation" begin + # Test cancellation point for a task that has never started running + @test_throws Base.CancellationRequest wait(@task nothing) + + + # Simple cancellation of `sleep` + t = @async sleep(10000) + yield(); yield(); yield() # Give the task a chance to start + Base.cancel!(t) + @test_throws Base.CancellationRequest wait(t) + + # Simple cancellation of compute-bound function + t = @Threads.spawn(find_collatz_counterexample()) + yield(); yield(); yield() # Give the task a chance to start + Base.cancel!(t) + @test_throws Base.CancellationRequest wait(t) + + # Test cancellation of @sync + t = @async @sync begin + @async sleep(10000) + @async sleep(10000) + end + yield(); yield(); yield() # Give the task a chance to start + Base.cancel!(t) + @test_throws Base.CompositeException wait(t) +end + +@testset "^C" begin + # This exercises ^C needing to preempt a compute-bound task + test_code = :(try; + @sync ((@async sleep(10000)); @async find_collatz_counterexample()) + catch e + println(typeof(e)) + end) + @test read(`$(Base.julia_cmd()) -e '$(string(collatz_code)); $test_code'`, String) == "CompositeException" + + # Make sure that preemption doesn't cause problems if all tasks are blocked + test_code = :(try; + @sync ((@async sleep(10000)); @async sleep(10000)) + catch e + println(typeof(e)) + end) + @test read(`$(Base.julia_cmd()) -e '$(string(collatz_code)); $test_code'`, String) == "CompositeException" +end From e8ea4a7a5e94d59de9be7a4d1c073e415a3cef7e Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Thu, 4 Dec 2025 01:03:24 +0000 Subject: [PATCH 5/9] Claude's compiler support WIP --- AGENTS.md | 14 + Compiler/src/effects.jl | 3 + Compiler/src/optimize.jl | 14 +- base/task.jl | 29 +- src/Makefile | 1 + src/codegen.cpp | 29 ++ src/julia.h | 2 + src/julia_internal.h | 1 + src/llvm-cancellation-lowering.cpp | 311 ++++++++++++++++++ src/llvm-julia-passes.inc | 1 + src/llvm-late-gc-lowering.cpp | 2 +- src/passes.h | 5 + src/pipeline.cpp | 1 + src/signals-unix.c | 30 ++ src/signals-win.c | 9 + test/cancellation.jl | 2 +- .../cancellation-lowering-codegen.jl | 26 ++ test/llvmpasses/cancellation-lowering.ll | 82 +++++ 18 files changed, 546 insertions(+), 16 deletions(-) create mode 100644 src/llvm-cancellation-lowering.cpp create mode 100644 test/llvmpasses/cancellation-lowering-codegen.jl create mode 100644 test/llvmpasses/cancellation-lowering.ll diff --git a/AGENTS.md b/AGENTS.md index d81defa16f136..f4f934eec9b4b 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -27,6 +27,20 @@ If you made changes to the runtime (any files in `src/`), you will need to rebui julia. Run `make -j` to rebuild julia. This process may take up to 10 minutes depending on your changes. +### Testing LLVM-related changes + +When making changes to LLVM passes or codegen, add `LLVM_ASSERTIONS=1` to `Make.user` to enable +LLVM assertions. 
This helps catch IR verification errors early: + +```bash +echo "LLVM_ASSERTIONS=1" >> Make.user +``` + +To run LLVM pass tests: +```bash +make -C test/llvmpasses .ll +``` + After making changes, run static analysis checks: - First run `make -C src install-analysis-deps` to initialize dependencies (only needed once the first time). - Run `make -C src analyze- --output-sync -j8` (replace `` with the basename of any C or C++ file you modified, excluding headers). diff --git a/Compiler/src/effects.jl b/Compiler/src/effects.jl index 4879a9981524a..915f956d824a2 100644 --- a/Compiler/src/effects.jl +++ b/Compiler/src/effects.jl @@ -320,6 +320,7 @@ merge_effectbits(old::Bool, new::Bool) = old & new is_consistent(effects::Effects) = effects.consistent === ALWAYS_TRUE is_effect_free(effects::Effects) = effects.effect_free === ALWAYS_TRUE +is_reset_safe(effects::Effects) = effects.reset_safe === ALWAYS_TRUE is_nothrow(effects::Effects) = effects.nothrow is_terminates(effects::Effects) = effects.terminates is_notaskstate(effects::Effects) = effects.notaskstate @@ -356,6 +357,8 @@ is_consistent_if_inaccessiblememonly(effects::Effects) = !iszero(effects.consist is_effect_free_if_inaccessiblememonly(effects::Effects) = !iszero(effects.effect_free & EFFECT_FREE_IF_INACCESSIBLEMEMONLY) +is_reset_safe_if_inaccessiblememonly(effects::Effects) = !iszero(effects.reset_safe & RESET_SAFE_IF_INACCESSIBLEMEMONLY) + is_inaccessiblemem_or_argmemonly(effects::Effects) = effects.inaccessiblememonly === INACCESSIBLEMEM_OR_ARGMEMONLY is_consistent_overlay(effects::Effects) = effects.nonoverlayed === CONSISTENT_OVERLAY diff --git a/Compiler/src/optimize.jl b/Compiler/src/optimize.jl index b5704c488d273..7d3d9a1dacdef 100644 --- a/Compiler/src/optimize.jl +++ b/Compiler/src/optimize.jl @@ -39,6 +39,9 @@ const IR_FLAG_NOUB = one(UInt32) << 10 #const IR_FLAG_CONSISTENTOVERLAY = one(UInt32) << 12 # This statement is :nortcall const IR_FLAG_NORTCALL = one(UInt32) << 13 +# This statement is proven :reset_safe +const IR_FLAG_RESET_SAFE = one(UInt32) << 14 +# Reserved: one(UInt32) << 15 used for RSIIMO below # An optimization pass has updated this statement in a way that may # have exposed information that inference did not see. Re-running # inference on this statement may be profitable. 
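+# Illustrative only: ssa flags are packed into a UInt32 bitfield, so a statement proven
+# both :effect_free and :reset_safe simply carries both bits, and compound masks require
+# all of their bits to be present:
+#
+#     flags = IR_FLAG_EFFECT_FREE | IR_FLAG_RESET_SAFE
+#     has_flag(flags, IR_FLAG_RESET_SAFE)   # true
+#     has_flag(flags, IR_FLAGS_REMOVABLE)   # false: :nothrow/:terminates bits are unset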
@@ -50,16 +53,18 @@ const IR_FLAG_UNUSED = one(UInt32) << 17 const IR_FLAG_EFIIMO = one(UInt32) << 18 # This statement is :inaccessiblememonly == INACCESSIBLEMEM_OR_ARGMEMONLY const IR_FLAG_INACCESSIBLEMEM_OR_ARGMEM = one(UInt32) << 19 +# This statement is :reset_safe == RESET_SAFE_IF_INACCESSIBLEMEMONLY +const IR_FLAG_RSIIMO = one(UInt32) << 20 const NUM_IR_FLAGS = 3 # sync with julia.h const IR_FLAGS_EFFECTS = IR_FLAG_CONSISTENT | IR_FLAG_EFFECT_FREE | IR_FLAG_NOTHROW | - IR_FLAG_TERMINATES | IR_FLAG_NOUB | IR_FLAG_NORTCALL + IR_FLAG_TERMINATES | IR_FLAG_NOUB | IR_FLAG_NORTCALL | IR_FLAG_RESET_SAFE const IR_FLAGS_REMOVABLE = IR_FLAG_EFFECT_FREE | IR_FLAG_NOTHROW | IR_FLAG_TERMINATES -const IR_FLAGS_NEEDS_EA = IR_FLAG_EFIIMO | IR_FLAG_INACCESSIBLEMEM_OR_ARGMEM +const IR_FLAGS_NEEDS_EA = IR_FLAG_EFIIMO | IR_FLAG_INACCESSIBLEMEM_OR_ARGMEM | IR_FLAG_RSIIMO has_flag(curr::UInt32, flag::UInt32) = (curr & flag) == flag @@ -79,6 +84,11 @@ function flags_for_effects(effects::Effects) elseif is_effect_free_if_inaccessiblememonly(effects) flags |= IR_FLAG_EFIIMO end + if is_reset_safe(effects) + flags |= IR_FLAG_RESET_SAFE + elseif is_reset_safe_if_inaccessiblememonly(effects) + flags |= IR_FLAG_RSIIMO + end if is_nothrow(effects) flags |= IR_FLAG_NOTHROW end diff --git a/base/task.jl b/base/task.jl index ae1f9f4b3ff74..70b2e258de06f 100644 --- a/base/task.jl +++ b/base/task.jl @@ -1323,7 +1323,7 @@ struct CancellationRequest end """ - CANCEL_REQUEST_SAFE + CANCEL_REQUEST_SAFE Request safe cancelation of the current task. If the task is waiting for any other resources, it will request safe cancellation of any such resources and @@ -1337,7 +1337,7 @@ should be tried first. const CANCEL_REQUEST_SAFE = CancellationRequest(0x0) """ - CANCEL_REQUEST_ACK + CANCEL_REQUEST_ACK Set by the task itself to indicate that a (safe) cancellation request was received and acknowledged, but that there are dependent tasks for whom @@ -1346,20 +1346,20 @@ cancelation is still pending. const CANCEL_REQUEST_ACK = CancellationRequest(0x1) """ - CANCEL_REQUEST_QUERY + CANCEL_REQUEST_QUERY Request that the system create an asynchronous report of why the task is currently not able to be canceled. The report will be provided in the ->cancelation_request field of the current task (as long as this field is still CANCEL_REQUEST_QUERY). N.B.: Transition to CANCEL_REQUEST_QUERY is only allowed from CANCEL_REQUEST_ACK. - Once the waiting task has read the cancelation report, it may set the cancelation - request back to CANCEL_REQUEST_ACK. + Once the waiting task has read the cancelation report, it may set the cancelation + request back to CANCEL_REQUEST_ACK. """ const CANCEL_REQUEST_QUERY = CancellationRequest(0x2) """ - CANCEL_REQUEST_ABANDON_EXTERNAL + CANCEL_REQUEST_ABANDON_EXTERNAL Request a cancelation that will cease waiting for any external resources (e.g. I/O objects) without going through a safe cancelation procedure for such resources. However, the @@ -1371,21 +1371,21 @@ I/O is often engineered for robustness in case of sudden disapperance of peers const CANCEL_REQUEST_ABANDON_EXTERNAL = CancellationRequest(0x3) """ - CANCEL_REQUEST_ABANDON_ALL + CANCEL_REQUEST_ABANDON_ALL Request a cancelation that will cease waiting for all external resources and all unacknowledged internal tasks. Such tasks will be frozen and become unschedulable in the future. !!! 
warning - If any canceled task has acquired locks or other resources that are contested, this method of - cancelation may leak such resources and create deadlocks in future code. It is intended as a - last-resort method to recover a system, but the necessity of this operation should in general - be considered a bug (e.g. due to insufficient cancellation points in computationally-heavy code). + If any canceled task has acquired locks or other resources that are contested, this method of + cancelation may leak such resources and create deadlocks in future code. It is intended as a + last-resort method to recover a system, but the necessity of this operation should in general + be considered a bug (e.g. due to insufficient cancellation points in computationally-heavy code). """ const CANCEL_REQUEST_ABANDON_ALL = CancellationRequest(0x4) """ - CANCEL_REQUEST_YIELD + CANCEL_REQUEST_YIELD Request that the task yield to the scheduler at the next cancellation point to allow another task to run its cancellation propagation logic. The cancelled task @@ -1465,6 +1465,11 @@ function cancel!(t::Task, crequest=CANCEL_REQUEST_SAFE) end return end + # Try to interrupt the task if it's at a cancellation point (has reset_ctx set) + tid = Threads.threadid(t) + if tid != 0 + ccall(:jl_send_cancellation_signal, Cvoid, (Int16,), (tid - 1) % Int16) + end while !istaskdone(t) waitee = t.queue waitee === nothing && (yield(); continue) diff --git a/src/Makefile b/src/Makefile index 832f0b0ae71e4..11d17718785d9 100644 --- a/src/Makefile +++ b/src/Makefile @@ -84,6 +84,7 @@ CODEGEN_SRCS := codegen jitlayers aotcompile debuginfo disasm llvm-simdloop \ llvm-pass-helpers llvm-ptls llvm-propagate-addrspaces \ llvm-multiversioning llvm-alloc-opt llvm-alloc-helpers cgmemmgr llvm-remove-addrspaces \ llvm-remove-ni llvm-julia-licm llvm-demote-float16 llvm-cpufeatures llvm-expand-atomic-modify \ + llvm-cancellation-lowering \ pipeline llvm_api \ $(GC_CODEGEN_SRCS) FLAGS_COMMON += -I$(shell $(LLVM_CONFIG_HOST) --includedir) diff --git a/src/codegen.cpp b/src/codegen.cpp index eb3a7912ee081..4a019c8412c8e 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -1900,6 +1900,7 @@ class jl_codectx_t { int nargs = 0; int nvargs = -1; bool is_opaque_closure = false; + ssize_t current_stmt_idx = -1; // current statement index for ssaflags lookup Value *pgcstack = NULL; Instruction *topalloca = NULL; @@ -2085,6 +2086,28 @@ static Value *emit_ptrgep(jl_codectx_t &ctx, Value *base, size_t byte_offset, co return gep; } +// Check if the current statement has the reset_safe flag set +static bool current_stmt_is_reset_safe(jl_codectx_t &ctx) +{ + if (ctx.current_stmt_idx < 0 || ctx.source == nullptr || ctx.source->ssaflags == nullptr) + return false; + size_t nstmts = jl_array_dim0(ctx.source->ssaflags); + if ((size_t)ctx.current_stmt_idx >= nstmts) + return false; + uint32_t flag = jl_array_data(ctx.source->ssaflags, uint32_t)[ctx.current_stmt_idx]; + return (flag & IR_FLAG_RESET_SAFE) != 0; +} + +// Mark a call instruction with reset_safe metadata if the current statement has the flag +static void mark_reset_safe(jl_codectx_t &ctx, CallInst *call) +{ + if (call && current_stmt_is_reset_safe(ctx)) { + LLVMContext &llvmctx = ctx.builder.getContext(); + MDNode *md = MDNode::get(llvmctx, None); + call->setMetadata("julia.reset_safe", md); + } +} + static Value *emit_ptrgep(jl_codectx_t &ctx, Value *base, Value *byte_offset, const Twine &Name="") { auto *gep = ctx.builder.CreateInBoundsGEP(getInt8Ty(ctx.builder.getContext()), base, byte_offset, 
Name); @@ -5027,6 +5050,8 @@ static CallInst *emit_jlcall(jl_codectx_t &ctx, Value *theFptr, Value *theF, } CallInst *result = ctx.builder.CreateCall(TheTrampoline, theArgs); result->setAttributes(TheTrampoline->getAttributes()); + // Mark as reset_safe if the current statement has that flag + mark_reset_safe(ctx, result); // TODO: we could add readonly attributes in many cases to the args return result; } @@ -5141,6 +5166,8 @@ static jl_cgval_t emit_call_specfun_other(jl_codectx_t &ctx, bool is_opaque_clos call->setAttributes(returninfo.attrs); if (gcstack_arg && ctx.emission_context.use_swiftcc) call->setCallingConv(CallingConv::Swift); + // Mark as reset_safe if the current statement has that flag + mark_reset_safe(ctx, call); jl_cgval_t retval; switch (returninfo.cc) { @@ -9467,7 +9494,9 @@ static jl_llvm_functions_t } } else { + ctx.current_stmt_idx = cursor; emit_stmtpos(ctx, stmt, cursor); + ctx.current_stmt_idx = -1; mallocVisitStmt(nullptr, have_dbg_update); } find_next_stmt(cursor + 1); diff --git a/src/julia.h b/src/julia.h index 9513dfb47941d..18d549d70d20d 100644 --- a/src/julia.h +++ b/src/julia.h @@ -2324,6 +2324,8 @@ JL_DLLEXPORT jl_task_t *jl_new_task(jl_value_t*, jl_value_t*, size_t); JL_DLLEXPORT void jl_switchto(jl_task_t **pt); JL_DLLEXPORT int jl_set_task_tid(jl_task_t *task, int16_t tid) JL_NOTSAFEPOINT; JL_DLLEXPORT int jl_set_task_threadpoolid(jl_task_t *task, int8_t tpid) JL_NOTSAFEPOINT; +JL_DLLEXPORT void jl_preempt_thread_task(int16_t tid); +JL_DLLEXPORT void jl_send_cancellation_signal(int16_t tid); JL_DLLEXPORT void JL_NORETURN jl_throw(jl_value_t *e JL_MAYBE_UNROOTED); JL_DLLEXPORT void JL_NORETURN jl_rethrow(void); JL_DLLEXPORT void JL_NORETURN jl_rethrow_other(jl_value_t *e JL_MAYBE_UNROOTED); diff --git a/src/julia_internal.h b/src/julia_internal.h index 9a85d1086d694..c2e6aa7634983 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -2087,6 +2087,7 @@ JL_DLLEXPORT uint32_t jl_crc32c(uint32_t crc, const char *buf, size_t len); // -- exports from codegen -- // #define IR_FLAG_INBOUNDS 0x01 +#define IR_FLAG_RESET_SAFE (1 << 14) JL_DLLIMPORT void jl_generate_fptr_for_unspecialized(jl_code_instance_t *unspec); JL_DLLIMPORT int jl_compile_codeinst(jl_code_instance_t *unspec); diff --git a/src/llvm-cancellation-lowering.cpp b/src/llvm-cancellation-lowering.cpp new file mode 100644 index 0000000000000..9d589f59045eb --- /dev/null +++ b/src/llvm-cancellation-lowering.cpp @@ -0,0 +1,311 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +// This pass lowers the julia.cancellation_point intrinsic to: +// 1. A stack buffer allocation for the jl_ucontext_t +// 2. A setjmp call on the uc_mcontext field +// 3. Assignment of the buffer address to task->reset_ctx (atomic release) +// +// It also walks the function to find stores/calls without julia.reset_safe +// metadata and inserts reset_ctx = NULL before them. 
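+//
+// Rough sketch of the intended transformation (not the literal IR this pass emits):
+//
+//   before:  %r = call i32 @julia.cancellation_point()
+//
+//   after:   %cancel_ucontext = alloca [sizeof(_jl_ucontext_t) x i8]
+//            store atomic release ptr %cancel_ucontext, ptr %reset_ctx_ptr
+//            %r = call i32 @setjmp(ptr %cancel_ucontext)   ; returns_twice
+//
+// and, before every store/call that lacks !julia.reset_safe metadata (and before each
+// return, since the buffer is stack-allocated):
+//
+//            store atomic release ptr null, ptr %reset_ctx_ptr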
+ +#include "llvm-version.h" +#include "passes.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "llvm-codegen-shared.h" +#include "llvm-pass-helpers.h" +#include "julia.h" +#include "julia_internal.h" +#include "julia_threads.h" + +#define DEBUG_TYPE "cancellation_lowering" + +STATISTIC(CancellationPointsLowered, "Number of cancellation points lowered"); +STATISTIC(ResetCtxClearsInserted, "Number of reset_ctx clears inserted"); + +using namespace llvm; + +// Check if an instruction has the julia.reset_safe metadata +static bool hasResetSafeMetadata(Instruction *I) { + return I->getMetadata("julia.reset_safe") != nullptr; +} + +struct CancellationLowering { + Function *cancel_point_func; + Value *pgcstack; + Value *reset_ctx_ptr; // Computed once in entry block, dominates all uses + + CancellationLowering(Module &M) : cancel_point_func(nullptr), pgcstack(nullptr), reset_ctx_ptr(nullptr) { + cancel_point_func = M.getFunction("julia.cancellation_point"); + } + + bool runOnFunction(Function &F); + +private: + // Compute reset_ctx_ptr once in entry block + // If insertAfter is provided, insert after that instruction + // Otherwise insert at the beginning of the entry block (after allocas) + void computeResetCtxPtr(Function &F, Instruction *insertAfter); +}; + +void CancellationLowering::computeResetCtxPtr(Function &F, Instruction *insertAfter) { + if (!pgcstack) + return; + + LLVMContext &LLVMCtx = F.getContext(); + Type *I8Ty = Type::getInt8Ty(LLVMCtx); + Type *I64Ty = Type::getInt64Ty(LLVMCtx); + + IRBuilder<> Builder(LLVMCtx); + if (insertAfter) { + // Insert right after pgcstack call + Builder.SetInsertPoint(insertAfter->getNextNode()); + } else { + // pgcstack is an argument, insert at the start of entry block + // but after any allocas + BasicBlock &entry = F.getEntryBlock(); + BasicBlock::iterator insertPt = entry.begin(); + while (insertPt != entry.end() && isa(&*insertPt)) { + ++insertPt; + } + Builder.SetInsertPoint(&entry, insertPt); + } + + // Get the offset of gcstack in jl_task_t + size_t gcstack_offset = offsetof(jl_task_t, gcstack); + + Value *task_ptr = Builder.CreateGEP(I8Ty, pgcstack, + ConstantInt::get(I64Ty, -(int64_t)gcstack_offset), + "current_task"); + + // Get pointer to reset_ctx field in current task + size_t reset_ctx_offset = offsetof(jl_task_t, reset_ctx); + + reset_ctx_ptr = Builder.CreateGEP(I8Ty, task_ptr, + ConstantInt::get(I64Ty, reset_ctx_offset), + "reset_ctx_ptr"); +} + +bool CancellationLowering::runOnFunction(Function &F) { + // Skip if there's no cancellation_point function in the module + if (!cancel_point_func) + return false; + + bool Changed = false; + + // Find pgcstack - either as a call to julia.get_pgcstack or as an argument with "gcstack" attribute + pgcstack = nullptr; + reset_ctx_ptr = nullptr; + Instruction *pgcstack_inst = nullptr; // Only set if pgcstack is from a call, not an argument + Function *pgcstack_getter = F.getParent()->getFunction("julia.get_pgcstack"); + Function *adoptthread_func = F.getParent()->getFunction("julia.get_pgcstack_or_new"); + if (pgcstack_getter || adoptthread_func) { + for (auto &I : F.getEntryBlock()) { + if (CallInst *callInst = dyn_cast(&I)) { + Value *callee = callInst->getCalledOperand(); + if ((pgcstack_getter && callee == pgcstack_getter) || + (adoptthread_func && callee == adoptthread_func)) { + pgcstack = callInst; + pgcstack_inst = callInst; + break; + } + } + } + } + // If not found via call, check for argument with "gcstack" attribute + if (!pgcstack) { 
+ for (auto &arg : F.args()) { + AttributeSet attrs = F.getAttributes().getParamAttrs(arg.getArgNo()); + if (attrs.hasAttribute("gcstack")) { + pgcstack = &arg; + break; + } + } + } + + // First, find all cancellation_point intrinsics + SmallVector CancellationPoints; + + for (auto &BB : F) { + for (auto &I : BB) { + if (auto *CI = dyn_cast(&I)) { + Value *callee = CI->getCalledOperand(); + if (callee && callee == cancel_point_func) { + CancellationPoints.push_back(CI); + } + } + } + } + + if (CancellationPoints.empty()) { + return false; + } + + // Compute reset_ctx_ptr once in entry block (dominates all uses) + // pgcstack_inst is set when pgcstack comes from a call; null when it's an argument + computeResetCtxPtr(F, pgcstack_inst); + + // Lower each cancellation point + for (CallInst *CI : CancellationPoints) { + ++CancellationPointsLowered; + Changed = true; + + IRBuilder<> Builder(CI); + LLVMContext &LLVMCtx = F.getContext(); + Type *I8Ty = Type::getInt8Ty(LLVMCtx); + Type *I32Ty = Type::getInt32Ty(LLVMCtx); + Type *PtrTy = PointerType::getUnqual(LLVMCtx); + + if (!reset_ctx_ptr) { + // Can't lower without access to the task, just remove the intrinsic + CI->replaceAllUsesWith(ConstantInt::get(I32Ty, 0)); + CI->eraseFromParent(); + continue; + } + + // Allocate a _jl_ucontext_t on the stack + const size_t UContextSize = sizeof(_jl_ucontext_t); + const size_t UContextAlign = alignof(_jl_ucontext_t); + + // Create the alloca at the start of the function + IRBuilder<> AllocaBuilder(&F.getEntryBlock().front()); + Type *UContextTy = ArrayType::get(I8Ty, UContextSize); + AllocaInst *UContextBuf = AllocaBuilder.CreateAlloca(UContextTy, nullptr, "cancel_ucontext"); + UContextBuf->setAlignment(Align(UContextAlign)); + + // Store the ucontext address to reset_ctx with atomic release ordering + StoreInst *store = Builder.CreateAlignedStore(UContextBuf, reset_ctx_ptr, Align(sizeof(void*))); + store->setOrdering(AtomicOrdering::Release); + + // Call setjmp on the uc_mcontext field (which is at offset 0 of the struct) + // Use the platform-specific setjmp function name defined in julia.h + FunctionType *SetjmpTy = FunctionType::get(I32Ty, {PtrTy}, false); + FunctionCallee SetjmpFn = F.getParent()->getOrInsertFunction(jl_setjmp_name, SetjmpTy); + + CallInst *SetjmpCall = Builder.CreateCall(SetjmpFn, {UContextBuf}); + SetjmpCall->addFnAttr(Attribute::ReturnsTwice); + + // Replace uses and remove the intrinsic + CI->replaceAllUsesWith(SetjmpCall); + CI->eraseFromParent(); + } + + // Now walk the function to find stores/calls without reset_safe metadata + // and insert reset_ctx = NULL before them + SmallVector UnsafePoints; + + // We need to skip instructions that are part of our setup (pgcstack, task, reset_ctx_ptr) + // since they occur before reset_ctx_ptr is available + Instruction *reset_ctx_ptr_inst = dyn_cast_or_null(reset_ctx_ptr); + + for (auto &BB : F) { + bool past_setup = (&BB != &F.getEntryBlock()); + for (auto &I : BB) { + // In the entry block, skip instructions until after reset_ctx_ptr is defined + if (!past_setup) { + if (&I == reset_ctx_ptr_inst) { + past_setup = true; + } + continue; + } + + // Check for stores (but skip the reset_ctx stores we just created) + if (auto *SI = dyn_cast(&I)) { + if (!hasResetSafeMetadata(SI)) { + // Skip atomic stores (including reset_ctx stores we just created) + if (SI->isAtomic()) + continue; + UnsafePoints.push_back(SI); + } + } + // Check for calls (but not debug intrinsics, lifetime markers, etc.) 
+            else if (auto *CI = dyn_cast<CallInst>(&I)) {
+                // Skip debug intrinsics and other harmless intrinsics
+                if (isa<DbgInfoIntrinsic>(CI))
+                    continue;
+                if (CI->isLifetimeStartOrEnd())
+                    continue;
+
+                // Check for reset_safe metadata
+                if (!hasResetSafeMetadata(CI)) {
+                    // Also skip intrinsic calls that are known safe
+                    Function *Callee = CI->getCalledFunction();
+                    if (Callee && Callee->isIntrinsic()) {
+                        Intrinsic::ID ID = Callee->getIntrinsicID();
+                        if (ID == Intrinsic::lifetime_start ||
+                            ID == Intrinsic::lifetime_end ||
+                            ID == Intrinsic::dbg_declare ||
+                            ID == Intrinsic::dbg_value ||
+                            ID == Intrinsic::dbg_label ||
+                            ID == Intrinsic::assume ||
+                            ID == Intrinsic::expect ||
+                            ID == Intrinsic::prefetch) {
+                            continue;
+                        }
+                    }
+                    // Skip the setjmp calls we just created
+                    if (Callee && Callee->getName() == jl_setjmp_name)
+                        continue;
+                    UnsafePoints.push_back(CI);
+                }
+            }
+        }
+    }
+
+    // Insert reset_ctx = NULL before each unsafe point
+    for (Instruction *I : UnsafePoints) {
+        if (!reset_ctx_ptr)
+            continue;
+
+        ++ResetCtxClearsInserted;
+        Changed = true;
+
+        IRBuilder<> Builder(I);
+        LLVMContext &LLVMCtx = F.getContext();
+        Type *PtrTy = PointerType::getUnqual(LLVMCtx);
+
+        // Store NULL to reset_ctx with atomic release ordering
+        Value *null_ptr = ConstantPointerNull::get(cast<PointerType>(PtrTy));
+        StoreInst *store = Builder.CreateAlignedStore(null_ptr, reset_ctx_ptr, Align(sizeof(void*)));
+        store->setOrdering(AtomicOrdering::Release);
+    }
+
+    // Insert reset_ctx = NULL before all return instructions
+    // This is necessary because the cancel_ucontext buffer is stack-allocated,
+    // and becomes invalid when the function returns
+    if (reset_ctx_ptr) {
+        LLVMContext &LLVMCtx = F.getContext();
+        Type *PtrTy = PointerType::getUnqual(LLVMCtx);
+        Value *null_ptr = ConstantPointerNull::get(cast<PointerType>(PtrTy));
+
+        for (auto &BB : F) {
+            if (auto *RI = dyn_cast<ReturnInst>(BB.getTerminator())) {
+                IRBuilder<> Builder(RI);
+                StoreInst *store = Builder.CreateAlignedStore(null_ptr, reset_ctx_ptr, Align(sizeof(void*)));
+                store->setOrdering(AtomicOrdering::Release);
+                ++ResetCtxClearsInserted;
+            }
+        }
+    }
+
+    return Changed;
+}
+
+PreservedAnalyses CancellationLoweringPass::run(Function &F, FunctionAnalysisManager &AM) {
+    CancellationLowering CL(*F.getParent());
+    if (CL.runOnFunction(F)) {
+#ifdef JL_VERIFY_PASSES
+        assert(!verifyLLVMIR(F));
+#endif
+        return PreservedAnalyses::allInSet<CFGAnalyses>();
+    }
+    return PreservedAnalyses::all();
+}
diff --git a/src/llvm-julia-passes.inc b/src/llvm-julia-passes.inc
index bd223499f37af..6f860ead8af84 100644
--- a/src/llvm-julia-passes.inc
+++ b/src/llvm-julia-passes.inc
@@ -16,6 +16,7 @@ FUNCTION_PASS("AllocOpt", AllocOptPass())
 FUNCTION_PASS("PropagateJuliaAddrspaces", PropagateJuliaAddrspacesPass())
 FUNCTION_PASS("GCInvariantVerifier", GCInvariantVerifierPass())
 FUNCTION_PASS("FinalLowerGC", FinalLowerGCPass())
+FUNCTION_PASS("CancellationLowering", CancellationLoweringPass())
 FUNCTION_PASS("ExpandAtomicModify", ExpandAtomicModifyPass())
 #endif
diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp
index 90334e60251da..ae1351ae41ca1 100644
--- a/src/llvm-late-gc-lowering.cpp
+++ b/src/llvm-late-gc-lowering.cpp
@@ -2068,7 +2068,7 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S, bool *CFGModified) {
         }
         Value *callee = CI->getCalledOperand();
         if (callee && (callee == gc_flush_func || callee == gc_preserve_begin_func
-                       || callee == gc_preserve_end_func || callee == cancel_point_func)) {
+                       || callee == gc_preserve_end_func)) {
            /* No replacement */
        } else if (pointer_from_objref_func != nullptr && callee
== pointer_from_objref_func) {
            auto *obj = CI->getOperand(0);
diff --git a/src/passes.h b/src/passes.h
index 0c5a124ade952..e40538f8f88da 100644
--- a/src/passes.h
+++ b/src/passes.h
@@ -43,6 +43,11 @@ struct FinalLowerGCPass : PassInfoMixin<FinalLowerGCPass> {
     static bool isRequired() { return true; }
 };
 
+struct CancellationLoweringPass : PassInfoMixin<CancellationLoweringPass> {
+    PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) JL_NOTSAFEPOINT;
+    static bool isRequired() { return true; }
+};
+
 struct ExpandAtomicModifyPass : PassInfoMixin<ExpandAtomicModifyPass> {
     PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) JL_NOTSAFEPOINT;
 };
diff --git a/src/pipeline.cpp b/src/pipeline.cpp
index 0481e04b8d19e..afd2ac90f9f6b 100644
--- a/src/pipeline.cpp
+++ b/src/pipeline.cpp
@@ -568,6 +568,7 @@ static void buildIntrinsicLoweringPipeline(ModulePassManager &MPM, PassBuilder *
     JULIA_PASS(MPM.addPass(RemoveNIPass()));
     {
         FunctionPassManager FPM;
+        JULIA_PASS(FPM.addPass(CancellationLoweringPass())); // Lower cancellation points to setjmp (before GC lowering)
         JULIA_PASS(FPM.addPass(LateLowerGCPass()));
         JULIA_PASS(FPM.addPass(FinalLowerGCPass()));
         JULIA_PASS(FPM.addPass(ExpandAtomicModifyPass())); // after LateLowerGCPass so that all IPO is valid
diff --git a/src/signals-unix.c b/src/signals-unix.c
index ec34d97cd89bc..6f061af906352 100644
--- a/src/signals-unix.c
+++ b/src/signals-unix.c
@@ -545,6 +545,26 @@ static void jl_try_deliver_sigint(void)
     pthread_mutex_unlock(&in_signal_lock);
 }
 
+// Send a signal to the specified thread to longjmp to its reset_ctx if available.
+// This is used for task cancellation to interrupt a running task at a safe point.
+JL_DLLEXPORT void jl_send_cancellation_signal(int16_t tid)
+{
+    jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid];
+    if (ptls2 == NULL)
+        return;
+    jl_task_t *ct = jl_atomic_load_relaxed(&ptls2->current_task);
+    if (ct == NULL)
+        return;
+    // Only send if the task has a reset_ctx set (i.e., is at a cancellation point)
+    if (ct->reset_ctx == NULL)
+        return;
+    pthread_mutex_lock(&in_signal_lock);
+    signals_inflight++;
+    jl_atomic_store_release(&ptls2->signal_request, 5);
+    pthread_kill(ptls2->system_id, SIGUSR2);
+    pthread_mutex_unlock(&in_signal_lock);
+}
+
 // Write only by signal handling thread, read only by main thread
 // no sync necessary.
 static int thread0_exit_signo = 0;
@@ -583,6 +603,7 @@ static void jl_exit_thread0(int signo, jl_bt_element_t *bt_data, size_t bt_size)
 //  is reached
 // 3: raise `thread0_exit_signo` and try to exit
 // 4: no-op
+// 5: longjmp to reset_ctx if available (for task cancellation)
 void usr2_handler(int sig, siginfo_t *info, void *ctx)
 {
     jl_task_t *ct = jl_get_current_task();
@@ -640,6 +661,15 @@ void usr2_handler(int sig, siginfo_t *info, void *ctx)
     else if (request == 3) {
         jl_call_in_ctx(ct->ptls, jl_exit_thread0_cb, sig, ctx);
     }
+    else if (request == 5) {
+        // Longjmp to reset_ctx for task cancellation
+        volatile _jl_ucontext_t *reset_ctx = ct->reset_ctx;
+        if (reset_ctx != NULL) {
+            // Clear reset_ctx before longjmp to prevent double-longjmp
+            ct->reset_ctx = NULL;
+            jl_longjmp_in_ctx(sig, ctx, reset_ctx->uc_mcontext);
+        }
+    }
 
     errno = errno_save;
 }
diff --git a/src/signals-win.c b/src/signals-win.c
index e0968b35e8555..d68b19298f005 100644
--- a/src/signals-win.c
+++ b/src/signals-win.c
@@ -667,4 +667,13 @@ void jl_install_thread_signal_handler(jl_ptls_t ptls)
 JL_DLLEXPORT void jl_membarrier(void)
 {
     FlushProcessWriteBuffers();
+
+
+// Send a signal to the specified thread to longjmp to its reset_ctx if available.
+// This is used for task cancellation to interrupt a running task at a safe point. +// TODO: Implement Windows support using SuspendThread/GetThreadContext/SetThreadContext +JL_DLLEXPORT void jl_send_cancellation_signal(int16_t tid) +{ + // Not yet implemented on Windows + (void)tid; } diff --git a/test/cancellation.jl b/test/cancellation.jl index baa9f6e7afa46..6a673e3ebb4d3 100644 --- a/test/cancellation.jl +++ b/test/cancellation.jl @@ -13,7 +13,7 @@ const collatz_code = quote i += 1 end end - function find_collatz_counterexample_inner() + @noinline function find_collatz_counterexample_inner() i = 1 while true j = i diff --git a/test/llvmpasses/cancellation-lowering-codegen.jl b/test/llvmpasses/cancellation-lowering-codegen.jl new file mode 100644 index 0000000000000..b9395baa0408c --- /dev/null +++ b/test/llvmpasses/cancellation-lowering-codegen.jl @@ -0,0 +1,26 @@ +# This file is a part of Julia. License is MIT: https://julialang.org/license + +# RUN: julia --startup-file=no -O2 %s %t -O && llvm-link -S %t/* | FileCheck %s + +include(joinpath("..", "testhelpers", "llvmpasses.jl")) + +# Test that @cancel_check generates the expected cancellation lowering IR: +# - A jl_setjmp call with returns_twice attribute +# - A reset_ctx_ptr getelementptr +# - Atomic store of ucontext buffer to reset_ctx +# - Atomic store of null to reset_ctx before return (since ucontext is stack-allocated) + +# CHECK-LABEL: @julia_test_cancel_check +# CHECK: %cancel_ucontext = alloca +# CHECK: %reset_ctx_ptr = getelementptr +# CHECK: store atomic ptr {{.*}}, ptr %reset_ctx_ptr release +# CHECK: call i32 @{{.*}}setjmp{{.*}}(ptr {{.*}}) #[[ATTR:[0-9]+]] +# CHECK: store atomic ptr null, ptr %reset_ctx_ptr release +# CHECK: ret +# CHECK: attributes #[[ATTR]] = {{{.*}}returns_twice{{.*}}} +function test_cancel_check() + Base.@cancel_check + return 1 +end + +emit(test_cancel_check) diff --git a/test/llvmpasses/cancellation-lowering.ll b/test/llvmpasses/cancellation-lowering.ll new file mode 100644 index 0000000000000..22245b64288c3 --- /dev/null +++ b/test/llvmpasses/cancellation-lowering.ll @@ -0,0 +1,82 @@ +; This file is a part of Julia. 
License is MIT: https://julialang.org/license + +; RUN: opt --load-pass-plugin=libjulia-codegen%shlibext -passes='CancellationLowering' -S %s | FileCheck %s + +declare i32 @julia.cancellation_point() +declare ptr @julia.get_pgcstack() +declare void @some_unsafe_call() +declare void @some_safe_call() + +; Test basic cancellation point lowering with reset_ctx cleared before return +define i32 @test_cancellation_point() { +entry: +; CHECK-LABEL: @test_cancellation_point +; CHECK: %cancel_ucontext = alloca +; CHECK: %pgcstack = call ptr @julia.get_pgcstack() +; CHECK: %current_task = getelementptr i8, ptr %pgcstack +; CHECK: %reset_ctx_ptr = getelementptr i8, ptr %current_task +; CHECK: store atomic ptr %cancel_ucontext, ptr %reset_ctx_ptr release +; CHECK: %{{.*}} = call i32 @{{.*}}setjmp{{.*}}(ptr %cancel_ucontext) +; CHECK-NOT: call i32 @julia.cancellation_point() +; CHECK: store atomic ptr null, ptr %reset_ctx_ptr release +; CHECK-NEXT: ret i32 + %pgcstack = call ptr @julia.get_pgcstack() + %result = call i32 @julia.cancellation_point() + ret i32 %result +} + +; Test that unsafe calls get reset_ctx = NULL inserted before them +define void @test_unsafe_call() { +entry: +; CHECK-LABEL: @test_unsafe_call +; CHECK: %cancel_ucontext = alloca +; CHECK: %pgcstack = call ptr @julia.get_pgcstack() +; CHECK: %current_task = getelementptr i8, ptr %pgcstack +; CHECK: %reset_ctx_ptr = getelementptr i8, ptr %current_task +; CHECK: store atomic ptr %cancel_ucontext, ptr %reset_ctx_ptr release +; CHECK: call i32 @{{.*}}setjmp +; The unsafe call should have reset_ctx = NULL before it +; CHECK: store atomic ptr null, ptr %reset_ctx_ptr release +; CHECK-NEXT: call void @some_unsafe_call() +; Also reset_ctx = NULL before the return +; CHECK: store atomic ptr null, ptr %reset_ctx_ptr release +; CHECK-NEXT: ret void + %pgcstack = call ptr @julia.get_pgcstack() + %result = call i32 @julia.cancellation_point() + call void @some_unsafe_call() + ret void +} + +; Test that calls with reset_safe metadata don't get reset_ctx = NULL before them +; but still get reset_ctx = NULL before return +define void @test_safe_call() { +entry: +; CHECK-LABEL: @test_safe_call +; CHECK: %cancel_ucontext = alloca +; CHECK: %pgcstack = call ptr @julia.get_pgcstack() +; CHECK: %current_task = getelementptr i8, ptr %pgcstack +; CHECK: %reset_ctx_ptr = getelementptr i8, ptr %current_task +; CHECK: call i32 @{{.*}}setjmp +; The safe call should NOT have reset_ctx = NULL before it +; CHECK: call void @some_safe_call(), !julia.reset_safe +; But reset_ctx = NULL should be before the return +; CHECK: store atomic ptr null, ptr %reset_ctx_ptr release +; CHECK-NEXT: ret void + %pgcstack = call ptr @julia.get_pgcstack() + %result = call i32 @julia.cancellation_point() + call void @some_safe_call(), !julia.reset_safe !0 + ret void +} + +; Test function without cancellation points is unchanged +define void @test_no_cancellation_point() { +entry: +; CHECK-LABEL: @test_no_cancellation_point +; CHECK-NOT: setjmp +; CHECK-NOT: reset_ctx +; CHECK: call void @some_unsafe_call() + call void @some_unsafe_call() + ret void +} + +!0 = !{} From 759d56c0e992537e06af822f12e4b58bfa6cc56e Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Thu, 4 Dec 2025 01:17:05 +0000 Subject: [PATCH 6/9] Fix missing reset --- src/task.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/task.c b/src/task.c index dbf98d3450a6e..94057e550e3a1 100644 --- a/src/task.c +++ b/src/task.c @@ -1686,6 +1686,7 @@ JL_DLLEXPORT void jl_preempt_thread_task(int16_t tid) jl_value_t 
*expected = jl_nothing; // If the task is already being cancelled, that's good enough for preemption jl_atomic_cmpswap(&task->cancellation_request, &expected, jl_box_uint8(0x5)); + jl_send_cancellation_signal(tid); } #ifdef _OS_WINDOWS_ From 7b52667bf42b8e527e991c1bd6ac921c20e99cc6 Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Fri, 5 Dec 2025 03:52:27 +0000 Subject: [PATCH 7/9] Fix rebase mistake --- src/signals-win.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/signals-win.c b/src/signals-win.c index d68b19298f005..16b5734ab9c0d 100644 --- a/src/signals-win.c +++ b/src/signals-win.c @@ -667,7 +667,7 @@ void jl_install_thread_signal_handler(jl_ptls_t ptls) JL_DLLEXPORT void jl_membarrier(void) { FlushProcessWriteBuffers(); - +} // Send a signal to the specified thread to longjmp to its reset_ctx if available. // This is used for task cancellation to interrupt a running task at a safe point. From 044f2ca2fb2f4616a1605cd8d64a153d6e71c597 Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Fri, 5 Dec 2025 08:31:27 +0000 Subject: [PATCH 8/9] Implement asymmetric barriers --- base/asyncevent.jl | 5 ++-- base/condition.jl | 49 ++++++++++++++++++++++++------- base/locks-mt.jl | 2 ++ base/task.jl | 73 +++++++++++++++++++++++++++++++++++----------- 4 files changed, 99 insertions(+), 30 deletions(-) diff --git a/base/asyncevent.jl b/base/asyncevent.jl index 6ac8e79b63ff8..771d89f796b58 100644 --- a/base/asyncevent.jl +++ b/base/asyncevent.jl @@ -199,8 +199,9 @@ function _trywait(t::Union{Timer, AsyncCondition}) return set end -cancel_wait!(t::Union{Timer, AsyncCondition}, task::Task) = - cancel_wait!(t.cond, task, false; waitee=t) +cancel_wait!(t::Union{Timer, AsyncCondition}, @nospecialize(creq)) = false +cancel_wait!(t::Union{Timer, AsyncCondition}, task::Task, @nospecialize(creq)) = + cancel_wait!(t.cond, task, creq, false; waitee=t) function wait(t::Union{Timer, AsyncCondition}) ok = _trywait(t) diff --git a/base/condition.jl b/base/condition.jl index b4aead93faba3..e3aa7a2dc59ed 100644 --- a/base/condition.jl +++ b/base/condition.jl @@ -142,35 +142,62 @@ Wait for [`notify`](@ref) on `c` and return the `val` parameter passed to `notif If the keyword `first` is set to `true`, the waiter will be put _first_ in line to wake up on `notify`. Otherwise, `wait` has first-in-first-out (FIFO) behavior. """ -function wait(c::GenericCondition; first::Bool=false, waitee=c) +function wait(c::GenericCondition; first::Bool=false, waitee=c, expected_cancellation=nothing) ct = current_task() _wait2(c, ct, waitee, first) + +@label before_barrier + # Synchronize with atomic_fence_heavy in cancel! + Threads.atomic_fence_light() + # We need to check if we were cancelled and should not suspend if we were. + # The fencing above ensures that we either see the cancellation request + # or the cancelling task will call cancel_wait! to wake us again. + cr = cancellation_request() + if cr !== expected_cancellation + if cr === CANCEL_REQUEST_YIELD + # We are about to yield anyway, so we can acknowledge the cancellation now. 
+            # However, for the integrity of the cancellation_request synchronization,
+            # we must revisit the barrier above and re-check the cancellation request
+            @atomicreplace :acquire_release :monotonic ct.cancellation_request cr => nothing
+            @goto before_barrier
+        else
+            Base.list_deletefirst!(waitqueue(c), ct)
+            return invokelatest(cancel_wait!, waitee, cr)
+        end
+    end
     token = unlockall(c.lock)
-    try
-        return wait()
+
+    ret = try
+        wait()
     catch
-        q = ct.queue; q === c && Base.list_deletefirst!(waitqueue(c), ct)
-        rethrow()
-    finally
         relockall(c.lock, token)
+        # This cleans up our entry in the waitqueue if we were resumed from an
+        # unexpected `throwto`. Modern code should generally avoid this pattern.
+        q = ct.queue; q === waitee && Base.list_deletefirst!(waitqueue(c), ct)
+        rethrow()
     end
+
+    relockall(c.lock, token)
+    return ret
+end
+
+function cancel_wait!(c::GenericCondition, creq; waitee = c)
+    throw(creq)
 end
 
-function cancel_wait!(c::GenericCondition, t::Task; waitee = c)
-    @assert (@atomic :monotonic t.cancellation_request) !== nothing
+function cancel_wait!(c::GenericCondition, t::Task, @nospecialize(creq); waitee = c)
     lock(c)
     if t.queue !== waitee
         unlock(c)
         return false
     end
     Base.list_deletefirst!(ILLRef(waitqueue(c), waitee), t)
-    schedule(t, conform_cancellation_request(t.cancellation_request), error=true)
+    schedule(t, conform_cancellation_request(creq), error=true)
     unlock(c)
     return true
 end
 
-function cancel_wait!(c::GenericCondition, t::Task, @nospecialize(val); waitee=c)
-    @assert (@atomic :monotonic t.cancellation_request) !== nothing
+function cancel_wait!(c::GenericCondition, t::Task, @nospecialize(creq), @nospecialize(val); waitee=c)
     lock(c)
     if t.queue !== waitee
         unlock(c)
diff --git a/base/locks-mt.jl b/base/locks-mt.jl
index 237e0d9856996..af8513044c0f3 100644
--- a/base/locks-mt.jl
+++ b/base/locks-mt.jl
@@ -108,3 +108,5 @@ end
 function islocked(l::AbstractSpinLock)
     return (@atomic :monotonic l.owned) != 0
 end
+
+Base.show(io::IO, l::AbstractSpinLock) = print(io, typeof(l), "(", islocked(l) ? "locked" : "unlocked", ")")
diff --git a/base/task.jl b/base/task.jl
index 70b2e258de06f..65192ee44b91c 100644
--- a/base/task.jl
+++ b/base/task.jl
@@ -316,7 +316,7 @@ function _wait(t::Task; expected_cancellation = nothing)
         lock(donenotify)
         try
             while !istaskdone(t) && cancellation_request() === expected_cancellation
-                wait(donenotify; waitee=t)
+                wait(donenotify; waitee=t, expected_cancellation)
             end
         finally
             unlock(donenotify)
@@ -324,7 +324,11 @@ end
     nothing
 end
-cancel_wait!(waitee::Task, waiter::Task) = cancel_wait!(waitee.donenotify, waiter, nothing; waitee)
+
+# We handle cancellation explicitly above - just suppress the error here
+cancel_wait!(waitee::Task, @nospecialize(creq)) = nothing
+cancel_wait!(waitee::Task, waiter::Task, @nospecialize(creq)) =
+    cancel_wait!(waitee.donenotify, waiter, creq, nothing; waitee)
 
 # have `waiter` wait for `t`
 function _wait2(t::Task, waiter::Task)
@@ -367,7 +371,7 @@ Throws a `ConcurrencyViolationError` if `t` is the currently running task, to pr
 """
 function wait(t::Task; throw=true)
     _wait(t)
-    cr = cancellation_request()
+    cr = cancellation_request_or_yield()
     if cr !== nothing
         propagate_cancellation!(t, cr)
     end
@@ -611,7 +615,7 @@ function sync_end(c::Channel{Any})
         r = take!(c)
         if isa(r, Task)
             _wait(r)
-            cr = cancellation_request()
+            cr = cancellation_request_or_yield()
             if cr !== nothing
                 return sync_cancel!(c, r, cr, @isdefined(c_ex) ?
c_ex : CompositeException())
             end
@@ -1423,6 +1427,14 @@ end
     throw(req)
 end
 
+function cancellation_request_raw()
+    ct = current_task()
+    req = @atomic :monotonic ct.cancellation_request
+    req === nothing && return req
+    req = @atomic :acquire ct.cancellation_request
+    return req
+end
+
 """
     cancellation_request()
 
@@ -1431,13 +1443,26 @@ cancellation has been requested.
 
 If a cancellation request is present, it is loaded with acquire semantics.
 """
 function cancellation_request()
-    ct = current_task()
-    req = @atomic :monotonic ct.cancellation_request
-    req === nothing && return req
-    cr = @atomic :acquire ct.cancellation_request
+    cr = cancellation_request_raw()
     return conform_cancellation_request(cr)
 end
 
+"""
+    cancellation_request_or_yield()
+
+Like [`cancellation_request`](@ref), but specifically handles CANCEL_REQUEST_YIELD
+by calling yield internally and re-checking for cancellation requests.
+"""
+function cancellation_request_or_yield()
+    while true
+        _cr = cancellation_request_raw()
+        cr = conform_cancellation_request(_cr)
+        cr !== CANCEL_REQUEST_YIELD && return cr
+        @atomicreplace :sequentially_consistent :monotonic current_task().cancellation_request _cr => nothing
+        yield()
+    end
+end
+
 """
     Core.cancellation_point!()
 
@@ -1451,7 +1476,7 @@ Core.cancellation_point!
 function cancel!(t::Task, crequest=CANCEL_REQUEST_SAFE)
     # TODO: Raise task priority
     @atomic :release t.cancellation_request = crequest
-    # TODO: SYS_membarrier() ?
+    Threads.atomic_fence_heavy()
     # Special case: If the task hasn't started yet at this point, we want to set
     # it up to cancel any waits, but we need to be a bit careful with concurrent
     # starts of the task.
@@ -1465,15 +1490,29 @@ function cancel!(t::Task, crequest=CANCEL_REQUEST_SAFE)
         end
         return
     end
-    # Try to interrupt the task if it's at a cancellation point (has reset_ctx set)
+    # Try to interrupt the task. The barrier above synchronizes with the establishment
+    # of a wait object and guarantees that either:
+    # 1. We have the wait object in t.queue, or
+    # 2. The task saw the cancellation and called (a different method of) cancel_wait!
+    #    itself.
+    # Note that it is possible for both to be true, in which case the task wins
+    # and our call to cancel_wait! will be a no-op after acquiring the waitee lock.
+    #
+    # Additionally, if there is no wait object, either
+    # 1. The task is suspended, but not using our wait object protocol.
+    #    In this case, cancellation will not succeed.
+    # 2. The task is running.
+    #
+    # We can't tell the difference, but we unconditionally try to send the cancellation
+    # signal. If a reset_ctx exists, this will cause the task to be interrupted.
    tid = Threads.threadid(t)
-    if tid != 0
-        ccall(:jl_send_cancellation_signal, Cvoid, (Int16,), (tid - 1) % Int16)
-    end
-    while !istaskdone(t)
+    if !istaskdone(t)
         waitee = t.queue
-        waitee === nothing && (yield(); continue)
-        invokelatest(cancel_wait!, waitee, t) && break
+        if waitee !== nothing
+            invokelatest(cancel_wait!, waitee, t, crequest)
+        elseif tid != 0
+            ccall(:jl_send_cancellation_signal, Cvoid, (Int16,), (tid - 1) % Int16)
+        end
     end
     if t.sticky
         # If this task is sticky, it won't be able to run if the task currently
@@ -1494,7 +1533,7 @@ end
     end
 end
 
-function cancel_wait!(q::StickyWorkqueue, t::Task)
+function cancel_wait!(q::StickyWorkqueue, t::Task, @nospecialize(creq))
     # Tasks in the workqueue are runnable - we do not cancel the wait,
     # but we do need to check whether it's in there
     lock(q.lock)

From 0c85647707fd2de407bf863f095781e0983196ea Mon Sep 17 00:00:00 2001
From: Keno Fischer 
Date: Sat, 6 Dec 2025 08:16:28 +0000
Subject: [PATCH 9/9] Support write cancellation

---
 base/condition.jl  | 20 ++-----------
 base/stream.jl     | 70 ++++++++++++++++++++++++++++++++++++++--------
 base/task.jl       | 25 +++++++++++++++++
 deps/libuv.version |  4 +--
 src/jl_uv.c        |  4 +--
 src/jltypes.c      |  1 -
 6 files changed, 90 insertions(+), 34 deletions(-)

diff --git a/base/condition.jl b/base/condition.jl
index e3aa7a2dc59ed..c3520fdf379a9 100644
--- a/base/condition.jl
+++ b/base/condition.jl
@@ -146,24 +146,10 @@ function wait(c::GenericCondition; first::Bool=false, waitee=c, expected_cancell
     ct = current_task()
     _wait2(c, ct, waitee, first)
 
-@label before_barrier
-    # Synchronize with atomic_fence_heavy in cancel!
-    Threads.atomic_fence_light()
-    # We need to check if we were cancelled and should not suspend if we were.
-    # The fencing above ensures that we either see the cancellation request
-    # or the cancelling task will call cancel_wait! to wake us again.
-    cr = cancellation_request()
+    cr = pre_sleep_cancellation_request()
     if cr !== expected_cancellation
-        if cr === CANCEL_REQUEST_YIELD
-            # We are about to yield anyway, so we can acknowledge the cancellation now.
-            # However, for the integrity of the cancellation_request synchronization,
-            # we must revisit the barrier above and re-check the cancellation request
-            @atomicreplace :acquire_release :monotonic ct.cancellation_request cr => nothing
-            @goto before_barrier
-        else
-            Base.list_deletefirst!(waitqueue(c), ct)
-            return invokelatest(cancel_wait!, waitee, cr)
-        end
+        Base.list_deletefirst!(waitqueue(c), ct)
+        return invokelatest(cancel_wait!, waitee, cr)
     end
     token = unlockall(c.lock)
 
diff --git a/base/stream.jl b/base/stream.jl
index 7b227458ec552..f74b900fd81c3 100644
--- a/base/stream.jl
+++ b/base/stream.jl
@@ -1061,28 +1061,51 @@ end
 uv_write(s::LibuvStream, p::Vector{UInt8}) = GC.@preserve p uv_write(s, pointer(p), UInt(sizeof(p)))
 
 # caller must have acquired the iolock
-function uv_write(s::LibuvStream, p::Ptr{UInt8}, n::UInt)
-    uvw = uv_write_async(s, p, n)
+function uv_write_noncancel(s::LibuvStream, p::Ptr{UInt8}, n::UInt)
+    # Establish the wait object early so that if we get cancelled, we don't
+    # have to go to libuv in the first place.
     ct = current_task()
+    ct.queue = s
+
+    cr = pre_sleep_cancellation_request()
+    if cr !== nothing
+        ct.queue = nothing
+        return 0
+    end
+
+    uvw = uv_write_async(s, p, n)
+    # TODO: If the request was split above, this is wrong
+    ct.next = uvw
+
     preserve_handle(ct)
     sigatomic_begin()
     uv_req_set_data(uvw, ct)
     iolock_end()
-    local status
+    local nwritten
     try
         sigatomic_end()
         # wait for the last chunk to complete (or error)
        # assume that any errors would be sticky,
         # (so we don't need to monitor the error status of the intermediate writes)
-        status = wait()::Cint
+        nwritten = wait()::Csize_t
         sigatomic_begin()
     finally
         # try-finally unwinds the sigatomic level, so need to repeat sigatomic_end
         sigatomic_end()
         iolock_begin()
-        q = ct.queue; q === nothing || Base.list_deletefirst!(q::IntrusiveLinkedList{Task}, ct)
+        if ct.queue === s
+            # Only happens in unexpected error cases. Cancellation queues a proper
+            # callback, which unsets this.
+            ct.next = nothing
+            ct.queue = nothing
+        end
         if uv_req_data(uvw) != C_NULL
-            # uvw is still alive,
+            # uvw is still alive - likely because we got some unexpected throwto
+            # exception. We try to cancel the request to avoid spamming if this
+            # is something the user is looking at. Note that cancellation does
+            # not go through this path and instead returns the number of written
+            # bytes to the caller.
+            ccall(:uv_cancel, Cint, (Ptr{Cvoid},), uvw) # Ignore errors
             # so make sure we won't get spurious notifications later
             uv_req_set_data(uvw, C_NULL)
         else
@@ -1092,10 +1115,27 @@ function uv_write(s::LibuvStream, p::Ptr{UInt8}, n::UInt)
         iolock_end()
         unpreserve_handle(ct)
     end
-    if status < 0
-        throw(_UVError("write", status))
+    return Int(nwritten)
+end
+
+function uv_write(s::LibuvStream, p::Ptr{UInt8}, n::UInt)
+    nb = uv_write_noncancel(s, p, n)
+    @cancel_check
+    @assert nb == n
+    return nb
+end
+
+function cancel_wait!(s::LibuvStream, t::Task, @nospecialize(creq))
+    iolock_begin()
+    if t.queue !== s
+        iolock_end()
+        return false
     end
-    return Int(n)
+    uvw = t.next
+    @assert uvw !== nothing && uvw != C_NULL
+    ccall(:uv_cancel, Cint, (Ptr{Cvoid},), uvw) # Ignore errors
+    iolock_end()
+    return true
 end
 
 # helper function for uv_write that returns the uv_write_t struct for the write
@@ -1111,7 +1151,7 @@ function uv_write_async(s::LibuvStream, p::Ptr{UInt8}, n::UInt)
                     Int32,
                     (Ptr{Cvoid}, Ptr{Cvoid}, UInt, Ptr{Cvoid}, Ptr{Cvoid}),
                     s, p, nwrite, uvw,
-                    @cfunction(uv_writecb_task, Cvoid, (Ptr{Cvoid}, Cint)))
+                    @cfunction(uv_writecb_task, Cvoid, (Ptr{Cvoid}, Cint, Csize_t)))
         if err < 0
             Libc.free(uvw)
             uv_error("write", err)
@@ -1188,12 +1228,18 @@ function write(s::LibuvStream, b::UInt8)
     return write(s, Ref{UInt8}(b))
 end
 
-function uv_writecb_task(req::Ptr{Cvoid}, status::Cint)
+function uv_writecb_task(req::Ptr{Cvoid}, status::Cint, nwritten::Csize_t)
     d = uv_req_data(req)
     if d != C_NULL
         uv_req_set_data(req, C_NULL) # let the Task know we got the writecb
         t = unsafe_pointer_to_objref(d)::Task
-        schedule(t, status)
+        t.next = nothing
+        t.queue = nothing
+        if status != 0 && status != UV_ECANCELED
+            schedule(t, _UVError("write", status); error=true)
+        else
+            schedule(t, nwritten)
+        end
     else
         # no owner for this req, safe to just free it
         Libc.free(req)
diff --git a/base/task.jl b/base/task.jl
index 65192ee44b91c..0d075f293f672 100644
--- a/base/task.jl
+++ b/base/task.jl
@@ -1463,6 +1463,31 @@ function cancellation_request_or_yield()
     end
 end
 
+"""
+    pre_sleep_cancellation_request()
+
+Like [`cancellation_request_or_yield`](@ref), but indicates the caller is about to sleep,
+so yield
requests can be ignored. Additionally, contains necessary synchronization to +ensure that either the cancellation request is visible, or that any potential +cancellation task will see the wait object established by the caller. + +Precondition: The caller must have established a wait object in `current_task().queue`. +""" +function pre_sleep_cancellation_request() + #@assert (@atomic :monotonic current_task().queue) !== nothing + + # Synchronize with atomic_fence_heavy in cancel! + Threads.atomic_fence_light() + + while true + _cr = cancellation_request_raw() + cr = conform_cancellation_request(_cr) + cr !== CANCEL_REQUEST_YIELD && return cr + @atomicreplace :sequentially_consistent :monotonic current_task().cancellation_request _cr => nothing + # The caller is about to sleep, so we are permitted to ignore the yield request. + end +end + """ Core.cancellation_point!() diff --git a/deps/libuv.version b/deps/libuv.version index f80cde8964237..57201f6a82210 100644 --- a/deps/libuv.version +++ b/deps/libuv.version @@ -5,5 +5,5 @@ LIBUV_JLL_NAME := LibUV ## source build LIBUV_VER := 2 -LIBUV_BRANCH=julia-uv2-1.48.0 -LIBUV_SHA1=b21d6d84e46f6c97ecbc8e4e8a8ea6ad98049ea8 +LIBUV_BRANCH=kf/julia-writecancel +LIBUV_SHA1=89b5f68b38698a4151f35b10ba9380363614097a diff --git a/src/jl_uv.c b/src/jl_uv.c index e41b896320693..5a45f9ff998d6 100644 --- a/src/jl_uv.c +++ b/src/jl_uv.c @@ -677,14 +677,14 @@ JL_DLLEXPORT int jl_fs_close(uv_os_fd_t handle) } JL_DLLEXPORT int jl_uv_write(uv_stream_t *stream, const char *data, size_t n, - uv_write_t *uvw, uv_write_cb writecb) + uv_write_t *uvw, uv_write3_cb writecb) { uv_buf_t buf[1]; buf[0].base = (char*)data; buf[0].len = n; JL_UV_LOCK(); JL_SIGATOMIC_BEGIN(); - int err = uv_write(uvw, stream, buf, 1, writecb); + int err = uv_write3(uvw, stream, buf, 1, NULL, 0, writecb); JL_UV_UNLOCK(); JL_SIGATOMIC_END(); return err; diff --git a/src/jltypes.c b/src/jltypes.c index bb5a0beb98494..e80bd6be644c8 100644 --- a/src/jltypes.c +++ b/src/jltypes.c @@ -3829,7 +3829,6 @@ void jl_init_types(void) JL_GC_DISABLED 0, 1, 6); XX(task); jl_value_t *listt = jl_new_struct(jl_uniontype_type, jl_task_type, jl_nothing_type); - jl_svecset(jl_task_type->types, 0, listt); // Set field 20 (metrics_enabled) as const // Set fields 8 (_state) and 24-27 (metric counters), 28 (cancellation_request) as atomic const static uint32_t task_constfields[1] = { 0b00000000000010000000000000000000 };
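
The wait/cancel handshake used throughout the last two patches follows one pattern: the
waiter publishes its wait object, issues `Threads.atomic_fence_light()`, then re-checks
the cancellation request; the canceller publishes the request, issues
`Threads.atomic_fence_heavy()`, then re-checks the wait object. The sketch below
illustrates that pattern in isolation. It is not part of the patches: `Handshake`,
`waiting`, and `cancel` are hypothetical stand-ins for `t.queue` and
`t.cancellation_request`, and only the two fence functions from the first patch are
assumed.

    # Dekker-style handshake built on the asymmetric fences (standalone sketch).
    mutable struct Handshake
        @atomic waiting::Bool  # stand-in for "task enqueued itself on a wait queue"
        @atomic cancel::Bool   # stand-in for "a cancellation request is pending"
    end

    # Waiter (hot path): publish the wait object, cheap fence, then check for a
    # pending request. If this load misses, the canceller is guaranteed to see
    # waiting == true and will wake the waiter through its cancel_wait! path.
    function waiter_prepares_to_sleep(h::Handshake)
        @atomic :monotonic h.waiting = true
        Threads.atomic_fence_light()
        return @atomic :monotonic h.cancel   # true => do not suspend
    end

    # Canceller (cold path): publish the request, expensive fence, then check
    # whether the waiter has already parked itself.
    function canceller_requests_cancel(h::Handshake)
        @atomic :monotonic h.cancel = true
        Threads.atomic_fence_heavy()
        return @atomic :monotonic h.waiting  # true => must wake the waiter ourselves
    end

Whatever the interleaving, the two loads cannot both return false, which is exactly the
property the comments in `cancel!` and `pre_sleep_cancellation_request` rely on.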