From a66817061323bebd8a9053c90ce39cee616693bb Mon Sep 17 00:00:00 2001
From: Sergey Subbotin
Date: Sun, 12 Apr 2026 13:43:29 +0200
Subject: [PATCH 1/4] fix: register default global_scratch allocator on Blackwell GPUs

On Blackwell (SM 10.0+), the Triton compiler emits global_scratch memory
for autotuned kernels even when TMA is not used (FLA_USE_TMA=0). Without
an allocator registered, this causes NullAllocator crashes during kernel
autotuning, which corrupts CUDA synchronization state and leads to
process deadlocks.

The existing allocator registration only runs when IS_TMA_SUPPORTED is
True (requires FLA_USE_TMA=1). This change also registers the allocator
on Blackwell when TMA is disabled, since the compiler still needs
scratch space for other purposes on SM 10.0+.

Fixes deadlocks when running MoE+Mamba models (Qwen3-Coder-Next,
Qwen3.5) on Blackwell GPUs via vLLM.

See: triton-lang/triton#10002
---
 fla/utils.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/fla/utils.py b/fla/utils.py
index 4d9810d734..5330b29c7f 100644
--- a/fla/utils.py
+++ b/fla/utils.py
@@ -479,13 +479,18 @@ def map_triton_backend_to_torch_device() -> str:
     # This is a workaround for old nvidia card.
     os.environ['TRITON_F32_DEFAULT'] = 'ieee'
 
+def _default_alloc_fn(size: int, alignment: int, stream: int | None):
+    return torch.empty(size, device=torch.device(device_name, device_torch_lib.current_device()), dtype=torch.int8)
+
 if IS_TMA_SUPPORTED:
     logger.info('TMA is supported, using TMA by default.')
-
-    def alloc_fn(size: int, alignment: int, stream: int | None):
-        return torch.empty(size, device=torch.device(device_name, device_torch_lib.current_device()), dtype=torch.int8)
-
-    triton.set_allocator(alloc_fn)
+    triton.set_allocator(_default_alloc_fn)
+elif IS_NVIDIA and torch.cuda.get_device_capability(0)[0] >= 10:
+    # Blackwell (SM 10.0+): Triton compiler may emit global_scratch for
+    # autotuned kernels even without TMA. Register a default allocator to
+    # prevent NullAllocator crashes. See triton-lang/triton#10002.
+    logger.info('Blackwell detected: registering default global_scratch allocator.')
+    triton.set_allocator(_default_alloc_fn)
 
 
 def get_all_max_shared_mem():

From 27229be24bb1234ceedb3e19284af96332dd3f63 Mon Sep 17 00:00:00 2001
From: Sergey Subbotin
Date: Sun, 12 Apr 2026 13:46:13 +0200
Subject: [PATCH 2/4] style: fix autopep8 blank lines

---
 fla/utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fla/utils.py b/fla/utils.py
index 5330b29c7f..0d1fe14cfb 100644
--- a/fla/utils.py
+++ b/fla/utils.py
@@ -479,9 +479,11 @@ def map_triton_backend_to_torch_device() -> str:
     # This is a workaround for old nvidia card.
     os.environ['TRITON_F32_DEFAULT'] = 'ieee'
 
+
 def _default_alloc_fn(size: int, alignment: int, stream: int | None):
     return torch.empty(size, device=torch.device(device_name, device_torch_lib.current_device()), dtype=torch.int8)
 
+
 if IS_TMA_SUPPORTED:
     logger.info('TMA is supported, using TMA by default.')
     triton.set_allocator(_default_alloc_fn)

From 84f040aea895c0cb0fbebdb151f32a304cffa2da Mon Sep 17 00:00:00 2001
From: Sergey Subbotin
Date: Sun, 12 Apr 2026 13:46:59 +0200
Subject: [PATCH 3/4] fix: use current device for capability check (review feedback)

---
 fla/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fla/utils.py b/fla/utils.py
index 0d1fe14cfb..3d68d06846 100644
--- a/fla/utils.py
+++ b/fla/utils.py
@@ -487,7 +487,7 @@ def _default_alloc_fn(size: int, alignment: int, stream: int | None):
 if IS_TMA_SUPPORTED:
     logger.info('TMA is supported, using TMA by default.')
     triton.set_allocator(_default_alloc_fn)
-elif IS_NVIDIA and torch.cuda.get_device_capability(0)[0] >= 10:
+elif IS_NVIDIA and torch.cuda.get_device_capability()[0] >= 10:
     # Blackwell (SM 10.0+): Triton compiler may emit global_scratch for
     # autotuned kernels even without TMA. Register a default allocator to
     # prevent NullAllocator crashes. See triton-lang/triton#10002.

From c426461cd283c8d5bbdc82086cdd6c2a5afc7069 Mon Sep 17 00:00:00 2001
From: Sergey Subbotin
Date: Mon, 13 Apr 2026 00:37:59 +0200
Subject: [PATCH 4/4] refactor: use IS_NVIDIA_BLACKWELL constant, update to >= 10 (review feedback)

- Use shared IS_NVIDIA_BLACKWELL constant instead of inline capability
  check
- Change IS_NVIDIA_BLACKWELL from == 10 to >= 10 for forward
  compatibility with future NVIDIA architectures beyond Blackwell
- Addresses CodeRabbit and Gemini review feedback
---
 fla/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fla/utils.py b/fla/utils.py
index 3d68d06846..e7fd6d6748 100644
--- a/fla/utils.py
+++ b/fla/utils.py
@@ -464,7 +464,7 @@ def map_triton_backend_to_torch_device() -> str:
 IS_NVIDIA = (device_platform == 'cuda')
 IS_INTEL_ALCHEMIST = (IS_INTEL and 'Intel(R) Arc(TM) A' in torch.xpu.get_device_name(0))
 IS_NVIDIA_HOPPER = (IS_NVIDIA and ('NVIDIA H' in torch.cuda.get_device_name(0) or torch.cuda.get_device_capability()[0] >= 9))
-IS_NVIDIA_BLACKWELL = (IS_NVIDIA and torch.cuda.get_device_capability()[0] == 10)
+IS_NVIDIA_BLACKWELL = (IS_NVIDIA and torch.cuda.get_device_capability()[0] >= 10)
 USE_CUDA_GRAPH = (IS_NVIDIA and os.environ.get('FLA_USE_CUDA_GRAPH', '0') == '1')
 
 # Nvidia Ampere or newer, haven't check AMD and intel yet.
@@ -487,7 +487,7 @@ def _default_alloc_fn(size: int, alignment: int, stream: int | None):
 if IS_TMA_SUPPORTED:
     logger.info('TMA is supported, using TMA by default.')
     triton.set_allocator(_default_alloc_fn)
-elif IS_NVIDIA and torch.cuda.get_device_capability()[0] >= 10:
+elif IS_NVIDIA_BLACKWELL:
     # Blackwell (SM 10.0+): Triton compiler may emit global_scratch for
     # autotuned kernels even without TMA. Register a default allocator to
     # prevent NullAllocator crashes. See triton-lang/triton#10002.