From a66817061323bebd8a9053c90ce39cee616693bb Mon Sep 17 00:00:00 2001
From: Sergey Subbotin <ssubbotin@gmail.com>
Date: Sun, 12 Apr 2026 13:43:29 +0200
Subject: [PATCH 1/4] fix: register default global_scratch allocator on
 Blackwell GPUs

On Blackwell (SM 10.0+), Triton-compiled autotuned kernels require
global_scratch memory even when TMA is not used (FLA_USE_TMA=0).
Without an allocator registered through triton.set_allocator, this
causes NullAllocator crashes during kernel autotuning, which in turn
corrupt CUDA synchronization state and lead to process deadlocks.

The existing allocator registration only runs when IS_TMA_SUPPORTED
is True (which requires FLA_USE_TMA=1). This change hoists the nested
alloc_fn to a module-level _default_alloc_fn and also registers it on
Blackwell when TMA is disabled, since the compiler still needs scratch
space for other purposes on SM 10.0+.

Fixes deadlocks when running MoE+Mamba models (Qwen3-Coder-Next,
Qwen3.5) on Blackwell GPUs via vLLM.

See: triton-lang/triton#10002
---
 fla/utils.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/fla/utils.py b/fla/utils.py
index 4d9810d734..5330b29c7f 100644
--- a/fla/utils.py
+++ b/fla/utils.py
@@ -479,13 +479,18 @@ def map_triton_backend_to_torch_device() -> str:
     # This is a workaround for old nvidia card.
     os.environ['TRITON_F32_DEFAULT'] = 'ieee'
 
+def _default_alloc_fn(size: int, alignment: int, stream: int | None):
+    return torch.empty(size, device=torch.device(device_name, device_torch_lib.current_device()), dtype=torch.int8)
+
 if IS_TMA_SUPPORTED:
     logger.info('TMA is supported, using TMA by default.')
-
-    def alloc_fn(size: int, alignment: int, stream: int | None):
-        return torch.empty(size, device=torch.device(device_name, device_torch_lib.current_device()), dtype=torch.int8)
-
-    triton.set_allocator(alloc_fn)
+    triton.set_allocator(_default_alloc_fn)
+elif IS_NVIDIA and torch.cuda.get_device_capability(0)[0] >= 10:
+    # Blackwell (SM 10.0+): Triton compiler may emit global_scratch for
+    # autotuned kernels even without TMA. Register a default allocator to
+    # prevent NullAllocator crashes. See triton-lang/triton#10002.
+    logger.info('Blackwell detected: registering default global_scratch allocator.')
+    triton.set_allocator(_default_alloc_fn)
 
 
 def get_all_max_shared_mem():

From 27229be24bb1234ceedb3e19284af96332dd3f63 Mon Sep 17 00:00:00 2001
From: Sergey Subbotin <ssubbotin@gmail.com>
Date: Sun, 12 Apr 2026 13:46:13 +0200
Subject: [PATCH 2/4] style: fix autopep8 blank lines

---
 fla/utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fla/utils.py b/fla/utils.py
index 5330b29c7f..0d1fe14cfb 100644
--- a/fla/utils.py
+++ b/fla/utils.py
@@ -479,9 +479,11 @@ def map_triton_backend_to_torch_device() -> str:
     # This is a workaround for old nvidia card.
     os.environ['TRITON_F32_DEFAULT'] = 'ieee'
 
+
 def _default_alloc_fn(size: int, alignment: int, stream: int | None):
     return torch.empty(size, device=torch.device(device_name, device_torch_lib.current_device()), dtype=torch.int8)
 
+
 if IS_TMA_SUPPORTED:
     logger.info('TMA is supported, using TMA by default.')
     triton.set_allocator(_default_alloc_fn)

From 84f040aea895c0cb0fbebdb151f32a304cffa2da Mon Sep 17 00:00:00 2001
From: Sergey Subbotin <ssubbotin@gmail.com>
Date: Sun, 12 Apr 2026 13:46:59 +0200
Subject: [PATCH 3/4] fix: use current device for capability check (review
 feedback)

---
 fla/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fla/utils.py b/fla/utils.py
index 0d1fe14cfb..3d68d06846 100644
--- a/fla/utils.py
+++ b/fla/utils.py
@@ -487,7 +487,7 @@ def _default_alloc_fn(size: int, alignment: int, stream: int | None):
 if IS_TMA_SUPPORTED:
     logger.info('TMA is supported, using TMA by default.')
     triton.set_allocator(_default_alloc_fn)
-elif IS_NVIDIA and torch.cuda.get_device_capability(0)[0] >= 10:
+elif IS_NVIDIA and torch.cuda.get_device_capability()[0] >= 10:
     # Blackwell (SM 10.0+): Triton compiler may emit global_scratch for
     # autotuned kernels even without TMA. Register a default allocator to
     # prevent NullAllocator crashes. See triton-lang/triton#10002.

From c426461cd283c8d5bbdc82086cdd6c2a5afc7069 Mon Sep 17 00:00:00 2001
From: Sergey Subbotin <ssubbotin@gmail.com>
Date: Mon, 13 Apr 2026 00:37:59 +0200
Subject: [PATCH 4/4] refactor: use IS_NVIDIA_BLACKWELL constant, update to >=
 10 (review feedback)

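- Use the shared IS_NVIDIA_BLACKWELL constant instead of an inline
  capability check
- Change IS_NVIDIA_BLACKWELL from == 10 to >= 10 for forward compatibility
  with future NVIDIA architectures beyond Blackwell. Note that this is a
  behavior change, not only a refactor: any other code that reads the
  constant will now also see True on compute capability 11 and above
- Addresses CodeRabbit and Gemini review feedback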
---
 fla/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fla/utils.py b/fla/utils.py
index 3d68d06846..e7fd6d6748 100644
--- a/fla/utils.py
+++ b/fla/utils.py
@@ -464,7 +464,7 @@ def map_triton_backend_to_torch_device() -> str:
 IS_NVIDIA = (device_platform == 'cuda')
 IS_INTEL_ALCHEMIST = (IS_INTEL and 'Intel(R) Arc(TM) A' in torch.xpu.get_device_name(0))
 IS_NVIDIA_HOPPER = (IS_NVIDIA and ('NVIDIA H' in torch.cuda.get_device_name(0) or torch.cuda.get_device_capability()[0] >= 9))
-IS_NVIDIA_BLACKWELL = (IS_NVIDIA and torch.cuda.get_device_capability()[0] == 10)
+IS_NVIDIA_BLACKWELL = (IS_NVIDIA and torch.cuda.get_device_capability()[0] >= 10)
 USE_CUDA_GRAPH = (IS_NVIDIA and os.environ.get('FLA_USE_CUDA_GRAPH', '0') == '1')
 
 # Nvidia Ampere or newer, haven't check AMD and intel yet.
@@ -487,7 +487,7 @@ def _default_alloc_fn(size: int, alignment: int, stream: int | None):
 if IS_TMA_SUPPORTED:
     logger.info('TMA is supported, using TMA by default.')
     triton.set_allocator(_default_alloc_fn)
-elif IS_NVIDIA and torch.cuda.get_device_capability()[0] >= 10:
+elif IS_NVIDIA_BLACKWELL:
     # Blackwell (SM 10.0+): Triton compiler may emit global_scratch for
     # autotuned kernels even without TMA. Register a default allocator to
     # prevent NullAllocator crashes. See triton-lang/triton#10002.