Merged
Changes from all commits
21 commits
bbd3e4c
[https://nvbugs/5569713][fix] Disable fp8 deep gemm for EXAONE-4.0-32…
JunyiXu-nv Oct 21, 2025
a7cda1d
[https://nvbugs/5515753][ci] Add NCCL_DEBUG=INFO flag to collect more…
SimengLiu-nv Oct 22, 2025
67177a2
[https://nvbugs/5504095][fix] Unwaive test_user_specify_workspace cas…
nv-guomingz Oct 22, 2025
c44f102
[https://nvbugs/5546510][fix] Move torch.cuda.Stream out of torch com…
liji-nv Oct 22, 2025
8a1d383
[https://nvbugs/5565549][fix] unwaive test_disaggregated_spec_dec_bat…
bo-nv Oct 22, 2025
6e13411
[https://nvbugs/5575829][fix] Unwaive gpt-oss test (#8576)
LinPoly Oct 22, 2025
706dfb5
[https://nvbugs/5569754][fix] trtllm-llmapi-launch port conflict (#8…
Superjomn Oct 23, 2025
385cb92
[https://nvbugs/5582277][fix] rework DisaggPPTerminationHandler to fi…
reasonsolo Oct 23, 2025
b958574
[https://nvbugs/5575902][fix] set max_batch_size=1 to stabilize accur…
reasonsolo Oct 23, 2025
9c721e0
[https://nvbugs/5587456][fix] Remove multimodal test cases using TRT …
jieli-matrix Oct 24, 2025
14b6df1
[None][test] Clean cache for certain easily hang cases (#8619)
crazydemo Oct 24, 2025
95f676e
[https://nvbugs/5608489][fix] Fix output unpack issues for Llama3/4 N…
hyukn Oct 28, 2025
682e956
[https://nvbugs/5572320][fix] Ported test_ad_trtllm_bench.py from mai…
MrGeva Oct 28, 2025
17459b7
[https://nvbugs/5564465][fix] Overwrite only if default_max_tokens is…
LinPoly Oct 28, 2025
9bbb404
[https://nvbugs/5578175][fix] Fix block range index (#8470)
chuangz0 Oct 28, 2025
49961ca
[https://nvbugs/5601203] [fix]Restrict fp8 blockscale moe case (#8583)
VALLIS-NERIA Oct 29, 2025
5ceb732
[https://nvbugs/5575841] [test] Move test_moe.py to serial tests to i…
DomBrown Oct 30, 2025
a95598b
[https://nvbugs/5488118][fix] Unwaive passed tests (#8758)
liji-nv Oct 31, 2025
b75bf44
[None][infra] Remove invaild waived tests which not in release branch…
ZhanruiSunCh Oct 31, 2025
9010698
[https://nvbugs/5325296][fix] Enable relaxed acceptance test on Black…
Barry-Delaney Oct 31, 2025
7dec606
[None][chore] Update linter rules for mass integration
mikeiovine Nov 4, 2025
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -1059,7 +1059,6 @@ common-files: &common_files |
tests/unittest/_torch/thop/parallel/test_logits_bitmask_op.py |
tests/unittest/_torch/thop/parallel/test_mamba_conv1d_op.py |
tests/unittest/_torch/thop/parallel/test_mamba2_chunk_ss_update.py |
tests/unittest/_torch/thop/parallel/test_moe.py |
tests/unittest/_torch/thop/parallel/test_noaux_tc.py |
tests/unittest/_torch/thop/parallel/test_scaled_mm.py |
tests/unittest/_torch/thop/parallel/test_selective_scan_op.py |
@@ -1071,6 +1070,7 @@ common-files: &common_files |
tests/unittest/_torch/thop/parallel/test_weight_only_quant_gemm.py |
tests/unittest/_torch/thop/parallel/test_weight_only_quant_linear.py |
tests/unittest/_torch/thop/serial/test_moe_alltoall.py |
tests/unittest/_torch/thop/serial/test_moe.py |
tests/unittest/api_stability/api_stability_core.py |
tests/unittest/api_stability/test_llm_api.py |
tests/unittest/bindings/binding_test_utils.py |
10 changes: 8 additions & 2 deletions cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp
@@ -2145,7 +2145,7 @@ SizeType32 KVCacheManager::getNeededBlocksOneStep(
return 0;
}

auto const numCurrTokens = mSequences.at(req.mRequestId).getNumTokens();
auto const numCurrTokens = getSequence(req.mRequestId).getNumTokens();
auto const generatedTokens = numCurrTokens - req.getPromptLen();
auto const maxTokensToAddToKVCache = req.mMaxNewTokens - generatedTokens;
auto const tokensPerStep = req.getNumDraftTokens() + 1;
@@ -2409,7 +2409,13 @@ void KVCacheManager::addSequence(
void KVCacheManager::storeContextBlocks(LlmRequest const& llmRequest)
{
auto const requestId = llmRequest.mRequestId;
if (mSequences.find(requestId) != mSequences.end())
bool found = false;
{
// protect the mSequences
std::scoped_lock lock(mSequencesMtx);
found = mSequences.find(requestId) != mSequences.end();
}
if (found)
{
auto& sequence = getSequence(requestId);
if (mEnableBlockReuse && !llmRequest.isDummyRequest())
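Note on the change above: the mutex is held only for the membership check, and the follow-up work goes through getSequence(), which takes the lock itself. The Python sketch below is illustrative only and not part of the diff; the class and method names are hypothetical, it just mirrors the locking pattern.

import threading

class SequenceTableSketch:
    """Minimal stand-in for KVCacheManager's sequence bookkeeping."""

    def __init__(self):
        self._sequences = {}                     # request_id -> sequence state
        self._sequences_lock = threading.Lock()

    def get_sequence(self, request_id):
        # Accessor that locks internally, analogous to getSequence() above.
        with self._sequences_lock:
            return self._sequences[request_id]

    def store_context_blocks(self, request_id):
        # Hold the lock only for the membership check ...
        with self._sequences_lock:
            found = request_id in self._sequences
        # ... and do the longer-running work outside the critical section.
        if found:
            sequence = self.get_sequence(request_id)
            # store/reuse blocks for `sequence` here
            return sequence
        return None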
13 changes: 13 additions & 0 deletions jenkins/L0_Test.groovy
@@ -2209,6 +2209,19 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
def noIsolateTests = false
def rerunFailed = false

echoNodeAndGpuInfo(pipeline, stageName)
sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C || true; fi'

def extraInternalEnv = ""
def pytestTestTimeout = "3600"

// TRT uses half of the host logic cores for engine building which is bad for multi-GPU machines.
extraInternalEnv = "__LUNOWUD=\"-thread_pool_size=${TESTER_CORES}\""
// CPP test execution is timing out easily, so we always override its internal timeout to the same value as pytest
extraInternalEnv += " CPP_TEST_TIMEOUT_OVERRIDDEN=${pytestTestTimeout}"
// Enable NCCL debug information for multi-GPU tests
extraInternalEnv += " NCCL_DEBUG=INFO"

def testDBList = renderTestDB(testList, llmSrc, stageName)

// Process shard test list and create separate files for regular and isolate tests
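The three additions above end up as plain environment variables on the test process: a TRT builder thread-pool cap, a C++ test timeout aligned with the pytest timeout, and NCCL debug logging. Below is a hedged Python sketch of an equivalent launcher; it is hypothetical (the real wiring stays in the Groovy stage) and assumes the pytest-timeout plugin is installed.

import os
import subprocess

# Stand-ins for the Jenkins variables used above.
TESTER_CORES = 8
PYTEST_TEST_TIMEOUT = "3600"

env = dict(os.environ)
# Cap the TRT engine-building thread pool so it does not take half the host cores.
env["__LUNOWUD"] = f"-thread_pool_size={TESTER_CORES}"
# Override the internal C++ test timeout to match the pytest timeout.
env["CPP_TEST_TIMEOUT_OVERRIDDEN"] = PYTEST_TEST_TIMEOUT
# Collect NCCL debug information for multi-GPU tests.
env["NCCL_DEBUG"] = "INFO"

subprocess.run(["pytest", f"--timeout={PYTEST_TEST_TIMEOUT}"], env=env, check=False)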
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1099,7 +1099,6 @@ exclude = [
"tests/unittest/_torch/thop/parallel/test_logits_bitmask_op.py",
"tests/unittest/_torch/thop/parallel/test_mamba_conv1d_op.py",
"tests/unittest/_torch/thop/parallel/test_mamba2_chunk_ss_update.py",
"tests/unittest/_torch/thop/parallel/test_moe.py",
"tests/unittest/_torch/thop/parallel/test_noaux_tc.py",
"tests/unittest/_torch/thop/parallel/test_scaled_mm.py",
"tests/unittest/_torch/thop/parallel/test_selective_scan_op.py",
@@ -1111,6 +1110,7 @@ exclude = [
"tests/unittest/_torch/thop/parallel/test_weight_only_quant_gemm.py",
"tests/unittest/_torch/thop/parallel/test_weight_only_quant_linear.py",
"tests/unittest/_torch/thop/serial/test_moe_alltoall.py",
"tests/unittest/_torch/thop/serial/test_moe.py",
"tests/unittest/api_stability/api_stability_core.py",
"tests/unittest/api_stability/test_llm_api.py",
"tests/unittest/bindings/binding_test_utils.py",
12 changes: 4 additions & 8 deletions tensorrt_llm/_torch/compilation/backend.py
@@ -51,9 +51,7 @@ def __init__(
self.capture_num_tokens = sorted(capture_num_tokens or [])
self.piecewise_cuda_graph = enable_piecewise_cuda_graph
self.no_optimization = False
# We only need to create aux streams.
self.aux_streams = Backend.Streams(
[torch.cuda.Stream() for _ in range(max_num_streams - 1)])
self.num_streams = max_num_streams
self.events = Backend.Events()
inductor_config.enable_auto_functionalized_v2 = False

@@ -109,10 +107,8 @@ def optimize(
# Do not apply multi-stream if enable piecewise cuda graph or inductor
# For piecewise cuda graph, we will apply the multi-stream optimization in piecewise_optimizer
# For inductor, we do not control the passes inside inductor.
if len(
self.aux_streams
) > 0 and not self.piecewise_cuda_graph and not self.enable_inductor:
num_events = multi_stream_schedule(gm, len(self.aux_streams) + 1)
if self.num_streams > 1 and not self.piecewise_cuda_graph and not self.enable_inductor:
num_events = multi_stream_schedule(gm, self.num_streams)
self.generate_events(num_events)

gm.recompile()
@@ -125,7 +121,7 @@ def optimize(
self.input_num_tokens,
self.capture_num_tokens,
self._graph_pool_handle,
len(self.aux_streams) + 1,
self.num_streams,
)
self.generate_events(num_events)
return gm
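With this change the backend only records the requested stream count and no longer owns the CUDA streams (the companion change in model_engine.py further down creates them). The gating condition reduces to a comparison on num_streams; the minimal sketch below restates that decision with the scheduler stubbed out.

def multi_stream_schedule(gm, total_streams):
    # Stand-in for the real scheduler; returns how many sync events it created.
    return 0

def maybe_multi_stream(gm, num_streams, piecewise_cuda_graph, enable_inductor):
    """Apply multi-stream scheduling only on the path the backend controls.

    num_streams already counts the main stream plus the auxiliary ones, so the
    old expression len(aux_streams) + 1 becomes num_streams directly.
    """
    if num_streams > 1 and not piecewise_cuda_graph and not enable_inductor:
        return multi_stream_schedule(gm, num_streams)
    return 0  # no extra events needed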
4 changes: 2 additions & 2 deletions tensorrt_llm/_torch/distributed/communicator.py
@@ -405,8 +405,8 @@ def tp_broadcast(self, obj, root=0, chunk_size: int = 4 * 1024 * 1024):
def pp_allgather(self, obj):
return self.pp_comm.allgather(obj)

def pp_gather(self, obj):
return self.pp_comm.gather(obj)
def pp_gather(self, obj, root=0):
return self.pp_comm.gather(obj, root=root)

def pp_broadcast(self, obj, root=0):
return self.pp_comm.bcast(obj, root)
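pp_gather now forwards an explicit root, matching mpi4py gather semantics: only the root rank receives the gathered list, every other rank gets None. A usage sketch follows; dist and its pp_rank/pp_size accessors are assumptions standing in for the surrounding communicator object.

# dist is a communicator wrapper exposing pp_gather(); names here are illustrative.
local_stats = {"generated_tokens": 128}

gathered = dist.pp_gather(local_stats, root=0)
if dist.pp_rank == 0:
    # Root rank: one entry per pipeline-parallel rank.
    assert len(gathered) == dist.pp_size
else:
    # Non-root ranks receive None from gather.
    assert gathered is None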
14 changes: 13 additions & 1 deletion tensorrt_llm/_torch/models/modeling_exaone4.py
@@ -5,6 +5,7 @@

from tensorrt_llm._torch.modules.qk_norm_attention import QKNormRoPEAttention
from tensorrt_llm.functional import PositionEmbeddingType
from tensorrt_llm.quantization import QuantAlgo

from ..attention_backend import AttentionMetadata
from ..attention_backend.interface import (PositionalEmbeddingParams,
@@ -54,7 +55,8 @@ class Exaone4Attention(QKNormRoPEAttention):
def __init__(self,
model_config: ModelConfig[Exaone4Config],
layer_idx: Optional[int] = None,
fuse_qk_norm_rope: bool = False):
fuse_qk_norm_rope: bool = False,
disable_deep_gemm: bool = False):
config = model_config.pretrained_config

self.attention_window_size = None
@@ -88,6 +90,7 @@ def __init__(self,
layer_idx=layer_idx,
dtype=config.torch_dtype,
config=model_config,
disable_deep_gemm=disable_deep_gemm,
)

def forward(
@@ -128,9 +131,17 @@ def __init__(
self.is_quanted = model_config.quant_config and model_config.quant_config.quant_mode.has_any_quant(
)

disable_deep_gemm = False
quant_config = getattr(model_config, "quant_config", None)
if quant_config is not None:
# EXAONE4 fp8 has an illegal memory access issue with deep_gemm.
disable_deep_gemm = getattr(quant_config, "quant_algo",
None) == QuantAlgo.FP8_BLOCK_SCALES

self.self_attn = Exaone4Attention(
model_config,
layer_idx=layer_idx,
disable_deep_gemm=disable_deep_gemm,
)

self.mlp = GatedMLP(
@@ -140,6 +151,7 @@ def __init__(
dtype=config.torch_dtype,
config=model_config,
layer_idx=layer_idx,
disable_deep_gemm=disable_deep_gemm,
)

self.post_attention_layernorm = RMSNorm(hidden_size=config.hidden_size,
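The detection added above is a single attribute chain on the model config; the resulting flag is then threaded through both the attention and the MLP constructors. A hedged sketch of the same check as a standalone helper (the helper name is hypothetical):

from tensorrt_llm.quantization import QuantAlgo

def should_disable_deep_gemm(model_config) -> bool:
    """EXAONE-4.0 FP8 block-scale checkpoints hit an illegal memory access in
    deep_gemm, so those layers fall back to the non-deep_gemm GEMM path."""
    quant_config = getattr(model_config, "quant_config", None)
    if quant_config is None:
        return False
    return getattr(quant_config, "quant_algo", None) == QuantAlgo.FP8_BLOCK_SCALES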
4 changes: 2 additions & 2 deletions tensorrt_llm/_torch/models/modeling_llama.py
@@ -600,7 +600,7 @@ def forward(
))

# Unpack the allreduce output
if self.next_attn is not None and self.is_nvfp4:
if self.post_feed_forward_fusion_op == AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4:
act_fp4, act_sf, residual = allreduce_output
hidden_states = Fp4QuantizedTensor(act_fp4, act_sf)
else:
@@ -791,7 +791,7 @@ def forward(
scale=scale,
eps=self.next_layer_layernorm.variance_epsilon,
))
if self.next_attn is not None and self.is_nvfp4:
if self.post_mlp_fusion_op == AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4:
act_fp4, act_sf, residual = all_reduce_output
hidden_states = Fp4QuantizedTensor(act_fp4, act_sf)
else:
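Both hunks replace the indirect check (next_attn is not None and is_nvfp4) with a check on the fusion op that was actually recorded, so the unpacking always matches how the fused allreduce packed its output. The schematic sketch below restates that branch; the enum value is passed in as a parameter rather than imported, to avoid guessing the import path.

def unpack_allreduce_output(allreduce_output, fusion_op, nvfp4_fusion_op):
    """Unpack according to the recorded fusion op.

    nvfp4_fusion_op stands for AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4;
    the FP4 path returns (quantized activations, scale factors, residual),
    every other path returns (hidden states, residual).
    """
    if fusion_op == nvfp4_fusion_op:
        act_fp4, act_sf, residual = allreduce_output
        hidden_states = (act_fp4, act_sf)  # wrapped in Fp4QuantizedTensor in the real code
    else:
        hidden_states, residual = allreduce_output
    return hidden_states, residual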
7 changes: 5 additions & 2 deletions tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -268,6 +268,10 @@ def __init__(
use_ub = not use_ub_for_nccl and (
torch_compile_enable_userbuffers
and self._init_userbuffers(self.model.config.hidden_size))
self.backend_num_streams = Backend.Streams([
torch.cuda.Stream()
for _ in range(torch_compile_max_num_streams - 1)
])
self._torch_compile_backend = Backend(
torch_compile_inductor_enabled,
enable_userbuffers=use_ub,
@@ -2658,8 +2662,7 @@ def model_forward(self, **kwargs):
if self._torch_compile_backend is not None:
# Register aux streams and events to model extra attrs.
# The streams and events are list which could be updated during compilation.
attrs["aux_streams"] = weakref.ref(
self._torch_compile_backend.aux_streams)
attrs["aux_streams"] = weakref.ref(self.backend_num_streams)
attrs["events"] = weakref.ref(self._torch_compile_backend.events)
attrs["global_stream"] = torch.cuda.current_stream()

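Here the engine owns the auxiliary CUDA streams and exposes them to the model through a weak reference, while the backend only receives a count. Plain Python lists cannot be weak-referenced, which is presumably why Backend.Streams is a list subclass. A small sketch of that detail (stream objects are stand-ins):

import weakref

class Streams(list):
    """list subclass so instances accept weak references (a bare list raises TypeError)."""

# Hypothetical stand-ins for torch.cuda.Stream objects.
aux_streams = Streams(object() for _ in range(3))  # max_num_streams - 1 auxiliary streams

attrs = {}
attrs["aux_streams"] = weakref.ref(aux_streams)

# Dereferencing returns the live list, or None once the owner drops it.
assert attrs["aux_streams"]() is aux_streams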