3 changes: 3 additions & 0 deletions tensorrt_llm/_torch/auto_deploy/config/default.yaml
@@ -163,6 +163,9 @@ transforms:
 ############################################################################################
   fuse_causal_conv_activation:
     stage: compile
+  multi_stream_moe:
+    stage: compile
+    enabled: false
   compile_model:
     stage: compile
     run_per_gm: false
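Note: the transform ships disabled by default in default.yaml, so callers opt in by overriding the entry, as the integration test below does. A minimal sketch of such an override, assuming the transform entries sit under a transforms keyword passed through to AutoDeployLLM (the "transforms" nesting and the model path are assumptions; only the {"stage": "compile", "enabled": True} entry shape appears in this PR):

# Hypothetical opt-in sketch. The transforms= kwarg and model path are
# assumptions inferred from the test below; they are not shown in this diff.
llm = AutoDeployLLM(
    model="/path/to/model",
    transforms={
        "multi_stream_moe": {
            "stage": "compile",
            "enabled": True,  # default.yaml ships enabled: false
        },
    },
)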
7 changes: 7 additions & 0 deletions tests/integration/defs/accuracy/test_llm_api_autodeploy.py
@@ -181,6 +181,10 @@ def get_default_kwargs(self):
             "sharding_source": ['factory', 'heuristic'],
             "sharding_dims": ['ep', 'bmm'],
         },
+        "multi_stream_moe": {
+            "stage": "compile",
+            "enabled": True,
+        },
         # NOTE: some accuracy benchmarks may require fp32 precision for mamba cache
         # "insert_cached_ssm_attention": {
         #     "cache_config": {
@@ -201,6 +205,9 @@ def get_default_sampling_params(self):
     @pytest.mark.skip_less_device_memory(32000)
     def test_bf16(self):
         kwargs = self.get_default_kwargs()
+        # TODO: multi-stream MOE seems to increase the memory usage
+        kwargs["max_batch_size"] = 32
+        kwargs["free_mem_ratio"] = 0.5
         sampling_params = self.get_default_sampling_params()
         with AutoDeployLLM(model=self.MODEL_PATH_BF16,
                            tokenizer=self.MODEL_PATH_BF16,
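For context on the TODO above: a conceptual sketch of the kind of stream overlap a multi-stream MoE transform implies (an illustration of the general technique, not the transform's actual implementation, which this diff does not show). Running two branches on separate CUDA streams keeps both sets of intermediate activations alive at once, which would explain the extra memory pressure the test compensates for with the smaller max_batch_size and free_mem_ratio:

import torch

# Hypothetical names: shared_expert / routed_experts stand in for the two
# MoE branches being overlapped; none of this appears in the PR itself.
def overlapped_moe(x, shared_expert, routed_experts):
    side_stream = torch.cuda.Stream()
    # The side stream must see x fully produced before reading it.
    side_stream.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(side_stream):
        shared_out = shared_expert(x)
    # The routed experts run concurrently on the default stream.
    routed_out = routed_experts(x)
    # Join the streams before combining the branch outputs.
    torch.cuda.current_stream().wait_stream(side_stream)
    # Tell the caching allocator shared_out is consumed on this stream,
    # so its memory is not reused while the default stream still needs it.
    shared_out.record_stream(torch.cuda.current_stream())
    return routed_out + shared_out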