Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions tests/unittest/_torch/auto_deploy/_utils_test/_model_test_utils.py
@@ -446,6 +446,25 @@ def apply_rotary_pos_emb_ds(q, k, cos, sin, position_ids, unsqueeze_dim=1):
"vision_config": {"num_hidden_layers": 2},
},
},
"ibm-ai-platform/Bamba-9B-v2": {
"llm_models_subdir": "NO_SUBDIR", # no synced to llm_models_root at the moment
"model_kwargs": {
"torch_dtype": "bfloat16",
"hidden_size": 64,
"intermediate_size": 128,
"mamba_chunk_size": 64,
"mamba_d_conv": 2,
"mamba_d_head": 16,
"mamba_d_state": 64,
"mamba_expand": 1,
"mamba_n_groups": 1,
"mamba_n_heads": 4,
"model_type": "bamba",
"num_hidden_layers": 10,
"num_attention_heads": 4,
"num_key_value_heads": 2,
},
},
}


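For context, a minimal sketch of how this new entry is meant to be consumed, mirroring the get_small_model_config pattern used in test_bamba.py below (the exact shape of the returned dict is assumed):

from _model_test_utils import get_small_model_config

# Build small-config LLM args for the Bamba model; kwargs mirror common_kwargs in test_bamba.py.
llm_args = get_small_model_config(
    "ibm-ai-platform/Bamba-9B-v2",
    world_size=0,
    runtime="demollm",
    compile_backend="torch-simple",
    attn_backend="flashinfer",
    model_factory="AutoModelForCausalLM",
    max_seq_len=512,
)["args"]
llm_args["model_kwargs"]["use_cache"] = True  # assumed: "model_kwargs" is part of the returned args
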
108 changes: 38 additions & 70 deletions tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_bamba.py
@@ -2,10 +2,10 @@
import torch.export as te
from torch.export import Dim # noqa

# import pytest
from tensorrt_llm._torch.auto_deploy.export import apply_export_patches, torch_export_to_gm
from tensorrt_llm._torch.auto_deploy.llm_args import AutoDeployConfig
from tensorrt_llm._torch.auto_deploy.transformations._graph import move_to_device # noqa
from _model_test_utils import get_small_model_config

MODEL_DIR = "ibm-ai-platform/Bamba-9B-v2"
# NOTE: find example inputs with the same tokenization length to avoid seq concat.
@@ -14,43 +14,36 @@
EXAMPLE_INPUT2 = "Tiger is a cat with the following properties:"


# @pytest.mark.parametrize(
# "model_on_meta_during_export",
# [
# True,
# False,
# ],
# )
# @pytest.mark.parametrize(
# "export_func",
# [
# "torch_export_to_gm",
# "torch_export",
# ],
# )
def test_bamba_patches(
model_on_meta_during_export: bool = True,
export_func: str = "torch_export_to_gm",
use_cache: bool = True,
):
llm_args = AutoDeployConfig(
**{
def test_bamba_patches():
model_on_meta_during_export = True
export_func: str = "torch_export_to_gm"
use_cache: bool = True

# NOTE: set to False to test the full model locally
use_small_config: bool = True

common_kwargs = {
"world_size": 0,
"runtime": "demollm",
"compile_backend": "torch-simple",
"attn_backend": "flashinfer",
"model_factory": "AutoModelForCausalLM",
"max_seq_len": 512,
}

if use_small_config:
llm_args = get_small_model_config(MODEL_DIR, **common_kwargs)["args"]
llm_args["model_kwargs"]["use_cache"] = use_cache
else:
llm_args = {
"model": MODEL_DIR,
"world_size": 0,
"runtime": "demollm",
"compile_backend": "torch-simple",
"attn_backend": "flashinfer",
"model_factory": "AutoModelForCausalLM",
**common_kwargs,
"model_kwargs": {
# "use_cache": True,
"use_cache": use_cache,
"torch_dtype": "bfloat16",
# "num_hidden_layers": 10,
},
"max_seq_len": 512,
"skip_loading_weights": False,
},
)
}
llm_args = AutoDeployConfig(**llm_args)

torch.manual_seed(0)
if torch.cuda.is_available():
@@ -85,8 +78,7 @@ def _run_torch_export_to_gm():
dynamic_shapes=dynamic_shapes,
patch_list=[
"bamba",
# For "unsupported scalarType".
"autocast_noop",
"autocast_noop", # For "unsupported scalarType".
],
)

@@ -112,18 +104,14 @@ def _run_export():
gm = _run_export()
factory.load_or_random_init(gm, device="cuda")
move_to_device(gm, "cuda")

factory.load_or_random_init(model, device="cuda")

_verify_generation(factory, model, tokenizer)
# return

print("====== EXPORTING GRAPH MODULE ======")
if not model_on_meta_during_export:
factory._to_maybe_random(model, "cuda")
model.load_state_dict(gm.state_dict())
else:
factory.load_or_random_init(model, device="cuda")
gm = _run_export()
move_to_device(gm, "cuda")

gm.model.A_log = model.model.A_log
_verify_generation(factory, model, tokenizer)

# let's do a comparison of every state dict item between the model and the gm
torch.testing.assert_close(model.state_dict(), gm.state_dict(), rtol=0.0, atol=0.0)
@@ -142,20 +130,12 @@ def _run_export():
atol, rtol = 1e-3, 1e-3
for comp, outs in outputs_for_comparison.items():
print(f"====== COMPARISON ({comp}) ======")
try:
torch.testing.assert_close(
outs,
out_original,
rtol=rtol,
atol=atol,
)
print("Passed!")
except AssertionError as e:
print(e)
diff = torch.abs(outs.logits - out_original.logits)
print(f"abs diff: {diff}")
print(f"average diff: {diff.mean()}")
print(f"{comp=}")
torch.testing.assert_close(
outs,
out_original,
rtol=rtol,
atol=atol,
)


def _verify_generation(factory, model, tokenizer):
@@ -181,17 +161,5 @@ def _generate(tokenizer, model):
print("\n".join(tokenizer.batch_decode(response, skip_special_tokens=True)))


# def _get_example_inputs(llm_args, factory, device):
# batch_size = min(2, llm_args.max_batch_size)
# seq_len = min(4, llm_args.max_seq_len)
# inputs = {"input_ids": torch.ones(batch_size, seq_len, dtype=torch.int)}
# for key, value in inputs.items():
# if isinstance(value, torch.Tensor):
# dtype = torch.bfloat16 if isinstance(value, torch.FloatTensor) else None
# inputs[key] = value.to(device=device, dtype=dtype)
#
# return inputs


if __name__ == "__main__":
test_bamba_patches()