Commit ec31363

[None][fix] Layer wise benchmarks: use local models, lint (#8799)
Signed-off-by: Tailing Yuan <[email protected]>
1 parent 9112cff

File tree: 5 files changed, 36 insertions(+), 25 deletions(-)

examples/layer_wise_benchmarks/README.md (1 addition, 1 deletion)

@@ -42,7 +42,7 @@ NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --batch-size 32 --seq-len-q
 NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --layer-indices 5,6,7,8
 NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --layer-indices 5,6,7,8
 
-# Scale DEP=16 MNNVL to 4 GPUs: reduce the number of experts, uses MNNVL A2A if applicable
+# Scale DEP=16 to 4 GPUs: reduce the number of experts, uses MNNVL A2A if applicable
 NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --scaled-from 16 --moe-backend WIDEEP
 
 # Scale TEP=16 to 4 GPUs: reduce the number of attention heads and experts

examples/layer_wise_benchmarks/slurm_init_containers.sh (1 addition, 0 deletions)

@@ -22,6 +22,7 @@ if [ "${CONTAINER_IMAGE:-}" == "" ]; then
     DOCKER_IMAGE=$LLM_SBSA_DOCKER_IMAGE
 else
     echo "Unsupported machine hardware name \"$MACHINE\""
+    exit 1
 fi
 
 # Change "urm.nvidia.com/sw-tensorrt-docker/..." to "urm.nvidia.com#sw-tensorrt-docker/..." to bypass credentials

tensorrt_llm/tools/layer_wise_benchmarks/deepseekv3_runner.py (5 additions, 5 deletions)

@@ -48,7 +48,7 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.world_size = mpi_world_size()
         self.rank = mpi_rank()
-        self.balance_method = None
+        self.balance_method = BalanceMethod.NotModified
         self.balance_ratio = None
 
     def apply(self, router_logits) -> (torch.Tensor, torch.Tensor):

@@ -276,7 +276,7 @@ def create_run_pack(self,
                         batch_size: int,
                         seq_len_q: int,
                         seq_len_kv_cache: int,
-                        kv_cache_manager: Optional[KVCacheManager] = None,
+                        kv_cache_manager: KVCacheManager,
                         attn_workspace: Optional[torch.Tensor] = None):
         if self.model_config.moe_backend == "TRTLLM" and os.getenv(
                 "TRTLLM_ENABLE_PDL") != "1":

@@ -292,11 +292,11 @@ def create_run_pack(self,
             max_num_requests=kv_cache_manager.max_batch_size,
             num_contexts={
                 "CTX": batch_size,
-                "GEN": 0
+                "GEN": 0,
             }[run_type],
             prompt_lens=[{
                 "CTX": seq_len_q,
-                "GEN": seq_len_kv_cache
+                "GEN": seq_len_kv_cache,
             }[run_type]] * batch_size,
             max_num_tokens=batch_size * seq_len_q,
             kv_cache_manager=kv_cache_manager,

@@ -380,7 +380,7 @@ def create_kv_cache_manager(pretrained_model_name_or_path, mapping,
         mapping=mapping,
         dtype=torch_dtype_to_binding({
             None: torch.bfloat16,
-            "FP8": torch.float8_e4m3fn
+            "FP8": torch.float8_e4m3fn,
         }[model_config.quant_config.kv_cache_quant_algo]),
         sparse_attn_config=model_config.sparse_attention_config,
     )
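
The first hunk replaces a `None` default with the explicit `BalanceMethod.NotModified` sentinel. A minimal sketch of the pattern, assuming `BalanceMethod` is a plain `Enum`; only the `NotModified` member appears in this diff, the other names below are hypothetical placeholders:

    from enum import Enum, auto


    class BalanceMethod(Enum):
        # Only NotModified is confirmed by the diff; ImbalancedRanks is a
        # hypothetical member added for illustration.
        NotModified = auto()
        ImbalancedRanks = auto()


    class RouterPatchSketch:
        def __init__(self):
            # An explicit enum default replaces None: every comparison stays
            # type-consistent and the "do nothing" case is self-documenting.
            self.balance_method = BalanceMethod.NotModified

        def apply(self, router_logits):
            if self.balance_method is BalanceMethod.NotModified:
                return router_logits  # pass logits through untouched
            raise NotImplementedError(f"unhandled {self.balance_method}")

With a sentinel member, dispatch on `self.balance_method` never has to special-case `None`, and linters can verify the comparison is exhaustive over the enum.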

tests/integration/defs/conftest.py (14 additions, 0 deletions)

@@ -626,11 +626,25 @@ def deepseek_v3_model_root(request):
     elif request.param == "DeepSeek-V3-Lite-nvfp4_moe_only":
         deepseek_v3_model_root = os.path.join(models_root, "DeepSeek-V3-Lite",
                                               "nvfp4_moe_only")
+    elif request.param == "DeepSeek-V3.2-Exp":
+        deepseek_v3_model_root = os.path.join(models_root,
+                                              "DeepSeek-V3.2-Exp-hf")
     assert exists(
         deepseek_v3_model_root), f"{deepseek_v3_model_root} does not exist!"
     return deepseek_v3_model_root
 
 
+@pytest.fixture(scope="function")
+def deepseek_r1_model_root(request):
+    models_root = llm_models_root()
+    if request.param == "DeepSeek-R1-0528-FP4-v2":
+        deepseek_r1_model_root = os.path.join(models_root, "DeepSeek-R1",
+                                              "DeepSeek-R1-0528-FP4-v2")
+    assert exists(
+        deepseek_r1_model_root), f"{deepseek_r1_model_root} does not exist!"
+    return deepseek_r1_model_root
+
+
 @pytest.fixture(scope="session")
 def trt_performance_cache_name():
     return "performance.cache"

tests/unittest/tools/test_layer_wise_benchmarks.py (15 additions, 19 deletions)

@@ -1,32 +1,23 @@
-# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 import os
 
 import pytest
 import torch
+from defs.conftest import deepseek_r1_model_root  # noqa: F401
+from defs.conftest import deepseek_v3_model_root  # noqa: F401
 from defs.trt_test_alternative import check_call
 from utils.cpp_paths import llm_root  # noqa: F401
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 4,
                     reason="needs 4 GPUs to run this test")
-def test_deepseek_r1_ctx_tep(llm_root):
+@pytest.mark.parametrize("deepseek_r1_model_root", ["DeepSeek-R1-0528-FP4-v2"],
+                         indirect=True)
+def test_deepseek_r1_ctx_tep(llm_root, deepseek_r1_model_root):
     check_call([
         "./mpi_launch.sh",
         "./run_single.sh",
         "config_ctx.yaml",
+        "--model=" + deepseek_r1_model_root,
         "--no-enable-attention-dp",
         "--moe-backend=TRTLLM",
     ],

@@ -40,12 +31,14 @@ def test_deepseek_r1_ctx_tep(llm_root):
 
 @pytest.mark.skipif(torch.cuda.device_count() < 4,
                     reason="needs 4 GPUs to run this test")
-def test_deepseek_v32_ctx_dep(llm_root):
+@pytest.mark.parametrize("deepseek_v3_model_root", ["DeepSeek-V3.2-Exp"],
+                         indirect=True)
+def test_deepseek_v32_ctx_dep(llm_root, deepseek_v3_model_root):
     check_call([
         "./mpi_launch.sh",
         "./run_single.sh",
         "config_ctx.yaml",
-        "--model=deepseek-ai/DeepSeek-V3.2-Exp",
+        "--model=" + deepseek_v3_model_root,
         "--tokens-per-block=64",
         "--moe-backend=DEEPGEMM",
     ],

@@ -58,14 +51,17 @@ def test_deepseek_v32_ctx_dep(llm_root):
 
 @pytest.mark.skipif(torch.cuda.device_count() < 4,
                     reason="needs 4 GPUs to run this test")
-def test_deepseek_r1_gen_scaled_from_16_dep(llm_root):
+@pytest.mark.parametrize("deepseek_r1_model_root", ["DeepSeek-R1-0528-FP4-v2"],
+                         indirect=True)
+def test_deepseek_r1_gen_scaled_from_16_dep(llm_root, deepseek_r1_model_root):
     check_call([
         "./mpi_launch.sh",
         "./run_single.sh",
         "config_gen.yaml",
+        "--model=" + deepseek_r1_model_root,
+        "--layer-indices=5,6",
         "--scaled-from=16",
         "--moe-backend=WIDEEP",
-        "--layer-indices=5,6",
     ],
         cwd=llm_root / "examples" / "layer_wise_benchmarks",
         env={
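
Taken together, these tests now resolve model weights from the local models root instead of pulling `deepseek-ai/...` checkpoints from the Hugging Face Hub. A rough standalone equivalent of what one parametrized test ends up executing; the `LLM_MODELS_ROOT` variable and the fallback path are assumptions for illustration, not confirmed by this diff:

    import os
    import subprocess

    # Assumed convention: the models root comes from an environment variable;
    # the fallback directory here is purely illustrative.
    models_root = os.environ.get("LLM_MODELS_ROOT", "/scratch/llm-models")
    model_root = os.path.join(models_root, "DeepSeek-R1",
                              "DeepSeek-R1-0528-FP4-v2")

    subprocess.check_call(
        [
            "./mpi_launch.sh",
            "./run_single.sh",
            "config_ctx.yaml",
            "--model=" + model_root,  # local path instead of a HF repo id
            "--no-enable-attention-dp",
            "--moe-backend=TRTLLM",
        ],
        cwd="examples/layer_wise_benchmarks",  # scripts are invoked from here
    )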
