From 2fb395e071bf155897e164f97c2f8db5666f1801 Mon Sep 17 00:00:00 2001 From: ajrasane <131806219+ajrasane@users.noreply.github.com> Date: Fri, 22 Aug 2025 22:21:12 +0000 Subject: [PATCH 1/3] Enable auto deploy tests Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com> --- tests/integration/test_lists/test-db/l0_b200.yml | 2 ++ tests/integration/test_lists/test-db/l0_dgx_b200.yml | 2 ++ tests/integration/test_lists/test-db/l0_dgx_h100.yml | 2 ++ tests/integration/test_lists/test-db/l0_dgx_h200.yml | 2 ++ tests/integration/test_lists/test-db/l0_gb200.yml | 2 ++ tests/integration/test_lists/test-db/l0_h100.yml | 2 ++ 6 files changed, 12 insertions(+) diff --git a/tests/integration/test_lists/test-db/l0_b200.yml b/tests/integration/test_lists/test-db/l0_b200.yml index 66cf676f2f7..5e188165a5b 100644 --- a/tests/integration/test_lists/test-db/l0_b200.yml +++ b/tests/integration/test_lists/test-db/l0_b200.yml @@ -69,6 +69,8 @@ l0_b200: - unittest/_torch/modeling -k "modeling_deepseek" - unittest/_torch/modeling -k "modeling_gpt_oss" - unittest/_torch/auto_deploy/unit/singlegpu -k "not test_trtllm_bench_backend_comparison" + # ------------- AutoDeploy tests --------------- + - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype - condition: ranges: system_gpu_count: diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml index fb3f518a686..41645146cfa 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml @@ -89,3 +89,5 @@ l0_dgx_b200: - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-fp8] - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend + # ------------- AutoDeploy tests --------------- + - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml index 36fcdce5328..07a05e9b0db 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml @@ -61,6 +61,8 @@ l0_dgx_h100: - test_e2e.py::test_ptp_quickstart_advanced_bs1 - test_e2e.py::test_ptp_quickstart_advanced_deepseek_v3_lite_4gpus_adp_balance[DeepSeek-V3-Lite-FP8-DeepSeek-V3-Lite/fp8] - unittest/_torch/modeling/test_modeling_pixtral.py::test_tensor_parallelism + # ------------- AutoDeploy tests --------------- + - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype - condition: ranges: system_gpu_count: diff --git a/tests/integration/test_lists/test-db/l0_dgx_h200.yml b/tests/integration/test_lists/test-db/l0_dgx_h200.yml index 42667225456..82af5f19c0b 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_h200.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_h200.yml @@ -34,6 +34,8 @@ l0_dgx_h200: - unittest/_torch/multi_gpu_modeling/test_llama4.py::test_llama4[pp1-ep1-disable_adp-enable_graph-tp8-trtllm-scout] - unittest/_torch/multi_gpu_modeling/test_llama4.py::test_llama4[pp1-ep4-enable_adp-enable_graph-tp8-trtllm-scout] - unittest/llmapi/test_llm_pytorch.py::test_nemotron_nas_lora + # ------------- AutoDeploy tests --------------- + - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype - condition: ranges: system_gpu_count: diff --git a/tests/integration/test_lists/test-db/l0_gb200.yml b/tests/integration/test_lists/test-db/l0_gb200.yml index 7d1cc92fef5..64ae3acfecf 100644 --- a/tests/integration/test_lists/test-db/l0_gb200.yml +++ b/tests/integration/test_lists/test-db/l0_gb200.yml @@ -36,6 +36,8 @@ l0_gb200: - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=0-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=0-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] + # ------------- AutoDeploy tests --------------- + - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype - condition: ranges: system_gpu_count: diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml index 0263c452b3f..50575e730bb 100644 --- a/tests/integration/test_lists/test-db/l0_h100.yml +++ b/tests/integration/test_lists/test-db/l0_h100.yml @@ -102,6 +102,8 @@ l0_h100: - test_e2e.py::test_trtllm_bench_request_rate_and_concurrency[enable_concurrency-enable_request_rate] # negative test - test_e2e.py::test_trtllm_bench_help_sanity[meta-llama/Llama-3.1-8B] - test_e2e.py::test_ptp_quickstart_multimodal[gemma-3-27b-it-gemma/gemma-3-27b-it-image-True] + # ------------- AutoDeploy tests --------------- + - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype - condition: ranges: system_gpu_count: From 161ac6b2227df89d3637558eb11e92de7fc74f4e Mon Sep 17 00:00:00 2001 From: Suyog Gupta <41447211+suyoggupta@users.noreply.github.com> Date: Fri, 22 Aug 2025 23:18:01 -0700 Subject: [PATCH 2/3] do not sort idx when updating input_ids with new tokens Signed-off-by: Suyog Gupta <41447211+suyoggupta@users.noreply.github.com> --- .../_torch/auto_deploy/custom_ops/attention_interface.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/attention_interface.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention_interface.py index d486d93b83b..b6fc7b76937 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/attention_interface.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention_interface.py @@ -476,9 +476,6 @@ def update_input_ids_with_new_tokens( idx = self.previous_batch_indices_cuda[: len(previous_batch_indices)] idx.copy_(host_idx, non_blocking=True) - # sort them so that masked_scatter_ lines up correctly - idx, _ = idx.sort() - # gather the exact values you want to write src = new_tokens[0, idx, 0] From 70ce26017d972493284bf8100216d46719db09dd Mon Sep 17 00:00:00 2001 From: Suyog Gupta <41447211+suyoggupta@users.noreply.github.com> Date: Sun, 24 Aug 2025 08:02:58 -0700 Subject: [PATCH 3/3] revert changes to l0_gb200.yml Signed-off-by: Suyog Gupta <41447211+suyoggupta@users.noreply.github.com> --- tests/integration/test_lists/test-db/l0_gb200.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/test_lists/test-db/l0_gb200.yml b/tests/integration/test_lists/test-db/l0_gb200.yml index 2b471975286..d500da6b659 100644 --- a/tests/integration/test_lists/test-db/l0_gb200.yml +++ b/tests/integration/test_lists/test-db/l0_gb200.yml @@ -49,7 +49,6 @@ l0_gb200: - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-trtllm] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-triton] - - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype - disaggregated/test_workers.py::test_workers_kv_cache_aware_router_eviction[TinyLlama-1.1B-Chat-v1.0] # nvbugs 5300551 - test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta-Llama-3.1-8B] - test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8]