From 8092ae42bf1ee9deba5f6071293e7737eb2fc867 Mon Sep 17 00:00:00 2001 From: Nathaniel Levin Date: Wed, 4 Feb 2026 04:03:26 +0000 Subject: [PATCH 1/3] Add 1k1k STP and MTP disagg H100 configs --- .../ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml | 105 +++++++++++++++ .../ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml | 109 +++++++++++++++ .../ctx1_gen2_dep16_batch16_eplb0_mtp2.yaml | 103 ++++++++++++++ .../ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml | 101 ++++++++++++++ ...> ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml} | 81 +++++------ .../ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml | 98 ++++++++++++++ .../ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml | 98 ++++++++++++++ .../ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml | 102 ++++++++++++++ .../ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml | 99 ++++++++++++++ .../ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml | 97 +++++++++++++ .../ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml | 99 ++++++++++++++ .../ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml | 95 +++++++++++++ .../ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml | 96 +++++++++++++ .../ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml | 94 +++++++++++++ .../ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml | 92 +++++++++++++ .../ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml | 92 +++++++++++++ .../ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml | 73 ++++------ .../ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml | 127 ++++++++++++++++++ 18 files changed, 1665 insertions(+), 96 deletions(-) create mode 100644 recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml create mode 100644 recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml create mode 100644 recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp2.yaml create mode 100644 recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml rename recipes/trtllm/h100-fp8/1k1k/mtp/{ctx1_gen2_dep16_batch16_eplb0_mtp3.yaml => ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml} (65%) create mode 100644 recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml create mode 100644 recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml create mode 100644 recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml create mode 100644 recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml create mode 100644 recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml create mode 100644 recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml create mode 100644 recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml create mode 100644 recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml create mode 100644 recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml create mode 100644 recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml create mode 100644 recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml create mode 100644 recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml new file mode 100644 index 00000000..aa34802b --- /dev/null +++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml @@ -0,0 +1,105 @@ +name: h100_1k1k_ctx1dep16_gen1dep16_batch32_eplb0_mtp2 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 1 + decode_nodes: 2 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + decode_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '615' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml new file mode 100644 index 00000000..12a1004e --- /dev/null +++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml @@ -0,0 +1,109 @@ +name: h100_1k1k_ctx1dep16_gen1dep16_batch64_eplb0_mtp1 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 1 + decode_nodes: 2 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + decode_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '1229' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp2.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp2.yaml new file mode 100644 index 00000000..3c729e60 --- /dev/null +++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp2.yaml @@ -0,0 +1,103 @@ +name: h100_1k1k_ctx1dep16_gen2dep16_batch16_eplb0_mtp2 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 2 + decode_nodes: 4 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + decode_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '616' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml new file mode 100644 index 00000000..51ff2cfa --- /dev/null +++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml @@ -0,0 +1,101 @@ +name: h100_1k1k_ctx1dep16_gen3dep16_batch4_eplb0_mtp3 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + decode_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '231' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml similarity index 65% rename from recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp3.yaml rename to recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml index b0ef1feb..af783663 100644 --- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp3.yaml +++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml @@ -1,43 +1,34 @@ -name: ctx1_gen2_dep16_batch16_eplb0_mtp3 - +name: h100_1k1k_ctx1dep16_gen3tep16_batch16_eplb0_mtp3 model: - path: "dsr1-fp8" + path: DeepSeek-R1-0528 container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" - precision: "fp8" - + precision: fp8 resources: - gpu_type: "h100" - prefill_nodes: 2 + gpu_type: h100 prefill_workers: 1 - - decode_workers: 2 - decode_nodes: 4 - + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 gpus_per_node: 8 - backend: type: trtllm - prefill_environment: - TLLM_LOG_LEVEL: "INFO" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TRTLLM_ENABLE_PDL: "1" - UCX_RNDV_SCHEME: "put_zcopy" - UCX_MAX_RNDV_RAILS: "1" - UCX_MAX_RMA_RAILS: "1" - + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP decode_environment: - TLLM_LOG_LEVEL: "INFO" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TRTLLM_ENABLE_PDL: "1" - UCX_RNDV_SCHEME: "put_zcopy" - UCX_MAX_RNDV_RAILS: "1" - UCX_MAX_RMA_RAILS: "1" - + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n trtllm_config: prefill: max_batch_size: 2 @@ -63,12 +54,11 @@ backend: speculative_config: decoding_type: MTP num_nextn_predict_layers: 3 - decode: tensor_parallel_size: 16 moe_expert_parallel_size: 16 - enable_attention_dp: true - enable_lm_head_tp_in_adp: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false pipeline_parallel_size: 1 max_batch_size: 16 max_num_tokens: 256 @@ -87,7 +77,7 @@ backend: free_gpu_memory_fraction: 0.9 dtype: fp8 moe_config: - backend: WIDEEP + backend: CUTLASS use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 8192 @@ -97,23 +87,14 @@ backend: speculative_config: decoding_type: MTP num_nextn_predict_layers: 3 - - benchmark: - type: "sa-bench" + type: sa-bench isl: 1024 osl: 1024 - concurrencies: ['616'] - req_rate: "inf" - + concurrencies: '60' + req_rate: inf frontend: - nginx_container: "nginx-sqsh" - type: "dynamo" - - -health_check: - max_attempts: 360 - interval_seconds: 10 - + type: dynamo + enable_multiple_frontends: false dynamo: - install: false \ No newline at end of file + install: false diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml new file mode 100644 index 00000000..c367a730 --- /dev/null +++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml @@ -0,0 +1,98 @@ +name: h100_1k1k_ctx1dep16_gen3tep16_batch1_eplb0_mtp3 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '6' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml new file mode 100644 index 00000000..1a7b8833 --- /dev/null +++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml @@ -0,0 +1,98 @@ +name: h100_1k1k_ctx1dep16_gen3tep16_batch2_eplb0_mtp3 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 2 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '9' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml new file mode 100644 index 00000000..4bf6a5f2 --- /dev/null +++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml @@ -0,0 +1,102 @@ +name: h100_1k1k_ctx1dep16_gen3tep16_batch32_eplb0_mtp3 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '117' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml new file mode 100644 index 00000000..70600e72 --- /dev/null +++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml @@ -0,0 +1,99 @@ +name: h100_1k1k_ctx1dep16_gen3tep16_batch8_eplb0_mtp3 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '30' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml new file mode 100644 index 00000000..2f2a57fd --- /dev/null +++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml @@ -0,0 +1,97 @@ +name: ctx1dep16_gen3dep16_batch16_eplb0_mtp0 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + decode_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '924' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml new file mode 100644 index 00000000..774db1e8 --- /dev/null +++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml @@ -0,0 +1,99 @@ +name: ctx1dep16_gen3dep16_batch32_eplb0_mtp0 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + decode_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '1845' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml new file mode 100644 index 00000000..fd63b7a1 --- /dev/null +++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml @@ -0,0 +1,95 @@ +name: ctx1dep16_gen3dep16_batch4_eplb0_mtp0 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + decode_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '231' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml new file mode 100644 index 00000000..bcf511b9 --- /dev/null +++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml @@ -0,0 +1,96 @@ +name: ctx1dep16_gen3dep16_batch8_eplb0_mtp0 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + decode_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '462' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml new file mode 100644 index 00000000..b7f98f34 --- /dev/null +++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml @@ -0,0 +1,94 @@ +name: ctx1dep16_gen3tep16_batch16_eplb0_mtp0 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '60' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml new file mode 100644 index 00000000..a0510f6e --- /dev/null +++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml @@ -0,0 +1,92 @@ +name: ctx1dep16_gen3tep16_batch1_eplb0_mtp0 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8192 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '6' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml new file mode 100644 index 00000000..b46e49a8 --- /dev/null +++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml @@ -0,0 +1,92 @@ +name: ctx1dep16_gen3tep16_batch2_eplb0_mtp0 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8192 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 2 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '9' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml index 5b85a6ff..d83994ab 100644 --- a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml +++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml @@ -1,43 +1,34 @@ -name: ctx1_gen3_tep16_batch8_eplb0_mtp0 - +name: ctx1dep16_gen3tep16_batch8_eplb0_mtp0 model: - path: "dsr1-fp8" + path: DeepSeek-R1-0528 container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" - precision: "fp8" - + precision: fp8 resources: - gpu_type: "h100" - prefill_nodes: 2 + gpu_type: h100 prefill_workers: 1 - - decode_workers: 2 + prefill_nodes: 2 + decode_workers: 3 decode_nodes: 6 - gpus_per_node: 8 - backend: type: trtllm - prefill_environment: - TLLM_LOG_LEVEL: "INFO" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TRTLLM_ENABLE_PDL: "1" - UCX_RNDV_SCHEME: "put_zcopy" - UCX_MAX_RNDV_RAILS: "1" - UCX_MAX_RMA_RAILS: "1" - + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP decode_environment: - TLLM_LOG_LEVEL: "INFO" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TRTLLM_ENABLE_PDL: "1" - UCX_RNDV_SCHEME: "put_zcopy" - UCX_MAX_RNDV_RAILS: "1" - UCX_MAX_RMA_RAILS: "1" - + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n trtllm_config: prefill: max_batch_size: 2 @@ -60,7 +51,6 @@ backend: cache_transceiver_config: max_tokens_in_buffer: 8192 backend: UCX - decode: tensor_parallel_size: 16 moe_expert_parallel_size: 16 @@ -90,23 +80,14 @@ backend: backend: UCX stream_interval: 100 num_postprocess_workers: 4 - - benchmark: - type: "sa-bench" + type: sa-bench isl: 1024 osl: 1024 - concurrencies: ['30'] - req_rate: "inf" - + concurrencies: '30' + req_rate: inf frontend: - nginx_container: "nginx-sqsh" - type: "dynamo" - - -health_check: - max_attempts: 360 - interval_seconds: 10 - + type: dynamo + enable_multiple_frontends: false dynamo: - install: false \ No newline at end of file + install: false diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml new file mode 100644 index 00000000..134aa346 --- /dev/null +++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml @@ -0,0 +1,127 @@ +name: ctx2dep16_gen1dep16_batch256_eplb0_mtp0 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 2 + prefill_nodes: 4 + decode_workers: 1 + decode_nodes: 2 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + decode_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 256 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '4916' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false From 1fd335e194fbe2b5b67882df8383c01e78a476ac Mon Sep 17 00:00:00 2001 From: Nathaniel Levin Date: Thu, 5 Feb 2026 05:26:21 +0000 Subject: [PATCH 2/3] Update H100 FP8 configs with verified 29 Pareto-optimal points Replace previous configs with verified Pareto-optimal configurations: - 1k1k MTP: 9 configs (conc: 6, 9, 30, 60, 117, 231, 462, 615, 1229) - 1k1k STP: 9 configs (conc: 6, 9, 30, 60, 231, 462, 924, 1845, 4916) - 8k1k MTP: 6 configs (conc: 6, 9, 30, 77, 78, 154) - 8k1k STP: 5 configs (conc: 6, 9, 30, 154, 308) Standardize container to nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1 --- .../ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml | 4 +- .../ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml | 4 +- .../ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml | 4 +- .../ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml | 114 ++++++++++++++++++ .../ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml | 4 +- .../ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml | 4 +- .../ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml | 4 +- .../ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml | 4 +- .../ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml | 4 +- .../ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml | 2 +- .../ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml | 2 +- .../ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml | 2 +- .../ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml | 2 +- .../ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml} | 42 +++---- .../ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml | 103 ++++++++++++++++ .../ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml | 99 +++++++++++++++ .../ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml | 99 +++++++++++++++ .../ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml | 100 +++++++++++++++ .../ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml | 102 ++++++++++++++++ .../ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml} | 75 +++++------- .../ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml | 94 +++++++++++++++ .../ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml | 104 ++++++++++++++++ .../ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml | 104 ++++++++++++++++ .../ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml | 97 +++++++++++++++ 24 files changed, 1087 insertions(+), 86 deletions(-) create mode 100644 recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml rename recipes/trtllm/h100-fp8/{1k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp2.yaml => 8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml} (79%) create mode 100644 recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml create mode 100644 recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml create mode 100644 recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml create mode 100644 recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml create mode 100644 recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml rename recipes/trtllm/h100-fp8/8k1k/{mtp/ctx1_gen2_dep16_batch16_eplb0_mtp3.yaml => stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml} (67%) create mode 100644 recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml create mode 100644 recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml create mode 100644 recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml create mode 100644 recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml index aa34802b..4231b4b5 100644 --- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml +++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml @@ -1,4 +1,4 @@ -name: h100_1k1k_ctx1dep16_gen1dep16_batch32_eplb0_mtp2 +name: h100_1k1k_ctx1dep16_gen1dep16_batch32_eplb0_mtp2_chunked_false model: path: DeepSeek-R1-0528 container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" @@ -44,7 +44,7 @@ backend: print_iter_log: true cuda_graph_config: null disable_overlap_scheduler: true - enable_chunked_prefill: true + enable_chunked_prefill: false moe_config: backend: WIDEEP kv_cache_config: diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml index 12a1004e..33fd5257 100644 --- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml +++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml @@ -1,4 +1,4 @@ -name: h100_1k1k_ctx1dep16_gen1dep16_batch64_eplb0_mtp1 +name: h100_1k1k_ctx1dep16_gen1dep16_batch64_eplb0_mtp1_chunked_false model: path: DeepSeek-R1-0528 container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" @@ -44,7 +44,7 @@ backend: print_iter_log: true cuda_graph_config: null disable_overlap_scheduler: true - enable_chunked_prefill: true + enable_chunked_prefill: false moe_config: backend: WIDEEP kv_cache_config: diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml index 51ff2cfa..7a255e7a 100644 --- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml +++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml @@ -1,4 +1,4 @@ -name: h100_1k1k_ctx1dep16_gen3dep16_batch4_eplb0_mtp3 +name: h100_1k1k_ctx1dep16_gen3dep16_batch4_eplb0_mtp3_chunked_false model: path: DeepSeek-R1-0528 container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" @@ -44,7 +44,7 @@ backend: print_iter_log: true cuda_graph_config: null disable_overlap_scheduler: true - enable_chunked_prefill: true + enable_chunked_prefill: false moe_config: backend: WIDEEP kv_cache_config: diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml new file mode 100644 index 00000000..39433c99 --- /dev/null +++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml @@ -0,0 +1,114 @@ +name: h100_1k1k_ctx1dep16_gen3tep16_batch128_eplb0_mtp3_chunked_false +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 128 + max_num_tokens: 512 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '462' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml index af783663..d400cc19 100644 --- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml +++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml @@ -1,4 +1,4 @@ -name: h100_1k1k_ctx1dep16_gen3tep16_batch16_eplb0_mtp3 +name: h100_1k1k_ctx1dep16_gen3tep16_batch16_eplb0_mtp3_chunked_false model: path: DeepSeek-R1-0528 container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" @@ -41,7 +41,7 @@ backend: print_iter_log: true cuda_graph_config: null disable_overlap_scheduler: true - enable_chunked_prefill: true + enable_chunked_prefill: false moe_config: backend: WIDEEP kv_cache_config: diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml index c367a730..b7f2a69e 100644 --- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml +++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml @@ -1,4 +1,4 @@ -name: h100_1k1k_ctx1dep16_gen3tep16_batch1_eplb0_mtp3 +name: h100_1k1k_ctx1dep16_gen3tep16_batch1_eplb0_mtp3_chunked_false model: path: DeepSeek-R1-0528 container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" @@ -41,7 +41,7 @@ backend: print_iter_log: true cuda_graph_config: null disable_overlap_scheduler: true - enable_chunked_prefill: true + enable_chunked_prefill: false moe_config: backend: WIDEEP kv_cache_config: diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml index 1a7b8833..e0d24147 100644 --- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml +++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml @@ -1,4 +1,4 @@ -name: h100_1k1k_ctx1dep16_gen3tep16_batch2_eplb0_mtp3 +name: h100_1k1k_ctx1dep16_gen3tep16_batch2_eplb0_mtp3_chunked_false model: path: DeepSeek-R1-0528 container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" @@ -41,7 +41,7 @@ backend: print_iter_log: true cuda_graph_config: null disable_overlap_scheduler: true - enable_chunked_prefill: true + enable_chunked_prefill: false moe_config: backend: WIDEEP kv_cache_config: diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml index 4bf6a5f2..22084138 100644 --- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml +++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml @@ -1,4 +1,4 @@ -name: h100_1k1k_ctx1dep16_gen3tep16_batch32_eplb0_mtp3 +name: h100_1k1k_ctx1dep16_gen3tep16_batch32_eplb0_mtp3_chunked_false model: path: DeepSeek-R1-0528 container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" @@ -41,7 +41,7 @@ backend: print_iter_log: true cuda_graph_config: null disable_overlap_scheduler: true - enable_chunked_prefill: true + enable_chunked_prefill: false moe_config: backend: WIDEEP kv_cache_config: diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml index 70600e72..488329d7 100644 --- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml +++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml @@ -1,4 +1,4 @@ -name: h100_1k1k_ctx1dep16_gen3tep16_batch8_eplb0_mtp3 +name: h100_1k1k_ctx1dep16_gen3tep16_batch8_eplb0_mtp3_chunked_false model: path: DeepSeek-R1-0528 container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" @@ -41,7 +41,7 @@ backend: print_iter_log: true cuda_graph_config: null disable_overlap_scheduler: true - enable_chunked_prefill: true + enable_chunked_prefill: false moe_config: backend: WIDEEP kv_cache_config: diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml index 774db1e8..7ad2d283 100644 --- a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml +++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml @@ -44,7 +44,7 @@ backend: print_iter_log: true cuda_graph_config: null disable_overlap_scheduler: true - enable_chunked_prefill: true + enable_chunked_prefill: false moe_config: backend: WIDEEP kv_cache_config: diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml index fd63b7a1..7f3fcd63 100644 --- a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml +++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml @@ -44,7 +44,7 @@ backend: print_iter_log: true cuda_graph_config: null disable_overlap_scheduler: true - enable_chunked_prefill: true + enable_chunked_prefill: false moe_config: backend: WIDEEP kv_cache_config: diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml index bcf511b9..cd740057 100644 --- a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml +++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml @@ -44,7 +44,7 @@ backend: print_iter_log: true cuda_graph_config: null disable_overlap_scheduler: true - enable_chunked_prefill: true + enable_chunked_prefill: false moe_config: backend: WIDEEP kv_cache_config: diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml index b7f98f34..4601d76b 100644 --- a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml +++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml @@ -41,7 +41,7 @@ backend: print_iter_log: true cuda_graph_config: null disable_overlap_scheduler: true - enable_chunked_prefill: true + enable_chunked_prefill: false moe_config: backend: WIDEEP kv_cache_config: diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp2.yaml b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml similarity index 79% rename from recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp2.yaml rename to recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml index 3c729e60..48c1dec3 100644 --- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp2.yaml +++ b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml @@ -1,4 +1,4 @@ -name: h100_1k1k_ctx1dep16_gen2dep16_batch16_eplb0_mtp2 +name: h100_8k1k_ctx1dep16_gen1dep16_batch4_eplb0_mtp3 model: path: DeepSeek-R1-0528 container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" @@ -7,13 +7,13 @@ resources: gpu_type: h100 prefill_workers: 1 prefill_nodes: 2 - decode_workers: 2 - decode_nodes: 4 + decode_workers: 1 + decode_nodes: 2 gpus_per_node: 8 backend: type: trtllm prefill_environment: - UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + UCX_CUDA_IPC_ENABLE_MNNVL: n TRTLLM_ENABLE_PDL: '1' TRTLLM_SERVER_DISABLE_GC: '1' TRTLLM_WORKER_DISABLE_GC: '1' @@ -21,9 +21,9 @@ backend: TLLM_LOG_LEVEL: INFO TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP - UCX_CUDA_IPC_ENABLE_MNNVL: n decode_environment: - UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + NCCL_NVLS_ENABLE: '0' + UCX_CUDA_IPC_ENABLE_MNNVL: n TRTLLM_ENABLE_PDL: '1' TRTLLM_SERVER_DISABLE_GC: '1' TRTLLM_WORKER_DISABLE_GC: '1' @@ -31,12 +31,11 @@ backend: TLLM_LOG_LEVEL: INFO TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP - UCX_CUDA_IPC_ENABLE_MNNVL: n trtllm_config: prefill: - max_batch_size: 2 - max_num_tokens: 2048 - max_seq_len: 2048 + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 tensor_parallel_size: 16 moe_expert_parallel_size: 16 enable_attention_dp: true @@ -47,33 +46,32 @@ backend: enable_chunked_prefill: true moe_config: backend: WIDEEP + max_num_tokens: 16384 kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.6 + free_gpu_memory_fraction: 0.3 dtype: fp8 cache_transceiver_config: - max_tokens_in_buffer: 8192 backend: UCX + max_tokens_in_buffer: 8256 speculative_config: decoding_type: MTP - num_nextn_predict_layers: 2 + num_nextn_predict_layers: 3 decode: tensor_parallel_size: 16 moe_expert_parallel_size: 16 enable_attention_dp: true enable_lm_head_tp_in_adp: true pipeline_parallel_size: 1 - max_batch_size: 16 - max_num_tokens: 256 - max_seq_len: 2088 + max_batch_size: 4 + max_num_tokens: 128 + max_seq_len: 9256 cuda_graph_config: enable_padding: true batch_sizes: - 1 - 2 - 4 - - 8 - - 16 print_iter_log: true kv_cache_config: enable_block_reuse: false @@ -83,18 +81,18 @@ backend: backend: WIDEEP use_low_precision_moe_combine: true cache_transceiver_config: - max_tokens_in_buffer: 8192 + max_tokens_in_buffer: 8256 backend: UCX stream_interval: 100 num_postprocess_workers: 4 speculative_config: decoding_type: MTP - num_nextn_predict_layers: 2 + num_nextn_predict_layers: 3 benchmark: type: sa-bench - isl: 1024 + isl: 8192 osl: 1024 - concurrencies: '616' + concurrencies: '77' req_rate: inf frontend: type: dynamo diff --git a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml new file mode 100644 index 00000000..d66ed765 --- /dev/null +++ b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml @@ -0,0 +1,103 @@ +name: h100_8k1k_ctx1dep16_gen2tep16_batch32_eplb0_mtp3 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 2 + decode_nodes: 4 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8256 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + concurrencies: '78' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml new file mode 100644 index 00000000..67128064 --- /dev/null +++ b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml @@ -0,0 +1,99 @@ +name: h100_8k1k_ctx1dep16_gen3tep16_batch1_eplb0_mtp3 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8256 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + concurrencies: '6' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml new file mode 100644 index 00000000..8c18bbfc --- /dev/null +++ b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml @@ -0,0 +1,99 @@ +name: h100_8k1k_ctx1dep16_gen3tep16_batch2_eplb0_mtp3 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8256 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 2 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + concurrencies: '9' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml new file mode 100644 index 00000000..b1f5f926 --- /dev/null +++ b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml @@ -0,0 +1,100 @@ +name: h100_8k1k_ctx1dep16_gen3tep16_batch8_eplb0_mtp3 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8256 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + concurrencies: '30' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml new file mode 100644 index 00000000..19419bec --- /dev/null +++ b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml @@ -0,0 +1,102 @@ +name: h100_8k1k_ctx2dep16_gen1dep16_batch8_eplb0_mtp3 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 2 + prefill_nodes: 4 + decode_workers: 1 + decode_nodes: 2 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8256 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 128 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + concurrencies: '154' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml similarity index 67% rename from recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp3.yaml rename to recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml index 507d8f72..21d6db8c 100644 --- a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp3.yaml +++ b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml @@ -1,42 +1,41 @@ -name: ctx1_gen2_dep16_batch16_eplb0_mtp3 + + +name: "h100_8k1k_ctx1dep16_gen2tep16_batch64_eplb0_mtp0" model: - path: "dsr1-fp8" + path: "DeepSeek-R1-0528" container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" precision: "fp8" resources: gpu_type: "h100" - prefill_nodes: 2 prefill_workers: 1 - - decode_nodes: 2 - decode_workers: 1 - + prefill_nodes: 2 + decode_workers: 2 + decode_nodes: 4 gpus_per_node: 8 backend: type: trtllm prefill_environment: - TLLM_LOG_LEVEL: "INFO" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + TRTLLM_ENABLE_PDL: "1" TRTLLM_SERVER_DISABLE_GC: "1" TRTLLM_WORKER_DISABLE_GC: "1" NCCL_GRAPH_MIXING_SUPPORT: "0" - TRTLLM_ENABLE_PDL: "1" - UCX_RNDV_SCHEME: "put_zcopy" - UCX_MAX_RNDV_RAILS: "1" - UCX_MAX_RMA_RAILS: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: "1" + TRTLLM_FORCE_ALLTOALL_METHOD: "DeepEP" decode_environment: - TLLM_LOG_LEVEL: "INFO" + NCCL_NVLS_ENABLE: "0" + TRTLLM_ENABLE_PDL: "1" TRTLLM_SERVER_DISABLE_GC: "1" TRTLLM_WORKER_DISABLE_GC: "1" NCCL_GRAPH_MIXING_SUPPORT: "0" - TRTLLM_ENABLE_PDL: "1" - UCX_RNDV_SCHEME: "put_zcopy" - UCX_MAX_RNDV_RAILS: "1" - UCX_MAX_RMA_RAILS: "1" + TLLM_LOG_LEVEL: "INFO" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" trtllm_config: prefill: @@ -50,68 +49,56 @@ backend: print_iter_log: true cuda_graph_config: null disable_overlap_scheduler: true + enable_chunked_prefill: false moe_config: backend: WIDEEP max_num_tokens: 16384 - kv_cache_config: + kv_cache_config: enable_block_reuse: false free_gpu_memory_fraction: 0.3 dtype: fp8 cache_transceiver_config: - max_tokens_in_buffer: 8256 backend: UCX - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 + max_tokens_in_buffer: 8256 + decode: tensor_parallel_size: 16 moe_expert_parallel_size: 16 - enable_attention_dp: true - enable_lm_head_tp_in_adp: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false pipeline_parallel_size: 1 - max_batch_size: 4 - max_num_tokens: 128 + max_batch_size: 64 + max_num_tokens: 256 max_seq_len: 9256 cuda_graph_config: enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 + batch_sizes: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64] print_iter_log: true - kv_cache_config: + kv_cache_config: enable_block_reuse: false free_gpu_memory_fraction: 0.9 dtype: fp8 - moe_config: - backend: WIDEEP + moe_config: + backend: CUTLASS use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 8256 backend: UCX stream_interval: 100 num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: ['77'] + concurrencies: "154" req_rate: "inf" frontend: - nginx_container: "nginx-sqsh" type: "dynamo" - - -health_check: - max_attempts: 360 - interval_seconds: 10 + enable_multiple_frontends: false # There are errors about colliding on port 8080, and others. dynamo: - install: false \ No newline at end of file + install: false diff --git a/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml new file mode 100644 index 00000000..0b2579c3 --- /dev/null +++ b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml @@ -0,0 +1,94 @@ +name: h100_8k1k_ctx1dep16_gen3tep16_batch1_eplb0_mtp0 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8256 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + concurrencies: '6' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml new file mode 100644 index 00000000..f3b27160 --- /dev/null +++ b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml @@ -0,0 +1,104 @@ + + +name: "h100_8k1k_ctx1dep16_gen3tep16_batch2_eplb0_mtp0" + +model: + path: "DeepSeek-R1-0528" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: "fp8" + +resources: + gpu_type: "h100" + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: "1" + TRTLLM_FORCE_ALLTOALL_METHOD: "DeepEP" + + decode_environment: + NCCL_NVLS_ENABLE: "0" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8256 + + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 2 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4] + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "9" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # There are errors about colliding on port 8080, and others. + +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml new file mode 100644 index 00000000..336d9b7f --- /dev/null +++ b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml @@ -0,0 +1,104 @@ + + +name: "h100_8k1k_ctx1dep16_gen3tep16_batch8_eplb0_mtp0" + +model: + path: "DeepSeek-R1-0528" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: "fp8" + +resources: + gpu_type: "h100" + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: "1" + TRTLLM_FORCE_ALLTOALL_METHOD: "DeepEP" + + decode_environment: + NCCL_NVLS_ENABLE: "0" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8256 + + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4, 8] + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "30" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # There are errors about colliding on port 8080, and others. + +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml new file mode 100644 index 00000000..0713169a --- /dev/null +++ b/recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml @@ -0,0 +1,97 @@ +name: h100_8k1k_ctx2dep16_gen1dep16_batch16_eplb0_mtp0 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 2 + prefill_nodes: 4 + decode_workers: 1 + decode_nodes: 2 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8256 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 128 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + concurrencies: '308' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false From f715eb8552bd6823ec744b1cbb8685e54daedee5 Mon Sep 17 00:00:00 2001 From: Nathaniel Levin Date: Thu, 5 Feb 2026 22:56:35 +0000 Subject: [PATCH 3/3] Update H100 configs to tensorrtllm-runtime:0.8.1.post3 Update all 29 H100 FP8 config files to use the new container: - nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3 --- .../h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml | 2 +- .../h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml | 2 +- .../h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml | 2 +- .../h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml | 2 +- .../h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml | 2 +- .../h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml | 2 +- .../h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml | 2 +- .../h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml | 2 +- .../h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml | 2 +- .../h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml | 2 +- .../h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml | 2 +- .../h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml | 2 +- .../h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml | 2 +- .../h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml | 2 +- .../h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml | 2 +- .../h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml | 2 +- .../h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml | 2 +- .../h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml | 2 +- .../h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml | 2 +- .../h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml | 2 +- .../h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml | 2 +- .../h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml | 2 +- .../h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml | 2 +- .../h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml | 2 +- .../h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml | 2 +- .../h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml | 2 +- .../h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml | 2 +- .../h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml | 2 +- .../h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml | 2 +- 29 files changed, 29 insertions(+), 29 deletions(-) diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml index 4231b4b5..104f3b4a 100644 --- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml +++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml @@ -1,7 +1,7 @@ name: h100_1k1k_ctx1dep16_gen1dep16_batch32_eplb0_mtp2_chunked_false model: path: DeepSeek-R1-0528 - container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" precision: fp8 resources: gpu_type: h100 diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml index 33fd5257..4c41ec82 100644 --- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml +++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml @@ -1,7 +1,7 @@ name: h100_1k1k_ctx1dep16_gen1dep16_batch64_eplb0_mtp1_chunked_false model: path: DeepSeek-R1-0528 - container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" precision: fp8 resources: gpu_type: h100 diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml index 7a255e7a..c3dc1408 100644 --- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml +++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml @@ -1,7 +1,7 @@ name: h100_1k1k_ctx1dep16_gen3dep16_batch4_eplb0_mtp3_chunked_false model: path: DeepSeek-R1-0528 - container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" precision: fp8 resources: gpu_type: h100 diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml index 39433c99..8f3663c9 100644 --- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml +++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml @@ -1,7 +1,7 @@ name: h100_1k1k_ctx1dep16_gen3tep16_batch128_eplb0_mtp3_chunked_false model: path: DeepSeek-R1-0528 - container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" precision: fp8 resources: gpu_type: h100 diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml index d400cc19..bd77671a 100644 --- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml +++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml @@ -1,7 +1,7 @@ name: h100_1k1k_ctx1dep16_gen3tep16_batch16_eplb0_mtp3_chunked_false model: path: DeepSeek-R1-0528 - container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" precision: fp8 resources: gpu_type: h100 diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml index b7f2a69e..c1fccbc9 100644 --- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml +++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml @@ -1,7 +1,7 @@ name: h100_1k1k_ctx1dep16_gen3tep16_batch1_eplb0_mtp3_chunked_false model: path: DeepSeek-R1-0528 - container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" precision: fp8 resources: gpu_type: h100 diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml index e0d24147..15c71e8d 100644 --- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml +++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml @@ -1,7 +1,7 @@ name: h100_1k1k_ctx1dep16_gen3tep16_batch2_eplb0_mtp3_chunked_false model: path: DeepSeek-R1-0528 - container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" precision: fp8 resources: gpu_type: h100 diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml index 22084138..4f261058 100644 --- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml +++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml @@ -1,7 +1,7 @@ name: h100_1k1k_ctx1dep16_gen3tep16_batch32_eplb0_mtp3_chunked_false model: path: DeepSeek-R1-0528 - container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" precision: fp8 resources: gpu_type: h100 diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml index 488329d7..07de7a34 100644 --- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml +++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml @@ -1,7 +1,7 @@ name: h100_1k1k_ctx1dep16_gen3tep16_batch8_eplb0_mtp3_chunked_false model: path: DeepSeek-R1-0528 - container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" precision: fp8 resources: gpu_type: h100 diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml index 2f2a57fd..4a55e5ed 100644 --- a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml +++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml @@ -1,7 +1,7 @@ name: ctx1dep16_gen3dep16_batch16_eplb0_mtp0 model: path: DeepSeek-R1-0528 - container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" precision: fp8 resources: gpu_type: h100 diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml index 7ad2d283..2bedf4c2 100644 --- a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml +++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml @@ -1,7 +1,7 @@ name: ctx1dep16_gen3dep16_batch32_eplb0_mtp0 model: path: DeepSeek-R1-0528 - container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" precision: fp8 resources: gpu_type: h100 diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml index 7f3fcd63..1ff9ace4 100644 --- a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml +++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml @@ -1,7 +1,7 @@ name: ctx1dep16_gen3dep16_batch4_eplb0_mtp0 model: path: DeepSeek-R1-0528 - container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" precision: fp8 resources: gpu_type: h100 diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml index cd740057..215e8a6b 100644 --- a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml +++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml @@ -1,7 +1,7 @@ name: ctx1dep16_gen3dep16_batch8_eplb0_mtp0 model: path: DeepSeek-R1-0528 - container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" precision: fp8 resources: gpu_type: h100 diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml index 4601d76b..4281abed 100644 --- a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml +++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml @@ -1,7 +1,7 @@ name: ctx1dep16_gen3tep16_batch16_eplb0_mtp0 model: path: DeepSeek-R1-0528 - container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" precision: fp8 resources: gpu_type: h100 diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml index a0510f6e..a0e0005e 100644 --- a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml +++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml @@ -1,7 +1,7 @@ name: ctx1dep16_gen3tep16_batch1_eplb0_mtp0 model: path: DeepSeek-R1-0528 - container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" precision: fp8 resources: gpu_type: h100 diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml index b46e49a8..6eee90d2 100644 --- a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml +++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml @@ -1,7 +1,7 @@ name: ctx1dep16_gen3tep16_batch2_eplb0_mtp0 model: path: DeepSeek-R1-0528 - container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" precision: fp8 resources: gpu_type: h100 diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml index d83994ab..29e63431 100644 --- a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml +++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml @@ -1,7 +1,7 @@ name: ctx1dep16_gen3tep16_batch8_eplb0_mtp0 model: path: DeepSeek-R1-0528 - container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" precision: fp8 resources: gpu_type: h100 diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml index 134aa346..bb02cdd0 100644 --- a/recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml +++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml @@ -1,7 +1,7 @@ name: ctx2dep16_gen1dep16_batch256_eplb0_mtp0 model: path: DeepSeek-R1-0528 - container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" precision: fp8 resources: gpu_type: h100 diff --git a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml index 48c1dec3..b78cb01a 100644 --- a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml +++ b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml @@ -1,7 +1,7 @@ name: h100_8k1k_ctx1dep16_gen1dep16_batch4_eplb0_mtp3 model: path: DeepSeek-R1-0528 - container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" precision: fp8 resources: gpu_type: h100 diff --git a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml index d66ed765..dd0ddda8 100644 --- a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml +++ b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml @@ -1,7 +1,7 @@ name: h100_8k1k_ctx1dep16_gen2tep16_batch32_eplb0_mtp3 model: path: DeepSeek-R1-0528 - container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" precision: fp8 resources: gpu_type: h100 diff --git a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml index 67128064..2f0ef4e9 100644 --- a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml +++ b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml @@ -1,7 +1,7 @@ name: h100_8k1k_ctx1dep16_gen3tep16_batch1_eplb0_mtp3 model: path: DeepSeek-R1-0528 - container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" precision: fp8 resources: gpu_type: h100 diff --git a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml index 8c18bbfc..be3fc74c 100644 --- a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml +++ b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml @@ -1,7 +1,7 @@ name: h100_8k1k_ctx1dep16_gen3tep16_batch2_eplb0_mtp3 model: path: DeepSeek-R1-0528 - container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" precision: fp8 resources: gpu_type: h100 diff --git a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml index b1f5f926..6a710bbb 100644 --- a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml +++ b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml @@ -1,7 +1,7 @@ name: h100_8k1k_ctx1dep16_gen3tep16_batch8_eplb0_mtp3 model: path: DeepSeek-R1-0528 - container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" precision: fp8 resources: gpu_type: h100 diff --git a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml index 19419bec..4d746af1 100644 --- a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml +++ b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml @@ -1,7 +1,7 @@ name: h100_8k1k_ctx2dep16_gen1dep16_batch8_eplb0_mtp3 model: path: DeepSeek-R1-0528 - container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" precision: fp8 resources: gpu_type: h100 diff --git a/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml index 21d6db8c..2f630277 100644 --- a/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml +++ b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml @@ -4,7 +4,7 @@ name: "h100_8k1k_ctx1dep16_gen2tep16_batch64_eplb0_mtp0" model: path: "DeepSeek-R1-0528" - container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" precision: "fp8" resources: diff --git a/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml index 0b2579c3..9081201b 100644 --- a/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml +++ b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml @@ -1,7 +1,7 @@ name: h100_8k1k_ctx1dep16_gen3tep16_batch1_eplb0_mtp0 model: path: DeepSeek-R1-0528 - container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" precision: fp8 resources: gpu_type: h100 diff --git a/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml index f3b27160..938fd965 100644 --- a/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml +++ b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml @@ -4,7 +4,7 @@ name: "h100_8k1k_ctx1dep16_gen3tep16_batch2_eplb0_mtp0" model: path: "DeepSeek-R1-0528" - container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" precision: "fp8" resources: diff --git a/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml index 336d9b7f..c1eb86c1 100644 --- a/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml +++ b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml @@ -4,7 +4,7 @@ name: "h100_8k1k_ctx1dep16_gen3tep16_batch8_eplb0_mtp0" model: path: "DeepSeek-R1-0528" - container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" precision: "fp8" resources: diff --git a/recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml index 0713169a..40c84770 100644 --- a/recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml +++ b/recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml @@ -1,7 +1,7 @@ name: h100_8k1k_ctx2dep16_gen1dep16_batch16_eplb0_mtp0 model: path: DeepSeek-R1-0528 - container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" precision: fp8 resources: gpu_type: h100