diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml new file mode 100644 index 00000000..104f3b4a --- /dev/null +++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml @@ -0,0 +1,105 @@ +name: h100_1k1k_ctx1dep16_gen1dep16_batch32_eplb0_mtp2_chunked_false +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 1 + decode_nodes: 2 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + decode_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: 
true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '615' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml new file mode 100644 index 00000000..4c41ec82 --- /dev/null +++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml @@ -0,0 +1,109 @@ +name: h100_1k1k_ctx1dep16_gen1dep16_batch64_eplb0_mtp1_chunked_false +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 1 + decode_nodes: 2 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + decode_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + 
TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '1229' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml new file mode 100644 index 00000000..c3dc1408 --- /dev/null +++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml @@ -0,0 +1,101 @@ +name: 
h100_1k1k_ctx1dep16_gen3dep16_batch4_eplb0_mtp3_chunked_false +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + decode_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + 
moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '231' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml new file mode 100644 index 00000000..8f3663c9 --- /dev/null +++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml @@ -0,0 +1,114 @@ +name: h100_1k1k_ctx1dep16_gen3tep16_batch128_eplb0_mtp3_chunked_false +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + kv_cache_config: + 
enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 128 + max_num_tokens: 512 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '462' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml similarity index 63% rename from recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp3.yaml rename to recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml index b0ef1feb..bd77671a 100644 --- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp3.yaml +++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml @@ -1,43 +1,34 @@ -name: ctx1_gen2_dep16_batch16_eplb0_mtp3 - +name: h100_1k1k_ctx1dep16_gen3tep16_batch16_eplb0_mtp3_chunked_false model: - path: "dsr1-fp8" - container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" - precision: "fp8" - + path: 
DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 resources: - gpu_type: "h100" - prefill_nodes: 2 + gpu_type: h100 prefill_workers: 1 - - decode_workers: 2 - decode_nodes: 4 - + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 gpus_per_node: 8 - backend: type: trtllm - prefill_environment: - TLLM_LOG_LEVEL: "INFO" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TRTLLM_ENABLE_PDL: "1" - UCX_RNDV_SCHEME: "put_zcopy" - UCX_MAX_RNDV_RAILS: "1" - UCX_MAX_RMA_RAILS: "1" - + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP decode_environment: - TLLM_LOG_LEVEL: "INFO" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TRTLLM_ENABLE_PDL: "1" - UCX_RNDV_SCHEME: "put_zcopy" - UCX_MAX_RNDV_RAILS: "1" - UCX_MAX_RMA_RAILS: "1" - + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n trtllm_config: prefill: max_batch_size: 2 @@ -50,7 +41,7 @@ backend: print_iter_log: true cuda_graph_config: null disable_overlap_scheduler: true - enable_chunked_prefill: true + enable_chunked_prefill: false moe_config: backend: WIDEEP kv_cache_config: @@ -63,12 +54,11 @@ backend: speculative_config: decoding_type: MTP num_nextn_predict_layers: 3 - decode: tensor_parallel_size: 16 moe_expert_parallel_size: 16 - enable_attention_dp: true - enable_lm_head_tp_in_adp: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false pipeline_parallel_size: 1 max_batch_size: 16 max_num_tokens: 256 @@ -87,7 +77,7 @@ backend: free_gpu_memory_fraction: 0.9 dtype: fp8 moe_config: - backend: WIDEEP 
+ backend: CUTLASS use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 8192 @@ -97,23 +87,14 @@ backend: speculative_config: decoding_type: MTP num_nextn_predict_layers: 3 - - benchmark: - type: "sa-bench" + type: sa-bench isl: 1024 osl: 1024 - concurrencies: ['616'] - req_rate: "inf" - + concurrencies: '60' + req_rate: inf frontend: - nginx_container: "nginx-sqsh" - type: "dynamo" - - -health_check: - max_attempts: 360 - interval_seconds: 10 - + type: dynamo + enable_multiple_frontends: false dynamo: - install: false \ No newline at end of file + install: false diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml new file mode 100644 index 00000000..c1fccbc9 --- /dev/null +++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml @@ -0,0 +1,98 @@ +name: h100_1k1k_ctx1dep16_gen3tep16_batch1_eplb0_mtp3_chunked_false +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + 
cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '6' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml new file mode 100644 index 00000000..15c71e8d --- /dev/null +++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml @@ -0,0 +1,98 @@ +name: h100_1k1k_ctx1dep16_gen3tep16_batch2_eplb0_mtp3_chunked_false +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + 
TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 2 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '9' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml 
b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml new file mode 100644 index 00000000..4f261058 --- /dev/null +++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml @@ -0,0 +1,102 @@ +name: h100_1k1k_ctx1dep16_gen3tep16_batch32_eplb0_mtp3_chunked_false +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + 
print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '117' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml new file mode 100644 index 00000000..07de7a34 --- /dev/null +++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml @@ -0,0 +1,99 @@ +name: h100_1k1k_ctx1dep16_gen3tep16_batch8_eplb0_mtp3_chunked_false +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + 
disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '30' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml new file mode 100644 index 00000000..4a55e5ed --- /dev/null +++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml @@ -0,0 +1,97 @@ +name: ctx1dep16_gen3dep16_batch16_eplb0_mtp0 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + 
NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + decode_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '924' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml new file mode 100644 
index 00000000..2bedf4c2 --- /dev/null +++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml @@ -0,0 +1,99 @@ +name: ctx1dep16_gen3dep16_batch32_eplb0_mtp0 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + decode_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + print_iter_log: true + kv_cache_config: + 
enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '1845' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml new file mode 100644 index 00000000..1ff9ace4 --- /dev/null +++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml @@ -0,0 +1,95 @@ +name: ctx1dep16_gen3dep16_batch4_eplb0_mtp0 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + decode_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + 
disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '231' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml new file mode 100644 index 00000000..215e8a6b --- /dev/null +++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml @@ -0,0 +1,96 @@ +name: ctx1dep16_gen3dep16_batch8_eplb0_mtp0 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + 
UCX_CUDA_IPC_ENABLE_MNNVL: n + decode_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '462' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml new file mode 100644 index 00000000..4281abed --- /dev/null +++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml @@ -0,0 +1,94 @@ +name: 
ctx1dep16_gen3tep16_batch16_eplb0_mtp0 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 1024 
+ osl: 1024 + concurrencies: '60' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml new file mode 100644 index 00000000..a0e0005e --- /dev/null +++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml @@ -0,0 +1,92 @@ +name: ctx1dep16_gen3tep16_batch1_eplb0_mtp0 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8192 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 256 + 
max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '6' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml new file mode 100644 index 00000000..6eee90d2 --- /dev/null +++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml @@ -0,0 +1,92 @@ +name: ctx1dep16_gen3tep16_batch2_eplb0_mtp0 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + 
disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8192 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 2 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '9' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml index 5b85a6ff..29e63431 100644 --- a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml +++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml @@ -1,43 +1,34 @@ -name: ctx1_gen3_tep16_batch8_eplb0_mtp0 - +name: ctx1dep16_gen3tep16_batch8_eplb0_mtp0 model: - path: "dsr1-fp8" - container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" - precision: "fp8" - + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 resources: - gpu_type: "h100" - prefill_nodes: 2 + gpu_type: h100 prefill_workers: 1 - - decode_workers: 2 + prefill_nodes: 2 + decode_workers: 3 decode_nodes: 6 - gpus_per_node: 8 - backend: type: trtllm - prefill_environment: - TLLM_LOG_LEVEL: "INFO" - 
TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TRTLLM_ENABLE_PDL: "1" - UCX_RNDV_SCHEME: "put_zcopy" - UCX_MAX_RNDV_RAILS: "1" - UCX_MAX_RMA_RAILS: "1" - + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP decode_environment: - TLLM_LOG_LEVEL: "INFO" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TRTLLM_ENABLE_PDL: "1" - UCX_RNDV_SCHEME: "put_zcopy" - UCX_MAX_RNDV_RAILS: "1" - UCX_MAX_RMA_RAILS: "1" - + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n trtllm_config: prefill: max_batch_size: 2 @@ -60,7 +51,6 @@ backend: cache_transceiver_config: max_tokens_in_buffer: 8192 backend: UCX - decode: tensor_parallel_size: 16 moe_expert_parallel_size: 16 @@ -90,23 +80,14 @@ backend: backend: UCX stream_interval: 100 num_postprocess_workers: 4 - - benchmark: - type: "sa-bench" + type: sa-bench isl: 1024 osl: 1024 - concurrencies: ['30'] - req_rate: "inf" - + concurrencies: '30' + req_rate: inf frontend: - nginx_container: "nginx-sqsh" - type: "dynamo" - - -health_check: - max_attempts: 360 - interval_seconds: 10 - + type: dynamo + enable_multiple_frontends: false dynamo: - install: false \ No newline at end of file + install: false diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml new file mode 100644 index 00000000..bb02cdd0 --- /dev/null +++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml @@ -0,0 +1,127 @@ +name: ctx2dep16_gen1dep16_batch256_eplb0_mtp0 +model: + path: 
DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 2 + prefill_nodes: 4 + decode_workers: 1 + decode_nodes: 2 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + decode_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 256 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + print_iter_log: true + 
kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '4916' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml similarity index 67% rename from recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp3.yaml rename to recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml index 507d8f72..b78cb01a 100644 --- a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp3.yaml +++ b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml @@ -1,43 +1,36 @@ -name: ctx1_gen2_dep16_batch16_eplb0_mtp3 - +name: h100_8k1k_ctx1dep16_gen1dep16_batch4_eplb0_mtp3 model: - path: "dsr1-fp8" - container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" - precision: "fp8" - + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 resources: - gpu_type: "h100" - prefill_nodes: 2 + gpu_type: h100 prefill_workers: 1 - - decode_nodes: 2 + prefill_nodes: 2 decode_workers: 1 - + decode_nodes: 2 gpus_per_node: 8 - backend: type: trtllm - prefill_environment: - TLLM_LOG_LEVEL: "INFO" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TRTLLM_ENABLE_PDL: "1" - UCX_RNDV_SCHEME: "put_zcopy" - UCX_MAX_RNDV_RAILS: "1" - UCX_MAX_RMA_RAILS: "1" - + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + 
TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP decode_environment: - TLLM_LOG_LEVEL: "INFO" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TRTLLM_ENABLE_PDL: "1" - UCX_RNDV_SCHEME: "put_zcopy" - UCX_MAX_RNDV_RAILS: "1" - UCX_MAX_RMA_RAILS: "1" - + NCCL_NVLS_ENABLE: '0' + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP trtllm_config: prefill: max_batch_size: 1 @@ -50,6 +43,7 @@ backend: print_iter_log: true cuda_graph_config: null disable_overlap_scheduler: true + enable_chunked_prefill: true moe_config: backend: WIDEEP max_num_tokens: 16384 @@ -58,12 +52,11 @@ backend: free_gpu_memory_fraction: 0.3 dtype: fp8 cache_transceiver_config: - max_tokens_in_buffer: 8256 backend: UCX + max_tokens_in_buffer: 8256 speculative_config: decoding_type: MTP num_nextn_predict_layers: 3 - decode: tensor_parallel_size: 16 moe_expert_parallel_size: 16 @@ -95,23 +88,14 @@ backend: speculative_config: decoding_type: MTP num_nextn_predict_layers: 3 - - benchmark: - type: "sa-bench" + type: sa-bench isl: 8192 osl: 1024 - concurrencies: ['77'] - req_rate: "inf" - + concurrencies: '77' + req_rate: inf frontend: - nginx_container: "nginx-sqsh" - type: "dynamo" - - -health_check: - max_attempts: 360 - interval_seconds: 10 - + type: dynamo + enable_multiple_frontends: false dynamo: - install: false \ No newline at end of file + install: false diff --git a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml new file mode 100644 index 00000000..dd0ddda8 --- /dev/null +++ b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml @@ -0,0 +1,103 @@ +name: 
h100_8k1k_ctx1dep16_gen2tep16_batch32_eplb0_mtp3 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 2 + decode_nodes: 4 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8256 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + 
max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + concurrencies: '78' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml new file mode 100644 index 00000000..2f0ef4e9 --- /dev/null +++ b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml @@ -0,0 +1,99 @@ +name: h100_8k1k_ctx1dep16_gen3tep16_batch1_eplb0_mtp3 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + 
backend: UCX + max_tokens_in_buffer: 8256 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + concurrencies: '6' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml new file mode 100644 index 00000000..be3fc74c --- /dev/null +++ b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml @@ -0,0 +1,99 @@ +name: h100_8k1k_ctx1dep16_gen3tep16_batch2_eplb0_mtp3 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: 
'1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8256 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 2 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + concurrencies: '9' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml new file mode 100644 index 00000000..6a710bbb --- /dev/null +++ b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml @@ -0,0 +1,100 @@ +name: h100_8k1k_ctx1dep16_gen3tep16_batch8_eplb0_mtp3 +model: + path: DeepSeek-R1-0528 + 
container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8256 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + 
decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + concurrencies: '30' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml new file mode 100644 index 00000000..4d746af1 --- /dev/null +++ b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml @@ -0,0 +1,102 @@ +name: h100_8k1k_ctx2dep16_gen1dep16_batch8_eplb0_mtp3 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 2 + prefill_nodes: 4 + decode_workers: 1 + decode_nodes: 2 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 
8256 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 128 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + concurrencies: '154' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml new file mode 100644 index 00000000..2f630277 --- /dev/null +++ b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml @@ -0,0 +1,104 @@ + + +name: "h100_8k1k_ctx1dep16_gen2tep16_batch64_eplb0_mtp0" + +model: + path: "DeepSeek-R1-0528" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: "fp8" + +resources: + gpu_type: "h100" + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 2 + decode_nodes: 4 + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: "1" + TRTLLM_FORCE_ALLTOALL_METHOD: "DeepEP" + + decode_environment: + NCCL_NVLS_ENABLE: "0" + TRTLLM_ENABLE_PDL: "1" + 
TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8256 + + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64] + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "154" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # There are errors about colliding on port 8080, and others. 
+ +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml new file mode 100644 index 00000000..9081201b --- /dev/null +++ b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml @@ -0,0 +1,94 @@ +name: h100_8k1k_ctx1dep16_gen3tep16_batch1_eplb0_mtp0 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: 'n' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + UCX_CUDA_IPC_ENABLE_MNNVL: 'n' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8256 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 256 + max_seq_len: 9256 +
cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + concurrencies: '6' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml new file mode 100644 index 00000000..938fd965 --- /dev/null +++ b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml @@ -0,0 +1,104 @@ + + +name: "h100_8k1k_ctx1dep16_gen3tep16_batch2_eplb0_mtp0" + +model: + path: "DeepSeek-R1-0528" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: "fp8" + +resources: + gpu_type: "h100" + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: "1" + TRTLLM_FORCE_ALLTOALL_METHOD: "DeepEP" + + decode_environment: + NCCL_NVLS_ENABLE: "0" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: 
null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8256 + + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 2 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4] + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "9" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # There are errors about colliding on port 8080, and others. 
+ +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml new file mode 100644 index 00000000..c1eb86c1 --- /dev/null +++ b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml @@ -0,0 +1,104 @@ + + +name: "h100_8k1k_ctx1dep16_gen3tep16_batch8_eplb0_mtp0" + +model: + path: "DeepSeek-R1-0528" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: "fp8" + +resources: + gpu_type: "h100" + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: "1" + TRTLLM_FORCE_ALLTOALL_METHOD: "DeepEP" + + decode_environment: + NCCL_NVLS_ENABLE: "0" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8256 + + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + 
enable_padding: true + batch_sizes: [1, 2, 4, 8] + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "30" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # There are errors about colliding on port 8080, and others. + +dynamo: + install: false diff --git a/recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml new file mode 100644 index 00000000..40c84770 --- /dev/null +++ b/recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml @@ -0,0 +1,97 @@ +name: h100_8k1k_ctx2dep16_gen1dep16_batch16_eplb0_mtp0 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 2 + prefill_nodes: 4 + decode_workers: 1 + decode_nodes: 2 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: 'n' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + UCX_CUDA_IPC_ENABLE_MNNVL: 'n' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 
16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8256 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 128 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + concurrencies: '308' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false