Skip to content
This repository was archived by the owner on Apr 20, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
name: h100_1k1k_ctx1dep16_gen1dep16_batch32_eplb0_mtp2_chunked_false
model:
path: DeepSeek-R1-0528
container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
precision: fp8
resources:
gpu_type: h100
prefill_workers: 1
prefill_nodes: 2
decode_workers: 1
decode_nodes: 2
gpus_per_node: 8
backend:
type: trtllm
prefill_environment:
UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
TRTLLM_ENABLE_PDL: '1'
TRTLLM_SERVER_DISABLE_GC: '1'
TRTLLM_WORKER_DISABLE_GC: '1'
NCCL_GRAPH_MIXING_SUPPORT: '0'
TLLM_LOG_LEVEL: INFO
TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
UCX_CUDA_IPC_ENABLE_MNNVL: 'n'
decode_environment:
UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
TRTLLM_ENABLE_PDL: '1'
TRTLLM_SERVER_DISABLE_GC: '1'
TRTLLM_WORKER_DISABLE_GC: '1'
NCCL_GRAPH_MIXING_SUPPORT: '0'
TLLM_LOG_LEVEL: INFO
TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
UCX_CUDA_IPC_ENABLE_MNNVL: 'n'
trtllm_config:
prefill:
max_batch_size: 2
max_num_tokens: 2048
max_seq_len: 2048
tensor_parallel_size: 16
moe_expert_parallel_size: 16
enable_attention_dp: true
pipeline_parallel_size: 1
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
enable_chunked_prefill: false
moe_config:
backend: WIDEEP
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 8192
backend: UCX
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: 2
decode:
tensor_parallel_size: 16
moe_expert_parallel_size: 16
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
pipeline_parallel_size: 1
max_batch_size: 32
max_num_tokens: 256
max_seq_len: 2088
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 24
- 32
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.9
dtype: fp8
moe_config:
backend: WIDEEP
use_low_precision_moe_combine: true
cache_transceiver_config:
max_tokens_in_buffer: 8192
backend: UCX
stream_interval: 100
num_postprocess_workers: 4
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: 2
benchmark:
type: sa-bench
isl: 1024
osl: 1024
concurrencies: '615'
req_rate: inf
frontend:
type: dynamo
enable_multiple_frontends: false
dynamo:
install: false
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
name: h100_1k1k_ctx1dep16_gen1dep16_batch64_eplb0_mtp1_chunked_false
model:
path: DeepSeek-R1-0528
container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
precision: fp8
resources:
gpu_type: h100
prefill_workers: 1
prefill_nodes: 2
decode_workers: 1
decode_nodes: 2
gpus_per_node: 8
backend:
type: trtllm
prefill_environment:
UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
TRTLLM_ENABLE_PDL: '1'
TRTLLM_SERVER_DISABLE_GC: '1'
TRTLLM_WORKER_DISABLE_GC: '1'
NCCL_GRAPH_MIXING_SUPPORT: '0'
TLLM_LOG_LEVEL: INFO
TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
UCX_CUDA_IPC_ENABLE_MNNVL: 'n'
decode_environment:
UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
TRTLLM_ENABLE_PDL: '1'
TRTLLM_SERVER_DISABLE_GC: '1'
TRTLLM_WORKER_DISABLE_GC: '1'
NCCL_GRAPH_MIXING_SUPPORT: '0'
TLLM_LOG_LEVEL: INFO
TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
UCX_CUDA_IPC_ENABLE_MNNVL: 'n'
trtllm_config:
prefill:
max_batch_size: 2
max_num_tokens: 2048
max_seq_len: 2048
tensor_parallel_size: 16
moe_expert_parallel_size: 16
enable_attention_dp: true
pipeline_parallel_size: 1
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
enable_chunked_prefill: false
moe_config:
backend: WIDEEP
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 8192
backend: UCX
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: 1
decode:
tensor_parallel_size: 16
moe_expert_parallel_size: 16
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
pipeline_parallel_size: 1
max_batch_size: 64
max_num_tokens: 256
max_seq_len: 2088
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 24
- 32
- 40
- 48
- 56
- 64
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.9
dtype: fp8
moe_config:
backend: WIDEEP
use_low_precision_moe_combine: true
cache_transceiver_config:
max_tokens_in_buffer: 8192
backend: UCX
stream_interval: 100
num_postprocess_workers: 4
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: 1
benchmark:
type: sa-bench
isl: 1024
osl: 1024
concurrencies: '1229'
req_rate: inf
frontend:
type: dynamo
enable_multiple_frontends: false
dynamo:
install: false
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
name: h100_1k1k_ctx1dep16_gen3dep16_batch4_eplb0_mtp3_chunked_false
model:
path: DeepSeek-R1-0528
container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
precision: fp8
resources:
gpu_type: h100
prefill_workers: 1
prefill_nodes: 2
decode_workers: 3
decode_nodes: 6
gpus_per_node: 8
backend:
type: trtllm
prefill_environment:
UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
TRTLLM_ENABLE_PDL: '1'
TRTLLM_SERVER_DISABLE_GC: '1'
TRTLLM_WORKER_DISABLE_GC: '1'
NCCL_GRAPH_MIXING_SUPPORT: '0'
TLLM_LOG_LEVEL: INFO
TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
UCX_CUDA_IPC_ENABLE_MNNVL: 'n'
decode_environment:
UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
TRTLLM_ENABLE_PDL: '1'
TRTLLM_SERVER_DISABLE_GC: '1'
TRTLLM_WORKER_DISABLE_GC: '1'
NCCL_GRAPH_MIXING_SUPPORT: '0'
TLLM_LOG_LEVEL: INFO
TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
UCX_CUDA_IPC_ENABLE_MNNVL: 'n'
trtllm_config:
prefill:
max_batch_size: 2
max_num_tokens: 2048
max_seq_len: 2048
tensor_parallel_size: 16
moe_expert_parallel_size: 16
enable_attention_dp: true
pipeline_parallel_size: 1
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
enable_chunked_prefill: false
moe_config:
backend: WIDEEP
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 8192
backend: UCX
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: 3
decode:
tensor_parallel_size: 16
moe_expert_parallel_size: 16
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
pipeline_parallel_size: 1
max_batch_size: 4
max_num_tokens: 256
max_seq_len: 2088
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.9
dtype: fp8
moe_config:
backend: WIDEEP
use_low_precision_moe_combine: true
cache_transceiver_config:
max_tokens_in_buffer: 8192
backend: UCX
stream_interval: 100
num_postprocess_workers: 4
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: 3
benchmark:
type: sa-bench
isl: 1024
osl: 1024
concurrencies: '231'
req_rate: inf
frontend:
type: dynamo
enable_multiple_frontends: false
dynamo:
install: false
Loading