From 8092ae42bf1ee9deba5f6071293e7737eb2fc867 Mon Sep 17 00:00:00 2001
From: Nathaniel Levin <nlevin@nvidia.com>
Date: Wed, 4 Feb 2026 04:03:26 +0000
Subject: [PATCH 1/3] Add 1k1k STP and MTP disagg H100 configs

---
 .../ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml   | 105 +++++++++++++++
 .../ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml   | 109 +++++++++++++++
 .../ctx1_gen2_dep16_batch16_eplb0_mtp2.yaml   | 103 ++++++++++++++
 .../ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml    | 101 ++++++++++++++
 ...> ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml} |  81 +++++------
 .../ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml    |  98 ++++++++++++++
 .../ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml    |  98 ++++++++++++++
 .../ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml   | 102 ++++++++++++++
 .../ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml    |  99 ++++++++++++++
 .../ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml   |  97 +++++++++++++
 .../ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml   |  99 ++++++++++++++
 .../ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml    |  95 +++++++++++++
 .../ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml    |  96 +++++++++++++
 .../ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml   |  94 +++++++++++++
 .../ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml    |  92 +++++++++++++
 .../ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml    |  92 +++++++++++++
 .../ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml    |  73 ++++------
 .../ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml  | 127 ++++++++++++++++++
 18 files changed, 1665 insertions(+), 96 deletions(-)
 create mode 100644 recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml
 create mode 100644 recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml
 create mode 100644 recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp2.yaml
 create mode 100644 recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml
 rename recipes/trtllm/h100-fp8/1k1k/mtp/{ctx1_gen2_dep16_batch16_eplb0_mtp3.yaml => ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml} (65%)
 create mode 100644 recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
 create mode 100644 recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
 create mode 100644 recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml
 create mode 100644 recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
 create mode 100644 recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml
 create mode 100644 recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml
 create mode 100644 recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml
 create mode 100644 recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml
 create mode 100644 recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml
 create mode 100644 recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
 create mode 100644 recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
 create mode 100644 recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml

diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml
new file mode 100644
index 00000000..aa34802b
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml
@@ -0,0 +1,105 @@
+name: h100_1k1k_ctx1dep16_gen1dep16_batch32_eplb0_mtp2  # gen1 / batch32 / mtp2 match the decode settings below
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2  # 2 nodes x 8 GPUs = 16 = prefill tensor_parallel_size
+  decode_workers: 1
+  decode_nodes: 2  # 2 nodes x 8 GPUs = 16 = decode tensor_parallel_size
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:  # same variables and values as decode_environment
+    UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
+    TRTLLM_ENABLE_PDL: '1'  # numeric flags are quoted so they stay strings
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean under YAML 1.1
+  decode_environment:
+    UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean under YAML 1.1
+  trtllm_config:
+    prefill:
+      max_batch_size: 2
+      max_num_tokens: 2048  # = max_batch_size (2) x benchmark isl (1024)
+      max_seq_len: 2048
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null  # explicitly unset for prefill; only decode lists batch sizes
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: true
+      moe_config:
+        backend: WIDEEP
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 2  # same value as the decode section
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      pipeline_parallel_size: 1
+      max_batch_size: 32  # the "batch32" in the recipe name
+      max_num_tokens: 256
+      max_seq_len: 2088  # 40 more than isl + osl (1024 + 1024)
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:  # largest entry equals max_batch_size
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+        - 24
+        - 32
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: WIDEEP
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 2  # the "mtp2" in the recipe name
+benchmark:
+  type: sa-bench
+  isl: 1024
+  osl: 1024
+  concurrencies: '615'  # quoted, so this is a string rather than an integer
+  req_rate: inf
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml
new file mode 100644
index 00000000..12a1004e
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml
@@ -0,0 +1,109 @@
+name: h100_1k1k_ctx1dep16_gen1dep16_batch64_eplb0_mtp1  # gen1 / batch64 / mtp1 match the decode settings below
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2  # 2 nodes x 8 GPUs = 16 = prefill tensor_parallel_size
+  decode_workers: 1
+  decode_nodes: 2  # 2 nodes x 8 GPUs = 16 = decode tensor_parallel_size
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:  # same variables and values as decode_environment
+    UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
+    TRTLLM_ENABLE_PDL: '1'  # numeric flags are quoted so they stay strings
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean under YAML 1.1
+  decode_environment:
+    UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean under YAML 1.1
+  trtllm_config:
+    prefill:
+      max_batch_size: 2
+      max_num_tokens: 2048  # = max_batch_size (2) x benchmark isl (1024)
+      max_seq_len: 2048
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null  # explicitly unset for prefill; only decode lists batch sizes
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: true
+      moe_config:
+        backend: WIDEEP
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 1  # same value as the decode section
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      pipeline_parallel_size: 1
+      max_batch_size: 64  # the "batch64" in the recipe name
+      max_num_tokens: 256
+      max_seq_len: 2088  # 40 more than isl + osl (1024 + 1024)
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:  # largest entry equals max_batch_size
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+        - 24
+        - 32
+        - 40
+        - 48
+        - 56
+        - 64
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: WIDEEP
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 1  # the "mtp1" in the recipe name
+benchmark:
+  type: sa-bench
+  isl: 1024
+  osl: 1024
+  concurrencies: '1229'  # quoted, so this is a string rather than an integer
+  req_rate: inf
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp2.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp2.yaml
new file mode 100644
index 00000000..3c729e60
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp2.yaml
@@ -0,0 +1,103 @@
+name: h100_1k1k_ctx1dep16_gen2dep16_batch16_eplb0_mtp2  # gen2 / batch16 / mtp2 match the decode settings below
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2  # 2 nodes x 8 GPUs = 16 = prefill tensor_parallel_size
+  decode_workers: 2
+  decode_nodes: 4  # 4 nodes / 2 workers = 2 nodes (16 GPUs) each, matching decode TP 16
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:  # same variables and values as decode_environment
+    UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
+    TRTLLM_ENABLE_PDL: '1'  # numeric flags are quoted so they stay strings
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean under YAML 1.1
+  decode_environment:
+    UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean under YAML 1.1
+  trtllm_config:
+    prefill:
+      max_batch_size: 2
+      max_num_tokens: 2048  # = max_batch_size (2) x benchmark isl (1024)
+      max_seq_len: 2048
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null  # explicitly unset for prefill; only decode lists batch sizes
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: true
+      moe_config:
+        backend: WIDEEP
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 2  # same value as the decode section
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      pipeline_parallel_size: 1
+      max_batch_size: 16  # the "batch16" in the recipe name
+      max_num_tokens: 256
+      max_seq_len: 2088  # 40 more than isl + osl (1024 + 1024)
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:  # largest entry equals max_batch_size
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: WIDEEP
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 2  # the "mtp2" in the recipe name
+benchmark:
+  type: sa-bench
+  isl: 1024
+  osl: 1024
+  concurrencies: '616'  # quoted, so this is a string rather than an integer
+  req_rate: inf
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml
new file mode 100644
index 00000000..51ff2cfa
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml
@@ -0,0 +1,101 @@
+name: h100_1k1k_ctx1dep16_gen3dep16_batch4_eplb0_mtp3  # gen3 / batch4 / mtp3 match the decode settings below
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2  # 2 nodes x 8 GPUs = 16 = prefill tensor_parallel_size
+  decode_workers: 3
+  decode_nodes: 6  # 6 nodes / 3 workers = 2 nodes (16 GPUs) each, matching decode TP 16
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:  # same variables and values as decode_environment
+    UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
+    TRTLLM_ENABLE_PDL: '1'  # numeric flags are quoted so they stay strings
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean under YAML 1.1
+  decode_environment:
+    UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean under YAML 1.1
+  trtllm_config:
+    prefill:
+      max_batch_size: 2
+      max_num_tokens: 2048  # = max_batch_size (2) x benchmark isl (1024)
+      max_seq_len: 2048
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null  # explicitly unset for prefill; only decode lists batch sizes
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: true
+      moe_config:
+        backend: WIDEEP
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3  # same value as the decode section
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      pipeline_parallel_size: 1
+      max_batch_size: 4  # the "batch4" in the recipe name
+      max_num_tokens: 256
+      max_seq_len: 2088  # 40 more than isl + osl (1024 + 1024)
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:  # largest entry equals max_batch_size
+        - 1
+        - 2
+        - 4
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: WIDEEP
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3  # the "mtp3" in the recipe name
+benchmark:
+  type: sa-bench
+  isl: 1024
+  osl: 1024
+  concurrencies: '231'  # quoted, so this is a string rather than an integer
+  req_rate: inf
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml
similarity index 65%
rename from recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp3.yaml
rename to recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml
index b0ef1feb..af783663 100644
--- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp3.yaml
+++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml
@@ -1,43 +1,34 @@
-name: ctx1_gen2_dep16_batch16_eplb0_mtp3
-
+name: h100_1k1k_ctx1dep16_gen3tep16_batch16_eplb0_mtp3  # gen3 matches decode_workers; "tep" pairs with decode enable_attention_dp: false
 model:
-  path: "dsr1-fp8"
+  path: DeepSeek-R1-0528
   container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
-  precision: "fp8"
-
+  precision: fp8
 resources:
-  gpu_type: "h100"
-  prefill_nodes: 2
+  gpu_type: h100
   prefill_workers: 1
-
-  decode_workers: 2
-  decode_nodes: 4
-
+  prefill_nodes: 2  # 2 nodes x 8 GPUs = 16 GPUs for the single prefill worker
+  decode_workers: 3
+  decode_nodes: 6  # 6 nodes / 3 workers = 2 nodes (16 GPUs) each, matching decode TP 16
   gpus_per_node: 8
-
 backend:
   type: trtllm
-
   prefill_environment:
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TRTLLM_ENABLE_PDL: "1"
-    UCX_RNDV_SCHEME: "put_zcopy"
-    UCX_MAX_RNDV_RAILS: "1"
-    UCX_MAX_RMA_RAILS: "1"
-
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean under YAML 1.1
+    TRTLLM_ENABLE_PDL: '1'  # numeric flags are quoted so they stay strings
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'  # prefill only; not set for decode
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP  # prefill only; not set for decode
   decode_environment:
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TRTLLM_ENABLE_PDL: "1"
-    UCX_RNDV_SCHEME: "put_zcopy"
-    UCX_MAX_RNDV_RAILS: "1"
-    UCX_MAX_RMA_RAILS: "1"
-
+    NCCL_NVLS_ENABLE: '0'  # decode only; not set for prefill
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean under YAML 1.1
   trtllm_config:
     prefill:
       max_batch_size: 2
@@ -63,12 +54,11 @@ backend:
       speculative_config:
         decoding_type: MTP
         num_nextn_predict_layers: 3
-
     decode:
       tensor_parallel_size: 16
       moe_expert_parallel_size: 16
-      enable_attention_dp: true
-      enable_lm_head_tp_in_adp: true
+      enable_attention_dp: false  # attention DP turned off for the decode workers
+      enable_lm_head_tp_in_adp: false
       pipeline_parallel_size: 1
       max_batch_size: 16
       max_num_tokens: 256
@@ -87,7 +77,7 @@ backend:
         free_gpu_memory_fraction: 0.9
         dtype: fp8
       moe_config:
-        backend: WIDEEP
+        backend: CUTLASS  # decode MoE backend switched from WIDEEP
         use_low_precision_moe_combine: true
       cache_transceiver_config:
         max_tokens_in_buffer: 8192
@@ -97,23 +87,14 @@ backend:
       speculative_config:
         decoding_type: MTP
         num_nextn_predict_layers: 3
-
-
 benchmark:
-  type: "sa-bench"
+  type: sa-bench
   isl: 1024
   osl: 1024
-  concurrencies: ['616']
-  req_rate: "inf"
-
+  concurrencies: '60'  # now a quoted string; the previous value was a one-element list
+  req_rate: inf
 frontend:
-  nginx_container: "nginx-sqsh"
-  type: "dynamo"
-
-
-health_check:
-  max_attempts: 360
-  interval_seconds: 10
-
+  type: dynamo
+  enable_multiple_frontends: false
 dynamo:
-  install: false
\ No newline at end of file
+  install: false
diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
new file mode 100644
index 00000000..c367a730
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
@@ -0,0 +1,98 @@
+name: h100_1k1k_ctx1dep16_gen3tep16_batch1_eplb0_mtp3  # gen3 / batch1 / mtp3 match the decode settings below
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2  # 2 nodes x 8 GPUs = 16 = prefill tensor_parallel_size
+  decode_workers: 3
+  decode_nodes: 6  # 6 nodes / 3 workers = 2 nodes (16 GPUs) each, matching decode TP 16
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean under YAML 1.1
+    TRTLLM_ENABLE_PDL: '1'  # numeric flags are quoted so they stay strings
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'  # prefill only; not set for decode
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP  # prefill only; not set for decode
+  decode_environment:
+    NCCL_NVLS_ENABLE: '0'  # decode only; not set for prefill
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean under YAML 1.1
+  trtllm_config:
+    prefill:
+      max_batch_size: 2
+      max_num_tokens: 2048  # = max_batch_size (2) x benchmark isl (1024)
+      max_seq_len: 2048
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null  # explicitly unset for prefill; only decode lists batch sizes
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: true
+      moe_config:
+        backend: WIDEEP
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3  # same value as the decode section
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: false  # off for decode; prefill keeps it on
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 1  # the "batch1" in the recipe name
+      max_num_tokens: 256
+      max_seq_len: 2088  # 40 more than isl + osl (1024 + 1024)
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:  # NOTE(review): list goes up to 4 while max_batch_size is 1 - confirm intended
+        - 1
+        - 2
+        - 4
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: CUTLASS  # decode uses CUTLASS; prefill stays on WIDEEP
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3  # the "mtp3" in the recipe name
+benchmark:
+  type: sa-bench
+  isl: 1024
+  osl: 1024
+  concurrencies: '6'  # quoted, so this is a string rather than an integer
+  req_rate: inf
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
new file mode 100644
index 00000000..1a7b8833
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
@@ -0,0 +1,98 @@
+name: h100_1k1k_ctx1dep16_gen3tep16_batch2_eplb0_mtp3  # gen3 / batch2 / mtp3 match the decode settings below
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2  # 2 nodes x 8 GPUs = 16 = prefill tensor_parallel_size
+  decode_workers: 3
+  decode_nodes: 6  # 6 nodes / 3 workers = 2 nodes (16 GPUs) each, matching decode TP 16
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean under YAML 1.1
+    TRTLLM_ENABLE_PDL: '1'  # numeric flags are quoted so they stay strings
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'  # prefill only; not set for decode
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP  # prefill only; not set for decode
+  decode_environment:
+    NCCL_NVLS_ENABLE: '0'  # decode only; not set for prefill
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean under YAML 1.1
+  trtllm_config:
+    prefill:
+      max_batch_size: 2
+      max_num_tokens: 2048  # = max_batch_size (2) x benchmark isl (1024)
+      max_seq_len: 2048
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null  # explicitly unset for prefill; only decode lists batch sizes
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: true
+      moe_config:
+        backend: WIDEEP
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3  # same value as the decode section
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: false  # off for decode; prefill keeps it on
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 2  # the "batch2" in the recipe name
+      max_num_tokens: 256
+      max_seq_len: 2088  # 40 more than isl + osl (1024 + 1024)
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:  # NOTE(review): list goes up to 4 while max_batch_size is 2 - confirm intended
+        - 1
+        - 2
+        - 4
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: CUTLASS  # decode uses CUTLASS; prefill stays on WIDEEP
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3  # the "mtp3" in the recipe name
+benchmark:
+  type: sa-bench
+  isl: 1024
+  osl: 1024
+  concurrencies: '9'  # quoted, so this is a string rather than an integer
+  req_rate: inf
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml
new file mode 100644
index 00000000..4bf6a5f2
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml
@@ -0,0 +1,102 @@
+name: h100_1k1k_ctx1dep16_gen3tep16_batch32_eplb0_mtp3  # gen3 / batch32 / mtp3 match the decode settings below
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2  # 2 nodes x 8 GPUs = 16 = prefill tensor_parallel_size
+  decode_workers: 3
+  decode_nodes: 6  # 6 nodes / 3 workers = 2 nodes (16 GPUs) each, matching decode TP 16
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean under YAML 1.1
+    TRTLLM_ENABLE_PDL: '1'  # numeric flags are quoted so they stay strings
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'  # prefill only; not set for decode
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP  # prefill only; not set for decode
+  decode_environment:
+    NCCL_NVLS_ENABLE: '0'  # decode only; not set for prefill
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean under YAML 1.1
+  trtllm_config:
+    prefill:
+      max_batch_size: 2
+      max_num_tokens: 2048  # = max_batch_size (2) x benchmark isl (1024)
+      max_seq_len: 2048
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null  # explicitly unset for prefill; only decode lists batch sizes
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: true
+      moe_config:
+        backend: WIDEEP
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3  # same value as the decode section
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: false  # off for decode; prefill keeps it on
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 32  # the "batch32" in the recipe name
+      max_num_tokens: 256
+      max_seq_len: 2088  # 40 more than isl + osl (1024 + 1024)
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:  # largest entry equals max_batch_size
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+        - 24
+        - 32
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: CUTLASS  # decode uses CUTLASS; prefill stays on WIDEEP
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3  # the "mtp3" in the recipe name
+benchmark:
+  type: sa-bench
+  isl: 1024
+  osl: 1024
+  concurrencies: '117'  # quoted, so this is a string rather than an integer
+  req_rate: inf
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
new file mode 100644
index 00000000..70600e72
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
@@ -0,0 +1,99 @@
+name: h100_1k1k_ctx1dep16_gen3tep16_batch8_eplb0_mtp3  # gen3 / batch8 / mtp3 match the decode settings below
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2  # 2 nodes x 8 GPUs = 16 = prefill tensor_parallel_size
+  decode_workers: 3
+  decode_nodes: 6  # 6 nodes / 3 workers = 2 nodes (16 GPUs) each, matching decode TP 16
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean under YAML 1.1
+    TRTLLM_ENABLE_PDL: '1'  # numeric flags are quoted so they stay strings
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'  # prefill only; not set for decode
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP  # prefill only; not set for decode
+  decode_environment:
+    NCCL_NVLS_ENABLE: '0'  # decode only; not set for prefill
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean under YAML 1.1
+  trtllm_config:
+    prefill:
+      max_batch_size: 2
+      max_num_tokens: 2048  # = max_batch_size (2) x benchmark isl (1024)
+      max_seq_len: 2048
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null  # explicitly unset for prefill; only decode lists batch sizes
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: true
+      moe_config:
+        backend: WIDEEP
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3  # same value as the decode section
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: false  # off for decode; prefill keeps it on
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 8  # the "batch8" in the recipe name
+      max_num_tokens: 256
+      max_seq_len: 2088  # 40 more than isl + osl (1024 + 1024)
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:  # largest entry equals max_batch_size
+        - 1
+        - 2
+        - 4
+        - 8
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: CUTLASS  # decode uses CUTLASS; prefill stays on WIDEEP
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3  # the "mtp3" in the recipe name
+benchmark:
+  type: sa-bench
+  isl: 1024
+  osl: 1024
+  concurrencies: '30'  # quoted, so this is a string rather than an integer
+  req_rate: inf
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml
new file mode 100644
index 00000000..2f2a57fd
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml
@@ -0,0 +1,97 @@
+name: ctx1dep16_gen3dep16_batch16_eplb0_mtp0
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2
+  decode_workers: 3
+  decode_nodes: 6
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+    UCX_CUDA_IPC_ENABLE_MNNVL: n
+  decode_environment:
+    UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+    UCX_CUDA_IPC_ENABLE_MNNVL: n
+  trtllm_config:
+    prefill:
+      max_batch_size: 2
+      max_num_tokens: 2048
+      max_seq_len: 2048
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: true
+      moe_config:
+        backend: WIDEEP
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 16
+      max_num_tokens: 256
+      max_seq_len: 2088
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: WIDEEP
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+benchmark:
+  type: sa-bench
+  isl: 1024
+  osl: 1024
+  concurrencies: '924'
+  req_rate: inf
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml
new file mode 100644
index 00000000..774db1e8
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml
@@ -0,0 +1,99 @@
+name: ctx1dep16_gen3dep16_batch32_eplb0_mtp0
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2
+  decode_workers: 3
+  decode_nodes: 6
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+    UCX_CUDA_IPC_ENABLE_MNNVL: n
+  decode_environment:
+    UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+    UCX_CUDA_IPC_ENABLE_MNNVL: n
+  trtllm_config:
+    prefill:
+      max_batch_size: 2
+      max_num_tokens: 2048
+      max_seq_len: 2048
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: true
+      moe_config:
+        backend: WIDEEP
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 32
+      max_num_tokens: 256
+      max_seq_len: 2088
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+        - 24
+        - 32
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: WIDEEP
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+benchmark:
+  type: sa-bench
+  isl: 1024
+  osl: 1024
+  concurrencies: '1845'
+  req_rate: inf
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml
new file mode 100644
index 00000000..fd63b7a1
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml
@@ -0,0 +1,95 @@
+name: ctx1dep16_gen3dep16_batch4_eplb0_mtp0
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2
+  decode_workers: 3
+  decode_nodes: 6
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+    UCX_CUDA_IPC_ENABLE_MNNVL: n
+  decode_environment:
+    UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+    UCX_CUDA_IPC_ENABLE_MNNVL: n
+  trtllm_config:
+    prefill:
+      max_batch_size: 2
+      max_num_tokens: 2048
+      max_seq_len: 2048
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: true
+      moe_config:
+        backend: WIDEEP
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 4
+      max_num_tokens: 256
+      max_seq_len: 2088
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+        - 2
+        - 4
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: WIDEEP
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+benchmark:
+  type: sa-bench
+  isl: 1024
+  osl: 1024
+  concurrencies: '231'
+  req_rate: inf
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml
new file mode 100644
index 00000000..bcf511b9
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml
@@ -0,0 +1,96 @@
+name: ctx1dep16_gen3dep16_batch8_eplb0_mtp0
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2
+  decode_workers: 3
+  decode_nodes: 6
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+    UCX_CUDA_IPC_ENABLE_MNNVL: n
+  decode_environment:
+    UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+    UCX_CUDA_IPC_ENABLE_MNNVL: n
+  trtllm_config:
+    prefill:
+      max_batch_size: 2
+      max_num_tokens: 2048
+      max_seq_len: 2048
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: true
+      moe_config:
+        backend: WIDEEP
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 8
+      max_num_tokens: 256
+      max_seq_len: 2088
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+        - 2
+        - 4
+        - 8
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: WIDEEP
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+benchmark:
+  type: sa-bench
+  isl: 1024
+  osl: 1024
+  concurrencies: '462'
+  req_rate: inf
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml
new file mode 100644
index 00000000..b7f98f34
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml
@@ -0,0 +1,94 @@
+name: ctx1dep16_gen3tep16_batch16_eplb0_mtp0
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2
+  decode_workers: 3
+  decode_nodes: 6
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_CUDA_IPC_ENABLE_MNNVL: n
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+  decode_environment:
+    NCCL_NVLS_ENABLE: '0'
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    UCX_CUDA_IPC_ENABLE_MNNVL: n
+  trtllm_config:
+    prefill:
+      max_batch_size: 2
+      max_num_tokens: 2048
+      max_seq_len: 2048
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: true
+      moe_config:
+        backend: WIDEEP
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 16
+      max_num_tokens: 256
+      max_seq_len: 2088
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+benchmark:
+  type: sa-bench
+  isl: 1024
+  osl: 1024
+  concurrencies: '60'
+  req_rate: inf
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
new file mode 100644
index 00000000..a0510f6e
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
@@ -0,0 +1,92 @@
+name: ctx1dep16_gen3tep16_batch1_eplb0_mtp0
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2
+  decode_workers: 3
+  decode_nodes: 6
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_CUDA_IPC_ENABLE_MNNVL: n
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+  decode_environment:
+    NCCL_NVLS_ENABLE: '0'
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    UCX_CUDA_IPC_ENABLE_MNNVL: n
+  trtllm_config:
+    prefill:
+      max_batch_size: 2
+      max_num_tokens: 2048
+      max_seq_len: 2048
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: true
+      moe_config:
+        backend: WIDEEP
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 1
+      max_num_tokens: 256
+      max_seq_len: 2088
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+        - 2
+        - 4
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+benchmark:
+  type: sa-bench
+  isl: 1024
+  osl: 1024
+  concurrencies: '6'
+  req_rate: inf
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
new file mode 100644
index 00000000..b46e49a8
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
@@ -0,0 +1,92 @@
+name: ctx1dep16_gen3tep16_batch2_eplb0_mtp0
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2
+  decode_workers: 3
+  decode_nodes: 6
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_CUDA_IPC_ENABLE_MNNVL: n
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+  decode_environment:
+    NCCL_NVLS_ENABLE: '0'
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    UCX_CUDA_IPC_ENABLE_MNNVL: n
+  trtllm_config:
+    prefill:
+      max_batch_size: 2
+      max_num_tokens: 2048
+      max_seq_len: 2048
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: true
+      moe_config:
+        backend: WIDEEP
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 2
+      max_num_tokens: 256
+      max_seq_len: 2088
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+        - 2
+        - 4
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+benchmark:
+  type: sa-bench
+  isl: 1024
+  osl: 1024
+  concurrencies: '9'
+  req_rate: inf
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
index 5b85a6ff..d83994ab 100644
--- a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
+++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
@@ -1,43 +1,34 @@
-name: ctx1_gen3_tep16_batch8_eplb0_mtp0
-
+name: ctx1dep16_gen3tep16_batch8_eplb0_mtp0
 model:
-  path: "dsr1-fp8"
+  path: DeepSeek-R1-0528
   container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
-  precision: "fp8"
-
+  precision: fp8
 resources:
-  gpu_type: "h100"
-  prefill_nodes: 2
+  gpu_type: h100
   prefill_workers: 1
-
-  decode_workers: 2
+  prefill_nodes: 2
+  decode_workers: 3
   decode_nodes: 6
-
   gpus_per_node: 8
-
 backend:
   type: trtllm
-
   prefill_environment:
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TRTLLM_ENABLE_PDL: "1"
-    UCX_RNDV_SCHEME: "put_zcopy"
-    UCX_MAX_RNDV_RAILS: "1"
-    UCX_MAX_RMA_RAILS: "1"
-
+    UCX_CUDA_IPC_ENABLE_MNNVL: n
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
   decode_environment:
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TRTLLM_ENABLE_PDL: "1"
-    UCX_RNDV_SCHEME: "put_zcopy"
-    UCX_MAX_RNDV_RAILS: "1"
-    UCX_MAX_RMA_RAILS: "1"
-
+    NCCL_NVLS_ENABLE: '0'
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    UCX_CUDA_IPC_ENABLE_MNNVL: n
   trtllm_config:
     prefill:
       max_batch_size: 2
@@ -60,7 +51,6 @@ backend:
       cache_transceiver_config:
         max_tokens_in_buffer: 8192
         backend: UCX
-
     decode:
       tensor_parallel_size: 16
       moe_expert_parallel_size: 16
@@ -90,23 +80,14 @@ backend:
         backend: UCX
       stream_interval: 100
       num_postprocess_workers: 4
-
-
 benchmark:
-  type: "sa-bench"
+  type: sa-bench
   isl: 1024
   osl: 1024
-  concurrencies: ['30']
-  req_rate: "inf"
-
+  concurrencies: '30'
+  req_rate: inf
 frontend:
-  nginx_container: "nginx-sqsh"
-  type: "dynamo"
-
-
-health_check:
-  max_attempts: 360
-  interval_seconds: 10
-
+  type: dynamo
+  enable_multiple_frontends: false
 dynamo:
-  install: false
\ No newline at end of file
+  install: false
diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml
new file mode 100644
index 00000000..134aa346
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml
@@ -0,0 +1,127 @@
+name: ctx2dep16_gen1dep16_batch256_eplb0_mtp0
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 2
+  prefill_nodes: 4
+  decode_workers: 1
+  decode_nodes: 2
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+    UCX_CUDA_IPC_ENABLE_MNNVL: n
+  decode_environment:
+    UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+    UCX_CUDA_IPC_ENABLE_MNNVL: n
+  trtllm_config:
+    prefill:
+      max_batch_size: 2
+      max_num_tokens: 2048
+      max_seq_len: 2048
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: true
+      moe_config:
+        backend: WIDEEP
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 256
+      max_num_tokens: 256
+      max_seq_len: 2088
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+        - 24
+        - 32
+        - 40
+        - 48
+        - 56
+        - 64
+        - 72
+        - 80
+        - 88
+        - 96
+        - 104
+        - 112
+        - 120
+        - 128
+        - 136
+        - 144
+        - 152
+        - 160
+        - 168
+        - 176
+        - 184
+        - 192
+        - 200
+        - 208
+        - 216
+        - 224
+        - 232
+        - 240
+        - 248
+        - 256
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: WIDEEP
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+benchmark:
+  type: sa-bench
+  isl: 1024
+  osl: 1024
+  concurrencies: '4916'
+  req_rate: inf
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false

From 1fd335e194fbe2b5b67882df8383c01e78a476ac Mon Sep 17 00:00:00 2001
From: Nathaniel Levin <nlevin@nvidia.com>
Date: Thu, 5 Feb 2026 05:26:21 +0000
Subject: [PATCH 2/3] Update H100 FP8 configs with 29 verified Pareto-optimal
 points

Replace previous configs with verified Pareto-optimal configurations:
- 1k1k MTP: 9 configs (conc: 6, 9, 30, 60, 117, 231, 462, 615, 1229)
- 1k1k STP: 9 configs (conc: 6, 9, 30, 60, 231, 462, 924, 1845, 4916)
- 8k1k MTP: 6 configs (conc: 6, 9, 30, 77, 78, 154)
- 8k1k STP: 5 configs (conc: 6, 9, 30, 154, 308)

Standardize container to nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1
---
 .../ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml   |   4 +-
 .../ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml   |   4 +-
 .../ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml    |   4 +-
 .../ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml  | 114 ++++++++++++++++++
 .../ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml   |   4 +-
 .../ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml    |   4 +-
 .../ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml    |   4 +-
 .../ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml   |   4 +-
 .../ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml    |   4 +-
 .../ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml   |   2 +-
 .../ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml    |   2 +-
 .../ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml    |   2 +-
 .../ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml   |   2 +-
 .../ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml}   |  42 +++----
 .../ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml   | 103 ++++++++++++++++
 .../ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml    |  99 +++++++++++++++
 .../ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml    |  99 +++++++++++++++
 .../ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml    | 100 +++++++++++++++
 .../ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml    | 102 ++++++++++++++++
 .../ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml}  |  75 +++++-------
 .../ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml    |  94 +++++++++++++++
 .../ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml    | 104 ++++++++++++++++
 .../ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml    | 104 ++++++++++++++++
 .../ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml   |  97 +++++++++++++++
 24 files changed, 1087 insertions(+), 86 deletions(-)
 create mode 100644 recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml
 rename recipes/trtllm/h100-fp8/{1k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp2.yaml => 8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml} (79%)
 create mode 100644 recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml
 create mode 100644 recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
 create mode 100644 recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
 create mode 100644 recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
 create mode 100644 recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml
 rename recipes/trtllm/h100-fp8/8k1k/{mtp/ctx1_gen2_dep16_batch16_eplb0_mtp3.yaml => stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml} (67%)
 create mode 100644 recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
 create mode 100644 recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
 create mode 100644 recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
 create mode 100644 recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml

diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml
index aa34802b..4231b4b5 100644
--- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml
+++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml
@@ -1,4 +1,4 @@
-name: h100_1k1k_ctx1dep16_gen1dep16_batch32_eplb0_mtp2
+name: h100_1k1k_ctx1dep16_gen1dep16_batch32_eplb0_mtp2_chunked_false
 model:
   path: DeepSeek-R1-0528
   container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
@@ -44,7 +44,7 @@ backend:
       print_iter_log: true
       cuda_graph_config: null
       disable_overlap_scheduler: true
-      enable_chunked_prefill: true
+      enable_chunked_prefill: false
       moe_config:
         backend: WIDEEP
       kv_cache_config:
diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml
index 12a1004e..33fd5257 100644
--- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml
+++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml
@@ -1,4 +1,4 @@
-name: h100_1k1k_ctx1dep16_gen1dep16_batch64_eplb0_mtp1
+name: h100_1k1k_ctx1dep16_gen1dep16_batch64_eplb0_mtp1_chunked_false
 model:
   path: DeepSeek-R1-0528
   container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
@@ -44,7 +44,7 @@ backend:
       print_iter_log: true
       cuda_graph_config: null
       disable_overlap_scheduler: true
-      enable_chunked_prefill: true
+      enable_chunked_prefill: false
       moe_config:
         backend: WIDEEP
       kv_cache_config:
diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml
index 51ff2cfa..7a255e7a 100644
--- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml
+++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml
@@ -1,4 +1,4 @@
-name: h100_1k1k_ctx1dep16_gen3dep16_batch4_eplb0_mtp3
+name: h100_1k1k_ctx1dep16_gen3dep16_batch4_eplb0_mtp3_chunked_false
 model:
   path: DeepSeek-R1-0528
   container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
@@ -44,7 +44,7 @@ backend:
       print_iter_log: true
       cuda_graph_config: null
       disable_overlap_scheduler: true
-      enable_chunked_prefill: true
+      enable_chunked_prefill: false
       moe_config:
         backend: WIDEEP
       kv_cache_config:
diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml
new file mode 100644
index 00000000..39433c99
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml
@@ -0,0 +1,114 @@
+name: h100_1k1k_ctx1dep16_gen3tep16_batch128_eplb0_mtp3_chunked_false
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2
+  decode_workers: 3
+  decode_nodes: 6
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_CUDA_IPC_ENABLE_MNNVL: n
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+  decode_environment:
+    NCCL_NVLS_ENABLE: '0'
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    UCX_CUDA_IPC_ENABLE_MNNVL: n
+  trtllm_config:
+    prefill:
+      max_batch_size: 2
+      max_num_tokens: 2048
+      max_seq_len: 2048
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: false
+      moe_config:
+        backend: WIDEEP
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 128
+      max_num_tokens: 512
+      max_seq_len: 2088
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+        - 24
+        - 32
+        - 40
+        - 48
+        - 56
+        - 64
+        - 72
+        - 80
+        - 88
+        - 96
+        - 104
+        - 112
+        - 120
+        - 128
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+benchmark:
+  type: sa-bench
+  isl: 1024
+  osl: 1024
+  concurrencies: '462'
+  req_rate: inf
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml
index af783663..d400cc19 100644
--- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml
+++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml
@@ -1,4 +1,4 @@
-name: h100_1k1k_ctx1dep16_gen3tep16_batch16_eplb0_mtp3
+name: h100_1k1k_ctx1dep16_gen3tep16_batch16_eplb0_mtp3_chunked_false
 model:
   path: DeepSeek-R1-0528
   container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
@@ -41,7 +41,7 @@ backend:
       print_iter_log: true
       cuda_graph_config: null
       disable_overlap_scheduler: true
-      enable_chunked_prefill: true
+      enable_chunked_prefill: false
       moe_config:
         backend: WIDEEP
       kv_cache_config:
diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
index c367a730..b7f2a69e 100644
--- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
+++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
@@ -1,4 +1,4 @@
-name: h100_1k1k_ctx1dep16_gen3tep16_batch1_eplb0_mtp3
+name: h100_1k1k_ctx1dep16_gen3tep16_batch1_eplb0_mtp3_chunked_false
 model:
   path: DeepSeek-R1-0528
   container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
@@ -41,7 +41,7 @@ backend:
       print_iter_log: true
       cuda_graph_config: null
       disable_overlap_scheduler: true
-      enable_chunked_prefill: true
+      enable_chunked_prefill: false
       moe_config:
         backend: WIDEEP
       kv_cache_config:
diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
index 1a7b8833..e0d24147 100644
--- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
+++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
@@ -1,4 +1,4 @@
-name: h100_1k1k_ctx1dep16_gen3tep16_batch2_eplb0_mtp3
+name: h100_1k1k_ctx1dep16_gen3tep16_batch2_eplb0_mtp3_chunked_false
 model:
   path: DeepSeek-R1-0528
   container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
@@ -41,7 +41,7 @@ backend:
       print_iter_log: true
       cuda_graph_config: null
       disable_overlap_scheduler: true
-      enable_chunked_prefill: true
+      enable_chunked_prefill: false
       moe_config:
         backend: WIDEEP
       kv_cache_config:
diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml
index 4bf6a5f2..22084138 100644
--- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml
+++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml
@@ -1,4 +1,4 @@
-name: h100_1k1k_ctx1dep16_gen3tep16_batch32_eplb0_mtp3
+name: h100_1k1k_ctx1dep16_gen3tep16_batch32_eplb0_mtp3_chunked_false
 model:
   path: DeepSeek-R1-0528
   container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
@@ -41,7 +41,7 @@ backend:
       print_iter_log: true
       cuda_graph_config: null
       disable_overlap_scheduler: true
-      enable_chunked_prefill: true
+      enable_chunked_prefill: false
       moe_config:
         backend: WIDEEP
       kv_cache_config:
diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
index 70600e72..488329d7 100644
--- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
+++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
@@ -1,4 +1,4 @@
-name: h100_1k1k_ctx1dep16_gen3tep16_batch8_eplb0_mtp3
+name: h100_1k1k_ctx1dep16_gen3tep16_batch8_eplb0_mtp3_chunked_false
 model:
   path: DeepSeek-R1-0528
   container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
@@ -41,7 +41,7 @@ backend:
       print_iter_log: true
       cuda_graph_config: null
       disable_overlap_scheduler: true
-      enable_chunked_prefill: true
+      enable_chunked_prefill: false
       moe_config:
         backend: WIDEEP
       kv_cache_config:
diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml
index 774db1e8..7ad2d283 100644
--- a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml
+++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml
@@ -44,7 +44,7 @@ backend:
       print_iter_log: true
       cuda_graph_config: null
       disable_overlap_scheduler: true
-      enable_chunked_prefill: true
+      enable_chunked_prefill: false
       moe_config:
         backend: WIDEEP
       kv_cache_config:
diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml
index fd63b7a1..7f3fcd63 100644
--- a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml
+++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml
@@ -44,7 +44,7 @@ backend:
       print_iter_log: true
       cuda_graph_config: null
       disable_overlap_scheduler: true
-      enable_chunked_prefill: true
+      enable_chunked_prefill: false
       moe_config:
         backend: WIDEEP
       kv_cache_config:
diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml
index bcf511b9..cd740057 100644
--- a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml
+++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml
@@ -44,7 +44,7 @@ backend:
       print_iter_log: true
       cuda_graph_config: null
       disable_overlap_scheduler: true
-      enable_chunked_prefill: true
+      enable_chunked_prefill: false
       moe_config:
         backend: WIDEEP
       kv_cache_config:
diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml
index b7f98f34..4601d76b 100644
--- a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml
+++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml
@@ -41,7 +41,7 @@ backend:
       print_iter_log: true
       cuda_graph_config: null
       disable_overlap_scheduler: true
-      enable_chunked_prefill: true
+      enable_chunked_prefill: false
       moe_config:
         backend: WIDEEP
       kv_cache_config:
diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp2.yaml b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml
similarity index 79%
rename from recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp2.yaml
rename to recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml
index 3c729e60..48c1dec3 100644
--- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp2.yaml
+++ b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml
@@ -1,4 +1,4 @@
-name: h100_1k1k_ctx1dep16_gen2dep16_batch16_eplb0_mtp2
+name: h100_8k1k_ctx1dep16_gen1dep16_batch4_eplb0_mtp3
 model:
   path: DeepSeek-R1-0528
   container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
@@ -7,13 +7,13 @@ resources:
   gpu_type: h100
   prefill_workers: 1
   prefill_nodes: 2
-  decode_workers: 2
-  decode_nodes: 4
+  decode_workers: 1
+  decode_nodes: 2
   gpus_per_node: 8
 backend:
   type: trtllm
   prefill_environment:
-    UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
+    UCX_CUDA_IPC_ENABLE_MNNVL: n
     TRTLLM_ENABLE_PDL: '1'
     TRTLLM_SERVER_DISABLE_GC: '1'
     TRTLLM_WORKER_DISABLE_GC: '1'
@@ -21,9 +21,9 @@ backend:
     TLLM_LOG_LEVEL: INFO
     TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
     TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
-    UCX_CUDA_IPC_ENABLE_MNNVL: n
   decode_environment:
-    UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
+    NCCL_NVLS_ENABLE: '0'
+    UCX_CUDA_IPC_ENABLE_MNNVL: n
     TRTLLM_ENABLE_PDL: '1'
     TRTLLM_SERVER_DISABLE_GC: '1'
     TRTLLM_WORKER_DISABLE_GC: '1'
@@ -31,12 +31,11 @@ backend:
     TLLM_LOG_LEVEL: INFO
     TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
     TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
-    UCX_CUDA_IPC_ENABLE_MNNVL: n
   trtllm_config:
     prefill:
-      max_batch_size: 2
-      max_num_tokens: 2048
-      max_seq_len: 2048
+      max_batch_size: 1
+      max_num_tokens: 8224
+      max_seq_len: 8232
       tensor_parallel_size: 16
       moe_expert_parallel_size: 16
       enable_attention_dp: true
@@ -47,33 +46,32 @@ backend:
       enable_chunked_prefill: true
       moe_config:
         backend: WIDEEP
+        max_num_tokens: 16384
       kv_cache_config:
         enable_block_reuse: false
-        free_gpu_memory_fraction: 0.6
+        free_gpu_memory_fraction: 0.3
         dtype: fp8
       cache_transceiver_config:
-        max_tokens_in_buffer: 8192
         backend: UCX
+        max_tokens_in_buffer: 8256
       speculative_config:
         decoding_type: MTP
-        num_nextn_predict_layers: 2
+        num_nextn_predict_layers: 3
     decode:
       tensor_parallel_size: 16
       moe_expert_parallel_size: 16
       enable_attention_dp: true
       enable_lm_head_tp_in_adp: true
       pipeline_parallel_size: 1
-      max_batch_size: 16
-      max_num_tokens: 256
-      max_seq_len: 2088
+      max_batch_size: 4
+      max_num_tokens: 128
+      max_seq_len: 9256
       cuda_graph_config:
         enable_padding: true
         batch_sizes:
         - 1
         - 2
         - 4
-        - 8
-        - 16
       print_iter_log: true
       kv_cache_config:
         enable_block_reuse: false
@@ -83,18 +81,18 @@ backend:
         backend: WIDEEP
         use_low_precision_moe_combine: true
       cache_transceiver_config:
-        max_tokens_in_buffer: 8192
+        max_tokens_in_buffer: 8256
         backend: UCX
       stream_interval: 100
       num_postprocess_workers: 4
       speculative_config:
         decoding_type: MTP
-        num_nextn_predict_layers: 2
+        num_nextn_predict_layers: 3
 benchmark:
   type: sa-bench
-  isl: 1024
+  isl: 8192
   osl: 1024
-  concurrencies: '616'
+  concurrencies: '77'
   req_rate: inf
 frontend:
   type: dynamo
diff --git a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml
new file mode 100644
index 00000000..d66ed765
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml
@@ -0,0 +1,103 @@
+name: h100_8k1k_ctx1dep16_gen2tep16_batch32_eplb0_mtp3
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2
+  decode_workers: 2
+  decode_nodes: 4
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_CUDA_IPC_ENABLE_MNNVL: n
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+  decode_environment:
+    NCCL_NVLS_ENABLE: '0'
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    UCX_CUDA_IPC_ENABLE_MNNVL: n
+  trtllm_config:
+    prefill:
+      max_batch_size: 1
+      max_num_tokens: 8224
+      max_seq_len: 8232
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: false
+      moe_config:
+        backend: WIDEEP
+        max_num_tokens: 16384
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.3
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8256
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 32
+      max_num_tokens: 256
+      max_seq_len: 9256
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+        - 24
+        - 32
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8256
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+benchmark:
+  type: sa-bench
+  isl: 8192
+  osl: 1024
+  concurrencies: '78'
+  req_rate: inf
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
new file mode 100644
index 00000000..67128064
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
@@ -0,0 +1,99 @@
+name: h100_8k1k_ctx1dep16_gen3tep16_batch1_eplb0_mtp3
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2
+  decode_workers: 3
+  decode_nodes: 6
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_CUDA_IPC_ENABLE_MNNVL: n
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+  decode_environment:
+    NCCL_NVLS_ENABLE: '0'
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    UCX_CUDA_IPC_ENABLE_MNNVL: n
+  trtllm_config:
+    prefill:
+      max_batch_size: 1
+      max_num_tokens: 8224
+      max_seq_len: 8232
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: true
+      moe_config:
+        backend: WIDEEP
+        max_num_tokens: 16384
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.3
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8256
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 1
+      max_num_tokens: 256
+      max_seq_len: 9256
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+        - 2
+        - 4
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8256
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+benchmark:
+  type: sa-bench
+  isl: 8192
+  osl: 1024
+  concurrencies: '6'
+  req_rate: inf
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
new file mode 100644
index 00000000..8c18bbfc
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
@@ -0,0 +1,99 @@
+name: h100_8k1k_ctx1dep16_gen3tep16_batch2_eplb0_mtp3
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2
+  decode_workers: 3
+  decode_nodes: 6
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_CUDA_IPC_ENABLE_MNNVL: n
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+  decode_environment:
+    NCCL_NVLS_ENABLE: '0'
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    UCX_CUDA_IPC_ENABLE_MNNVL: n
+  trtllm_config:
+    prefill:
+      max_batch_size: 1
+      max_num_tokens: 8224
+      max_seq_len: 8232
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: true
+      moe_config:
+        backend: WIDEEP
+        max_num_tokens: 16384
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.3
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8256
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 2
+      max_num_tokens: 256
+      max_seq_len: 9256
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+        - 2
+        - 4
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8256
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+benchmark:
+  type: sa-bench
+  isl: 8192
+  osl: 1024
+  concurrencies: '9'
+  req_rate: inf
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
new file mode 100644
index 00000000..b1f5f926
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
@@ -0,0 +1,100 @@
+name: h100_8k1k_ctx1dep16_gen3tep16_batch8_eplb0_mtp3
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2
+  decode_workers: 3
+  decode_nodes: 6
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_CUDA_IPC_ENABLE_MNNVL: n
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+  decode_environment:
+    NCCL_NVLS_ENABLE: '0'
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    UCX_CUDA_IPC_ENABLE_MNNVL: n
+  trtllm_config:
+    prefill:
+      max_batch_size: 1
+      max_num_tokens: 8224
+      max_seq_len: 8232
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: false
+      moe_config:
+        backend: WIDEEP
+        max_num_tokens: 16384
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.3
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8256
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 8
+      max_num_tokens: 256
+      max_seq_len: 9256
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+        - 2
+        - 4
+        - 8
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8256
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+benchmark:
+  type: sa-bench
+  isl: 8192
+  osl: 1024
+  concurrencies: '30'
+  req_rate: inf
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml
new file mode 100644
index 00000000..19419bec
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml
@@ -0,0 +1,102 @@
+name: h100_8k1k_ctx2dep16_gen1dep16_batch8_eplb0_mtp3
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 2
+  prefill_nodes: 4
+  decode_workers: 1
+  decode_nodes: 2
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_CUDA_IPC_ENABLE_MNNVL: n
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+  decode_environment:
+    NCCL_NVLS_ENABLE: '0'
+    UCX_CUDA_IPC_ENABLE_MNNVL: n
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+  trtllm_config:
+    prefill:
+      max_batch_size: 1
+      max_num_tokens: 8224
+      max_seq_len: 8232
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: true
+      moe_config:
+        backend: WIDEEP
+        max_num_tokens: 16384
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.3
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8256
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      pipeline_parallel_size: 1
+      max_batch_size: 8
+      max_num_tokens: 128
+      max_seq_len: 9256
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+        - 2
+        - 4
+        - 8
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: WIDEEP
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8256
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+benchmark:
+  type: sa-bench
+  isl: 8192
+  osl: 1024
+  concurrencies: '154'
+  req_rate: inf
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml
similarity index 67%
rename from recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp3.yaml
rename to recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml
index 507d8f72..21d6db8c 100644
--- a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp3.yaml
+++ b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml
@@ -1,42 +1,41 @@
-name: ctx1_gen2_dep16_batch16_eplb0_mtp3
+
+
+name: "h100_8k1k_ctx1dep16_gen2tep16_batch64_eplb0_mtp0"
 
 model:
-  path: "dsr1-fp8"
+  path: "DeepSeek-R1-0528"
   container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
   precision: "fp8"
 
 resources:
   gpu_type: "h100"
-  prefill_nodes: 2
   prefill_workers: 1
-
-  decode_nodes: 2
-  decode_workers: 1
-
+  prefill_nodes: 2  # 1 worker x (TP16 / 8 GPUs per node)
+  decode_workers: 2
+  decode_nodes: 4  # 2 workers x (TP16 / 8 GPUs per node)
   gpus_per_node: 8
 
 backend:
   type: trtllm
 
   prefill_environment:
-    TLLM_LOG_LEVEL: "INFO"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "n"
+    TRTLLM_ENABLE_PDL: "1"
     TRTLLM_SERVER_DISABLE_GC: "1"
     TRTLLM_WORKER_DISABLE_GC: "1"
     NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TRTLLM_ENABLE_PDL: "1"
-    UCX_RNDV_SCHEME: "put_zcopy"
-    UCX_MAX_RNDV_RAILS: "1"
-    UCX_MAX_RMA_RAILS: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: "1"
+    TRTLLM_FORCE_ALLTOALL_METHOD: "DeepEP"
 
   decode_environment:
-    TLLM_LOG_LEVEL: "INFO"
+    NCCL_NVLS_ENABLE: "0"
+    TRTLLM_ENABLE_PDL: "1"
     TRTLLM_SERVER_DISABLE_GC: "1"
     TRTLLM_WORKER_DISABLE_GC: "1"
     NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TRTLLM_ENABLE_PDL: "1"
-    UCX_RNDV_SCHEME: "put_zcopy"
-    UCX_MAX_RNDV_RAILS: "1"
-    UCX_MAX_RMA_RAILS: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "n"
 
   trtllm_config:
     prefill:
@@ -50,68 +49,56 @@ backend:
       print_iter_log: true
       cuda_graph_config: null
       disable_overlap_scheduler: true
+      enable_chunked_prefill: false
       moe_config:
         backend: WIDEEP
         max_num_tokens: 16384
       kv_cache_config:
         enable_block_reuse: false
         free_gpu_memory_fraction: 0.3
         dtype: fp8
       cache_transceiver_config:
-        max_tokens_in_buffer: 8256
         backend: UCX
-      speculative_config:
-        decoding_type: MTP
-        num_nextn_predict_layers: 3
+        max_tokens_in_buffer: 8256
+
 
     decode:
       tensor_parallel_size: 16
       moe_expert_parallel_size: 16
-      enable_attention_dp: true
-      enable_lm_head_tp_in_adp: true
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
       pipeline_parallel_size: 1
-      max_batch_size: 4
-      max_num_tokens: 128
+      max_batch_size: 64
+      max_num_tokens: 256
       max_seq_len: 9256
       cuda_graph_config:
         enable_padding: true
-        batch_sizes:
-        - 1
-        - 2
-        - 4
+        batch_sizes: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64]
       print_iter_log: true
       kv_cache_config:
         enable_block_reuse: false
         free_gpu_memory_fraction: 0.9
         dtype: fp8
       moe_config:
-        backend: WIDEEP
+        backend: CUTLASS
         use_low_precision_moe_combine: true
       cache_transceiver_config:
         max_tokens_in_buffer: 8256
         backend: UCX
       stream_interval: 100
       num_postprocess_workers: 4
-      speculative_config:
-        decoding_type: MTP
-        num_nextn_predict_layers: 3
 
 
 benchmark:
   type: "sa-bench"
   isl: 8192
   osl: 1024
-  concurrencies: ['77']
+  concurrencies: "154"
   req_rate: "inf"
 
 frontend:
-  nginx_container: "nginx-sqsh"
   type: "dynamo"
-
-
-health_check:
-  max_attempts: 360
-  interval_seconds: 10
+  enable_multiple_frontends: false  # Kept off: multiple frontends hit port 8080 collisions, among other errors.
 
 dynamo:
-  install: false
\ No newline at end of file
+  install: false
diff --git a/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
new file mode 100644
index 00000000..0b2579c3
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
@@ -0,0 +1,94 @@
+name: h100_8k1k_ctx1dep16_gen3tep16_batch1_eplb0_mtp0
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2  # 1 worker x (TP16 / 8 GPUs per node)
+  decode_workers: 3
+  decode_nodes: 6  # 3 workers x (TP16 / 8 GPUs per node)
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean under YAML 1.1
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+  decode_environment:
+    NCCL_NVLS_ENABLE: '0'
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean under YAML 1.1
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+  trtllm_config:
+    prefill:
+      max_batch_size: 1
+      max_num_tokens: 8224
+      max_seq_len: 8232
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null  # explicitly null for prefill; decode configures it below
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: false
+      moe_config:
+        backend: WIDEEP
+        max_num_tokens: 16384
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.3
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8256
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 1
+      max_num_tokens: 256
+      max_seq_len: 9256
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:  # NOTE(review): lists 2 and 4 although max_batch_size is 1 - confirm intended
+        - 1
+        - 2
+        - 4
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8256
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+benchmark:  # sa-bench run: isl 8192 / osl 1024 at the listed concurrency
+  type: sa-bench
+  isl: 8192
+  osl: 1024
+  concurrencies: '6'
+  req_rate: inf
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
new file mode 100644
index 00000000..f3b27160
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
@@ -0,0 +1,104 @@
+
+
+name: "h100_8k1k_ctx1dep16_gen3tep16_batch2_eplb0_mtp0"
+
+model:
+  path: "DeepSeek-R1-0528"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: "fp8"
+
+resources:
+  gpu_type: "h100"
+  prefill_workers: 1
+  prefill_nodes: 2  # 1 worker x (TP16 / 8 GPUs per node)
+  decode_workers: 3
+  decode_nodes: 6  # 3 workers x (TP16 / 8 GPUs per node)
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    UCX_CUDA_IPC_ENABLE_MNNVL: "n"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: "1"
+    TRTLLM_FORCE_ALLTOALL_METHOD: "DeepEP"
+
+  decode_environment:
+    NCCL_NVLS_ENABLE: "0"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "n"
+
+  trtllm_config:
+    prefill:
+      max_batch_size: 1
+      max_num_tokens: 8224
+      max_seq_len: 8232
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null  # explicitly null for prefill; decode configures it below
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: false
+      moe_config:
+        backend: WIDEEP
+        max_num_tokens: 16384
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.3
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8256
+
+
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 2
+      max_num_tokens: 256
+      max_seq_len: 9256
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes: [1, 2, 4]  # NOTE(review): includes 4 although max_batch_size is 2 - confirm intended
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8256
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+
+
+benchmark:  # sa-bench run: isl 8192 / osl 1024 at the listed concurrency
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "9"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false  # Kept off: multiple frontends hit port 8080 collisions, among other errors.
+
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
new file mode 100644
index 00000000..336d9b7f
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
@@ -0,0 +1,104 @@
+
+
+name: "h100_8k1k_ctx1dep16_gen3tep16_batch8_eplb0_mtp0"
+
+model:
+  path: "DeepSeek-R1-0528"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: "fp8"
+
+resources:
+  gpu_type: "h100"
+  prefill_workers: 1
+  prefill_nodes: 2  # 1 worker x (TP16 / 8 GPUs per node)
+  decode_workers: 3
+  decode_nodes: 6  # 3 workers x (TP16 / 8 GPUs per node)
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    UCX_CUDA_IPC_ENABLE_MNNVL: "n"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: "1"
+    TRTLLM_FORCE_ALLTOALL_METHOD: "DeepEP"
+
+  decode_environment:
+    NCCL_NVLS_ENABLE: "0"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "n"
+
+  trtllm_config:
+    prefill:
+      max_batch_size: 1
+      max_num_tokens: 8224
+      max_seq_len: 8232
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null  # explicitly null for prefill; decode configures it below
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: true  # NOTE(review): the other 8k1k STP recipes added alongside this one set false - confirm intended
+      moe_config:
+        backend: WIDEEP
+        max_num_tokens: 16384
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.3
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8256
+
+
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 8
+      max_num_tokens: 256
+      max_seq_len: 9256
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes: [1, 2, 4, 8]
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8256
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+
+
+benchmark:  # sa-bench run: isl 8192 / osl 1024 at the listed concurrency
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "30"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false  # Kept off: multiple frontends hit port 8080 collisions, among other errors.
+
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml
new file mode 100644
index 00000000..0713169a
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml
@@ -0,0 +1,97 @@
+name: h100_8k1k_ctx2dep16_gen1dep16_batch16_eplb0_mtp0
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 2
+  prefill_nodes: 4  # 2 workers x (TP16 / 8 GPUs per node)
+  decode_workers: 1
+  decode_nodes: 2  # 1 worker x (TP16 / 8 GPUs per node)
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean under YAML 1.1
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+  decode_environment:
+    NCCL_NVLS_ENABLE: '0'
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean under YAML 1.1
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+  trtllm_config:
+    prefill:
+      max_batch_size: 1
+      max_num_tokens: 8224
+      max_seq_len: 8232
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null  # explicitly null for prefill; decode configures it below
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: false
+      moe_config:
+        backend: WIDEEP
+        max_num_tokens: 16384
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.3
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8256
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 16
+      max_num_tokens: 128
+      max_seq_len: 9256
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: WIDEEP
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8256
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+benchmark:  # sa-bench run: isl 8192 / osl 1024 at the listed concurrency
+  type: sa-bench
+  isl: 8192
+  osl: 1024
+  concurrencies: '308'
+  req_rate: inf
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false

From f715eb8552bd6823ec744b1cbb8685e54daedee5 Mon Sep 17 00:00:00 2001
From: Nathaniel Levin <nlevin@nvidia.com>
Date: Thu, 5 Feb 2026 22:56:35 +0000
Subject: [PATCH 3/3] Update H100 configs to tensorrtllm-runtime:0.8.1.post3

Update all 29 H100 FP8 config files to use the new container:
- nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3
---
 .../h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml   | 2 +-
 .../h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml   | 2 +-
 .../h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml    | 2 +-
 .../h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml  | 2 +-
 .../h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml   | 2 +-
 .../h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml    | 2 +-
 .../h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml    | 2 +-
 .../h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml   | 2 +-
 .../h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml    | 2 +-
 .../h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml   | 2 +-
 .../h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml   | 2 +-
 .../h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml    | 2 +-
 .../h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml    | 2 +-
 .../h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml   | 2 +-
 .../h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml    | 2 +-
 .../h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml    | 2 +-
 .../h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml    | 2 +-
 .../h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml  | 2 +-
 .../h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml    | 2 +-
 .../h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml   | 2 +-
 .../h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml    | 2 +-
 .../h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml    | 2 +-
 .../h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml    | 2 +-
 .../h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml    | 2 +-
 .../h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml   | 2 +-
 .../h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml    | 2 +-
 .../h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml    | 2 +-
 .../h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml    | 2 +-
 .../h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml   | 2 +-
 29 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml
index 4231b4b5..104f3b4a 100644
--- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml
+++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml
@@ -1,7 +1,7 @@
 name: h100_1k1k_ctx1dep16_gen1dep16_batch32_eplb0_mtp2_chunked_false
 model:
   path: DeepSeek-R1-0528
-  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
   precision: fp8
 resources:
   gpu_type: h100
diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml
index 33fd5257..4c41ec82 100644
--- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml
+++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml
@@ -1,7 +1,7 @@
 name: h100_1k1k_ctx1dep16_gen1dep16_batch64_eplb0_mtp1_chunked_false
 model:
   path: DeepSeek-R1-0528
-  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
   precision: fp8
 resources:
   gpu_type: h100
diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml
index 7a255e7a..c3dc1408 100644
--- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml
+++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml
@@ -1,7 +1,7 @@
 name: h100_1k1k_ctx1dep16_gen3dep16_batch4_eplb0_mtp3_chunked_false
 model:
   path: DeepSeek-R1-0528
-  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
   precision: fp8
 resources:
   gpu_type: h100
diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml
index 39433c99..8f3663c9 100644
--- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml
+++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml
@@ -1,7 +1,7 @@
 name: h100_1k1k_ctx1dep16_gen3tep16_batch128_eplb0_mtp3_chunked_false
 model:
   path: DeepSeek-R1-0528
-  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
   precision: fp8
 resources:
   gpu_type: h100
diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml
index d400cc19..bd77671a 100644
--- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml
+++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml
@@ -1,7 +1,7 @@
 name: h100_1k1k_ctx1dep16_gen3tep16_batch16_eplb0_mtp3_chunked_false
 model:
   path: DeepSeek-R1-0528
-  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
   precision: fp8
 resources:
   gpu_type: h100
diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
index b7f2a69e..c1fccbc9 100644
--- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
+++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
@@ -1,7 +1,7 @@
 name: h100_1k1k_ctx1dep16_gen3tep16_batch1_eplb0_mtp3_chunked_false
 model:
   path: DeepSeek-R1-0528
-  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
   precision: fp8
 resources:
   gpu_type: h100
diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
index e0d24147..15c71e8d 100644
--- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
+++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
@@ -1,7 +1,7 @@
 name: h100_1k1k_ctx1dep16_gen3tep16_batch2_eplb0_mtp3_chunked_false
 model:
   path: DeepSeek-R1-0528
-  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
   precision: fp8
 resources:
   gpu_type: h100
diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml
index 22084138..4f261058 100644
--- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml
+++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml
@@ -1,7 +1,7 @@
 name: h100_1k1k_ctx1dep16_gen3tep16_batch32_eplb0_mtp3_chunked_false
 model:
   path: DeepSeek-R1-0528
-  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
   precision: fp8
 resources:
   gpu_type: h100
diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
index 488329d7..07de7a34 100644
--- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
+++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
@@ -1,7 +1,7 @@
 name: h100_1k1k_ctx1dep16_gen3tep16_batch8_eplb0_mtp3_chunked_false
 model:
   path: DeepSeek-R1-0528
-  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
   precision: fp8
 resources:
   gpu_type: h100
diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml
index 2f2a57fd..4a55e5ed 100644
--- a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml
+++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml
@@ -1,7 +1,7 @@
 name: ctx1dep16_gen3dep16_batch16_eplb0_mtp0
 model:
   path: DeepSeek-R1-0528
-  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
   precision: fp8
 resources:
   gpu_type: h100
diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml
index 7ad2d283..2bedf4c2 100644
--- a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml
+++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml
@@ -1,7 +1,7 @@
 name: ctx1dep16_gen3dep16_batch32_eplb0_mtp0
 model:
   path: DeepSeek-R1-0528
-  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
   precision: fp8
 resources:
   gpu_type: h100
diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml
index 7f3fcd63..1ff9ace4 100644
--- a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml
+++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml
@@ -1,7 +1,7 @@
 name: ctx1dep16_gen3dep16_batch4_eplb0_mtp0
 model:
   path: DeepSeek-R1-0528
-  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
   precision: fp8
 resources:
   gpu_type: h100
diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml
index cd740057..215e8a6b 100644
--- a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml
+++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml
@@ -1,7 +1,7 @@
 name: ctx1dep16_gen3dep16_batch8_eplb0_mtp0
 model:
   path: DeepSeek-R1-0528
-  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
   precision: fp8
 resources:
   gpu_type: h100
diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml
index 4601d76b..4281abed 100644
--- a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml
+++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml
@@ -1,7 +1,7 @@
 name: ctx1dep16_gen3tep16_batch16_eplb0_mtp0
 model:
   path: DeepSeek-R1-0528
-  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
   precision: fp8
 resources:
   gpu_type: h100
diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
index a0510f6e..a0e0005e 100644
--- a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
+++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
@@ -1,7 +1,7 @@
 name: ctx1dep16_gen3tep16_batch1_eplb0_mtp0
 model:
   path: DeepSeek-R1-0528
-  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
   precision: fp8
 resources:
   gpu_type: h100
diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
index b46e49a8..6eee90d2 100644
--- a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
+++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
@@ -1,7 +1,7 @@
 name: ctx1dep16_gen3tep16_batch2_eplb0_mtp0
 model:
   path: DeepSeek-R1-0528
-  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
   precision: fp8
 resources:
   gpu_type: h100
diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
index d83994ab..29e63431 100644
--- a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
+++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
@@ -1,7 +1,7 @@
 name: ctx1dep16_gen3tep16_batch8_eplb0_mtp0
 model:
   path: DeepSeek-R1-0528
-  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
   precision: fp8
 resources:
   gpu_type: h100
diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml
index 134aa346..bb02cdd0 100644
--- a/recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml
+++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml
@@ -1,7 +1,7 @@
 name: ctx2dep16_gen1dep16_batch256_eplb0_mtp0
 model:
   path: DeepSeek-R1-0528
-  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
   precision: fp8
 resources:
   gpu_type: h100
diff --git a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml
index 48c1dec3..b78cb01a 100644
--- a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml
+++ b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml
@@ -1,7 +1,7 @@
 name: h100_8k1k_ctx1dep16_gen1dep16_batch4_eplb0_mtp3
 model:
   path: DeepSeek-R1-0528
-  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
   precision: fp8
 resources:
   gpu_type: h100
diff --git a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml
index d66ed765..dd0ddda8 100644
--- a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml
+++ b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml
@@ -1,7 +1,7 @@
 name: h100_8k1k_ctx1dep16_gen2tep16_batch32_eplb0_mtp3
 model:
   path: DeepSeek-R1-0528
-  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
   precision: fp8
 resources:
   gpu_type: h100
diff --git a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
index 67128064..2f0ef4e9 100644
--- a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
+++ b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
@@ -1,7 +1,7 @@
 name: h100_8k1k_ctx1dep16_gen3tep16_batch1_eplb0_mtp3
 model:
   path: DeepSeek-R1-0528
-  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
   precision: fp8
 resources:
   gpu_type: h100
diff --git a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
index 8c18bbfc..be3fc74c 100644
--- a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
+++ b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
@@ -1,7 +1,7 @@
 name: h100_8k1k_ctx1dep16_gen3tep16_batch2_eplb0_mtp3
 model:
   path: DeepSeek-R1-0528
-  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
   precision: fp8
 resources:
   gpu_type: h100
diff --git a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
index b1f5f926..6a710bbb 100644
--- a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
+++ b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
@@ -1,7 +1,7 @@
 name: h100_8k1k_ctx1dep16_gen3tep16_batch8_eplb0_mtp3
 model:
   path: DeepSeek-R1-0528
-  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
   precision: fp8
 resources:
   gpu_type: h100
diff --git a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml
index 19419bec..4d746af1 100644
--- a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml
+++ b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml
@@ -1,7 +1,7 @@
 name: h100_8k1k_ctx2dep16_gen1dep16_batch8_eplb0_mtp3
 model:
   path: DeepSeek-R1-0528
-  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
   precision: fp8
 resources:
   gpu_type: h100
diff --git a/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml
index 21d6db8c..2f630277 100644
--- a/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml
+++ b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml
@@ -4,7 +4,7 @@ name: "h100_8k1k_ctx1dep16_gen2tep16_batch64_eplb0_mtp0"
 
 model:
   path: "DeepSeek-R1-0528"
-  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
   precision: "fp8"
 
 resources:
diff --git a/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
index 0b2579c3..9081201b 100644
--- a/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
+++ b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
@@ -1,7 +1,7 @@
 name: h100_8k1k_ctx1dep16_gen3tep16_batch1_eplb0_mtp0
 model:
   path: DeepSeek-R1-0528
-  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
   precision: fp8
 resources:
   gpu_type: h100
diff --git a/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
index f3b27160..938fd965 100644
--- a/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
+++ b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
@@ -4,7 +4,7 @@ name: "h100_8k1k_ctx1dep16_gen3tep16_batch2_eplb0_mtp0"
 
 model:
   path: "DeepSeek-R1-0528"
-  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
   precision: "fp8"
 
 resources:
diff --git a/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
index 336d9b7f..c1eb86c1 100644
--- a/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
+++ b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
@@ -4,7 +4,7 @@ name: "h100_8k1k_ctx1dep16_gen3tep16_batch8_eplb0_mtp0"
 
 model:
   path: "DeepSeek-R1-0528"
-  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
   precision: "fp8"
 
 resources:
diff --git a/recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml
index 0713169a..40c84770 100644
--- a/recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml
+++ b/recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml
@@ -1,7 +1,7 @@
 name: h100_8k1k_ctx2dep16_gen1dep16_batch16_eplb0_mtp0
 model:
   path: DeepSeek-R1-0528
-  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
   precision: fp8
 resources:
   gpu_type: h100