diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml
new file mode 100644
index 00000000..104f3b4a
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml
@@ -0,0 +1,105 @@
+name: h100_1k1k_ctx1dep16_gen1dep16_batch32_eplb0_mtp2_chunked_false
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2
+  decode_workers: 1
+  decode_nodes: 2
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n can load as boolean false under YAML 1.1
+  decode_environment:
+    UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n can load as boolean false under YAML 1.1
+  trtllm_config:
+    prefill:
+      max_batch_size: 2
+      max_num_tokens: 2048
+      max_seq_len: 2048
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: false
+      moe_config:
+        backend: WIDEEP
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 2
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      pipeline_parallel_size: 1
+      max_batch_size: 32
+      max_num_tokens: 256
+      max_seq_len: 2088
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+        - 24
+        - 32
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: WIDEEP
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 2
+benchmark:
+  type: sa-bench
+  isl: 1024
+  osl: 1024
+  concurrencies: '615'
+  req_rate: inf
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml
new file mode 100644
index 00000000..4c41ec82
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml
@@ -0,0 +1,109 @@
+name: h100_1k1k_ctx1dep16_gen1dep16_batch64_eplb0_mtp1_chunked_false
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2
+  decode_workers: 1
+  decode_nodes: 2
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n can load as boolean false under YAML 1.1
+  decode_environment:
+    UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n can load as boolean false under YAML 1.1
+  trtllm_config:
+    prefill:
+      max_batch_size: 2
+      max_num_tokens: 2048
+      max_seq_len: 2048
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: false
+      moe_config:
+        backend: WIDEEP
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 1
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      pipeline_parallel_size: 1
+      max_batch_size: 64
+      max_num_tokens: 256
+      max_seq_len: 2088
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+        - 24
+        - 32
+        - 40
+        - 48
+        - 56
+        - 64
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: WIDEEP
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 1
+benchmark:
+  type: sa-bench
+  isl: 1024
+  osl: 1024
+  concurrencies: '1229'
+  req_rate: inf
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml
new file mode 100644
index 00000000..c3dc1408
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml
@@ -0,0 +1,101 @@
+name: h100_1k1k_ctx1dep16_gen3dep16_batch4_eplb0_mtp3_chunked_false
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2
+  decode_workers: 3
+  decode_nodes: 6
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n can load as boolean false under YAML 1.1
+  decode_environment:
+    UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n can load as boolean false under YAML 1.1
+  trtllm_config:
+    prefill:
+      max_batch_size: 2
+      max_num_tokens: 2048
+      max_seq_len: 2048
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: false
+      moe_config:
+        backend: WIDEEP
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      pipeline_parallel_size: 1
+      max_batch_size: 4
+      max_num_tokens: 256
+      max_seq_len: 2088
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+        - 2
+        - 4
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: WIDEEP
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+benchmark:
+  type: sa-bench
+  isl: 1024
+  osl: 1024
+  concurrencies: '231'
+  req_rate: inf
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml
new file mode 100644
index 00000000..8f3663c9
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml
@@ -0,0 +1,114 @@
+name: h100_1k1k_ctx1dep16_gen3tep16_batch128_eplb0_mtp3_chunked_false
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2
+  decode_workers: 3
+  decode_nodes: 6
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n can load as boolean false under YAML 1.1
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+  decode_environment:
+    NCCL_NVLS_ENABLE: '0'
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n can load as boolean false under YAML 1.1
+  trtllm_config:
+    prefill:
+      max_batch_size: 2
+      max_num_tokens: 2048
+      max_seq_len: 2048
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: false
+      moe_config:
+        backend: WIDEEP
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 128
+      max_num_tokens: 512
+      max_seq_len: 2088
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+        - 24
+        - 32
+        - 40
+        - 48
+        - 56
+        - 64
+        - 72
+        - 80
+        - 88
+        - 96
+        - 104
+        - 112
+        - 120
+        - 128
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+benchmark:
+  type: sa-bench
+  isl: 1024
+  osl: 1024
+  concurrencies: '462'
+  req_rate: inf
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml
similarity index 63%
rename from recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp3.yaml
rename to recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml
index b0ef1feb..bd77671a 100644
--- a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp3.yaml
+++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml
@@ -1,43 +1,34 @@
-name: ctx1_gen2_dep16_batch16_eplb0_mtp3
-
+name: h100_1k1k_ctx1dep16_gen3tep16_batch16_eplb0_mtp3_chunked_false
 model:
-  path: "dsr1-fp8"
-  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
-  precision: "fp8"
-
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
+  precision: fp8
 resources:
-  gpu_type: "h100"
-  prefill_nodes: 2
+  gpu_type: h100
   prefill_workers: 1
-
-  decode_workers: 2
-  decode_nodes: 4
-
+  prefill_nodes: 2
+  decode_workers: 3
+  decode_nodes: 6
   gpus_per_node: 8
-
 backend:
   type: trtllm
-
   prefill_environment:
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TRTLLM_ENABLE_PDL: "1"
-    UCX_RNDV_SCHEME: "put_zcopy"
-    UCX_MAX_RNDV_RAILS: "1"
-    UCX_MAX_RMA_RAILS: "1"
-
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n can load as boolean false under YAML 1.1
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
   decode_environment:
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TRTLLM_ENABLE_PDL: "1"
-    UCX_RNDV_SCHEME: "put_zcopy"
-    UCX_MAX_RNDV_RAILS: "1"
-    UCX_MAX_RMA_RAILS: "1"
-
+    NCCL_NVLS_ENABLE: '0'
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n can load as boolean false under YAML 1.1
   trtllm_config:
     prefill:
       max_batch_size: 2
@@ -50,7 +41,7 @@ backend:
       print_iter_log: true
       cuda_graph_config: null
       disable_overlap_scheduler: true
-      enable_chunked_prefill: true
+      enable_chunked_prefill: false
       moe_config:
         backend: WIDEEP
       kv_cache_config:
@@ -63,12 +54,11 @@ backend:
       speculative_config:
         decoding_type: MTP
         num_nextn_predict_layers: 3
-
     decode:
       tensor_parallel_size: 16
       moe_expert_parallel_size: 16
-      enable_attention_dp: true
-      enable_lm_head_tp_in_adp: true
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
       pipeline_parallel_size: 1
       max_batch_size: 16
       max_num_tokens: 256
@@ -87,7 +77,7 @@ backend:
         free_gpu_memory_fraction: 0.9
         dtype: fp8
       moe_config:
-        backend: WIDEEP
+        backend: CUTLASS
         use_low_precision_moe_combine: true
       cache_transceiver_config:
         max_tokens_in_buffer: 8192
@@ -97,23 +87,14 @@ backend:
       speculative_config:
         decoding_type: MTP
         num_nextn_predict_layers: 3
-
-
 benchmark:
-  type: "sa-bench"
+  type: sa-bench
   isl: 1024
   osl: 1024
-  concurrencies: ['616']
-  req_rate: "inf"
-
+  concurrencies: '60'
+  req_rate: inf
 frontend:
-  nginx_container: "nginx-sqsh"
-  type: "dynamo"
-
-
-health_check:
-  max_attempts: 360
-  interval_seconds: 10
-
+  type: dynamo
+  enable_multiple_frontends: false
 dynamo:
-  install: false
\ No newline at end of file
+  install: false
diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
new file mode 100644
index 00000000..c1fccbc9
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
@@ -0,0 +1,98 @@
+name: h100_1k1k_ctx1dep16_gen3tep16_batch1_eplb0_mtp3_chunked_false
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2
+  decode_workers: 3
+  decode_nodes: 6
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n can load as boolean false under YAML 1.1
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+  decode_environment:
+    NCCL_NVLS_ENABLE: '0'
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n can load as boolean false under YAML 1.1
+  trtllm_config:
+    prefill:
+      max_batch_size: 2
+      max_num_tokens: 2048
+      max_seq_len: 2048
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: false
+      moe_config:
+        backend: WIDEEP
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 1
+      max_num_tokens: 256
+      max_seq_len: 2088
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+        - 2  # NOTE(review): larger than max_batch_size: 1 set above; confirm intended
+        - 4  # NOTE(review): larger than max_batch_size: 1 set above; confirm intended
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+benchmark:
+  type: sa-bench
+  isl: 1024
+  osl: 1024
+  concurrencies: '6'
+  req_rate: inf
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
new file mode 100644
index 00000000..15c71e8d
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
@@ -0,0 +1,98 @@
+name: h100_1k1k_ctx1dep16_gen3tep16_batch2_eplb0_mtp3_chunked_false
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2
+  decode_workers: 3
+  decode_nodes: 6
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n can load as boolean false under YAML 1.1
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+  decode_environment:
+    NCCL_NVLS_ENABLE: '0'
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n can load as boolean false under YAML 1.1
+  trtllm_config:
+    prefill:
+      max_batch_size: 2
+      max_num_tokens: 2048
+      max_seq_len: 2048
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: false
+      moe_config:
+        backend: WIDEEP
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 2
+      max_num_tokens: 256
+      max_seq_len: 2088
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+        - 2
+        - 4  # NOTE(review): larger than max_batch_size: 2 set above; confirm intended
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+benchmark:
+  type: sa-bench
+  isl: 1024
+  osl: 1024
+  concurrencies: '9'
+  req_rate: inf
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml
new file mode 100644
index 00000000..4f261058
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml
@@ -0,0 +1,102 @@
+name: h100_1k1k_ctx1dep16_gen3tep16_batch32_eplb0_mtp3_chunked_false
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2
+  decode_workers: 3
+  decode_nodes: 6
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n can load as boolean false under YAML 1.1
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+  decode_environment:
+    NCCL_NVLS_ENABLE: '0'
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n can load as boolean false under YAML 1.1
+  trtllm_config:
+    prefill:
+      max_batch_size: 2
+      max_num_tokens: 2048
+      max_seq_len: 2048
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: false
+      moe_config:
+        backend: WIDEEP
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 32
+      max_num_tokens: 256
+      max_seq_len: 2088
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+        - 24
+        - 32
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+benchmark:
+  type: sa-bench
+  isl: 1024
+  osl: 1024
+  concurrencies: '117'
+  req_rate: inf
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
new file mode 100644
index 00000000..07de7a34
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
@@ -0,0 +1,99 @@
+name: h100_1k1k_ctx1dep16_gen3tep16_batch8_eplb0_mtp3_chunked_false
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2
+  decode_workers: 3
+  decode_nodes: 6
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n can load as boolean false under YAML 1.1
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+  decode_environment:
+    NCCL_NVLS_ENABLE: '0'
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n can load as boolean false under YAML 1.1
+  trtllm_config:
+    prefill:
+      max_batch_size: 2
+      max_num_tokens: 2048
+      max_seq_len: 2048
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: false
+      moe_config:
+        backend: WIDEEP
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 8
+      max_num_tokens: 256
+      max_seq_len: 2088
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+        - 2
+        - 4
+        - 8
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+benchmark:
+  type: sa-bench
+  isl: 1024
+  osl: 1024
+  concurrencies: '30'
+  req_rate: inf
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml
new file mode 100644
index 00000000..4a55e5ed
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml
@@ -0,0 +1,97 @@
+name: ctx1dep16_gen3dep16_batch16_eplb0_mtp0
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2
+  decode_workers: 3
+  decode_nodes: 6
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n can load as boolean false under YAML 1.1
+  decode_environment:
+    UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n can load as boolean false under YAML 1.1
+  trtllm_config:
+    prefill:
+      max_batch_size: 2
+      max_num_tokens: 2048
+      max_seq_len: 2048
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: true  # NOTE(review): the other recipes visible in this diff set false; confirm true is intended
+      moe_config:
+        backend: WIDEEP
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 16
+      max_num_tokens: 256
+      max_seq_len: 2088
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: WIDEEP
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+benchmark:
+  type: sa-bench
+  isl: 1024
+  osl: 1024
+  concurrencies: '924'
+  req_rate: inf
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml
new file mode 100644
index 00000000..2bedf4c2
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml
@@ -0,0 +1,99 @@
+name: ctx1dep16_gen3dep16_batch32_eplb0_mtp0  # 1k/1k sa-bench recipe: 1 prefill + 3 decode workers, H100 FP8
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2  # 2 nodes x 8 GPUs = 1 worker x tensor_parallel_size 16
+  decode_workers: 3
+  decode_nodes: 6  # 6 nodes x 8 GPUs = 3 workers x tensor_parallel_size 16
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean in YAML 1.1
+  decode_environment:
+    UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean in YAML 1.1
+  trtllm_config:
+    prefill:
+      max_batch_size: 2
+      max_num_tokens: 2048
+      max_seq_len: 2048
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null  # explicit null: no CUDA graph settings on prefill
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: false
+      moe_config:
+        backend: WIDEEP
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 32
+      max_num_tokens: 256
+      max_seq_len: 2088  # isl 1024 + osl 1024 = 2048, plus 40 tokens of slack
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+        - 24
+        - 32  # largest captured size equals max_batch_size
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: WIDEEP
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+benchmark:
+  type: sa-bench
+  isl: 1024
+  osl: 1024
+  concurrencies: '1845'
+  req_rate: inf  # plain scalar: loads as the string "inf", not a float (.inf)
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml
new file mode 100644
index 00000000..1ff9ace4
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml
@@ -0,0 +1,95 @@
+name: ctx1dep16_gen3dep16_batch4_eplb0_mtp0  # 1k/1k sa-bench recipe: 1 prefill + 3 decode workers, H100 FP8
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2  # 2 nodes x 8 GPUs = 1 worker x tensor_parallel_size 16
+  decode_workers: 3
+  decode_nodes: 6  # 6 nodes x 8 GPUs = 3 workers x tensor_parallel_size 16
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean in YAML 1.1
+  decode_environment:
+    UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean in YAML 1.1
+  trtllm_config:
+    prefill:
+      max_batch_size: 2
+      max_num_tokens: 2048
+      max_seq_len: 2048
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null  # explicit null: no CUDA graph settings on prefill
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: false
+      moe_config:
+        backend: WIDEEP
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 4
+      max_num_tokens: 256
+      max_seq_len: 2088  # isl 1024 + osl 1024 = 2048, plus 40 tokens of slack
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+        - 2
+        - 4  # largest captured size equals max_batch_size
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: WIDEEP
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+benchmark:
+  type: sa-bench
+  isl: 1024
+  osl: 1024
+  concurrencies: '231'
+  req_rate: inf  # plain scalar: loads as the string "inf", not a float (.inf)
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml
new file mode 100644
index 00000000..215e8a6b
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml
@@ -0,0 +1,96 @@
+name: ctx1dep16_gen3dep16_batch8_eplb0_mtp0  # 1k/1k sa-bench recipe: 1 prefill + 3 decode workers, H100 FP8
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2  # 2 nodes x 8 GPUs = 1 worker x tensor_parallel_size 16
+  decode_workers: 3
+  decode_nodes: 6  # 6 nodes x 8 GPUs = 3 workers x tensor_parallel_size 16
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean in YAML 1.1
+  decode_environment:
+    UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean in YAML 1.1
+  trtllm_config:
+    prefill:
+      max_batch_size: 2
+      max_num_tokens: 2048
+      max_seq_len: 2048
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null  # explicit null: no CUDA graph settings on prefill
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: false
+      moe_config:
+        backend: WIDEEP
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 8
+      max_num_tokens: 256
+      max_seq_len: 2088  # isl 1024 + osl 1024 = 2048, plus 40 tokens of slack
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+        - 2
+        - 4
+        - 8  # largest captured size equals max_batch_size
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: WIDEEP
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+benchmark:
+  type: sa-bench
+  isl: 1024
+  osl: 1024
+  concurrencies: '462'
+  req_rate: inf  # plain scalar: loads as the string "inf", not a float (.inf)
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml
new file mode 100644
index 00000000..4281abed
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml
@@ -0,0 +1,94 @@
+name: ctx1dep16_gen3tep16_batch16_eplb0_mtp0  # 1k/1k sa-bench recipe: 1 prefill + 3 decode workers, H100 FP8
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2  # 2 nodes x 8 GPUs = 1 worker x tensor_parallel_size 16
+  decode_workers: 3
+  decode_nodes: 6  # 6 nodes x 8 GPUs = 3 workers x tensor_parallel_size 16
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean in YAML 1.1
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+  decode_environment:
+    NCCL_NVLS_ENABLE: '0'
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean in YAML 1.1
+  trtllm_config:
+    prefill:
+      max_batch_size: 2
+      max_num_tokens: 2048
+      max_seq_len: 2048
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null  # explicit null: no CUDA graph settings on prefill
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: false
+      moe_config:
+        backend: WIDEEP
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: false  # decode turns attention DP off; prefill above keeps it on
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 16
+      max_num_tokens: 256
+      max_seq_len: 2088  # isl 1024 + osl 1024 = 2048, plus 40 tokens of slack
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16  # largest captured size equals max_batch_size
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+benchmark:
+  type: sa-bench
+  isl: 1024
+  osl: 1024
+  concurrencies: '60'
+  req_rate: inf  # plain scalar: loads as the string "inf", not a float (.inf)
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
new file mode 100644
index 00000000..a0e0005e
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
@@ -0,0 +1,92 @@
+name: ctx1dep16_gen3tep16_batch1_eplb0_mtp0  # 1k/1k sa-bench recipe: 1 prefill + 3 decode workers, H100 FP8
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2  # 2 nodes x 8 GPUs = 1 worker x tensor_parallel_size 16
+  decode_workers: 3
+  decode_nodes: 6  # 6 nodes x 8 GPUs = 3 workers x tensor_parallel_size 16
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean in YAML 1.1
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+  decode_environment:
+    NCCL_NVLS_ENABLE: '0'
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean in YAML 1.1
+  trtllm_config:
+    prefill:
+      max_batch_size: 2
+      max_num_tokens: 2048
+      max_seq_len: 2048
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null  # explicit null: no CUDA graph settings on prefill
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: true
+      moe_config:
+        backend: WIDEEP
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: false  # decode turns attention DP off; prefill above keeps it on
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 1
+      max_num_tokens: 256
+      max_seq_len: 2088  # isl 1024 + osl 1024 = 2048, plus 40 tokens of slack
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+        - 2  # NOTE(review): larger than max_batch_size 1 - confirm this entry is intended
+        - 4  # NOTE(review): larger than max_batch_size 1 - confirm this entry is intended
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+benchmark:
+  type: sa-bench
+  isl: 1024
+  osl: 1024
+  concurrencies: '6'
+  req_rate: inf  # plain scalar: loads as the string "inf", not a float (.inf)
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
new file mode 100644
index 00000000..6eee90d2
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
@@ -0,0 +1,92 @@
+name: ctx1dep16_gen3tep16_batch2_eplb0_mtp0  # 1k/1k sa-bench recipe: 1 prefill + 3 decode workers, H100 FP8
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2  # 2 nodes x 8 GPUs = 1 worker x tensor_parallel_size 16
+  decode_workers: 3
+  decode_nodes: 6  # 6 nodes x 8 GPUs = 3 workers x tensor_parallel_size 16
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean in YAML 1.1
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+  decode_environment:
+    NCCL_NVLS_ENABLE: '0'
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean in YAML 1.1
+  trtllm_config:
+    prefill:
+      max_batch_size: 2
+      max_num_tokens: 2048
+      max_seq_len: 2048
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null  # explicit null: no CUDA graph settings on prefill
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: true
+      moe_config:
+        backend: WIDEEP
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: false  # decode turns attention DP off; prefill above keeps it on
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 2
+      max_num_tokens: 256
+      max_seq_len: 2088  # isl 1024 + osl 1024 = 2048, plus 40 tokens of slack
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+        - 2
+        - 4  # NOTE(review): larger than max_batch_size 2 - confirm this entry is intended
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+benchmark:
+  type: sa-bench
+  isl: 1024
+  osl: 1024
+  concurrencies: '9'
+  req_rate: inf  # plain scalar: loads as the string "inf", not a float (.inf)
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
index 5b85a6ff..29e63431 100644
--- a/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
+++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
@@ -1,43 +1,34 @@
-name: ctx1_gen3_tep16_batch8_eplb0_mtp0
-
+name: ctx1dep16_gen3tep16_batch8_eplb0_mtp0  # 1k/1k sa-bench recipe: 1 prefill + 3 decode workers, H100 FP8
 model:
-  path: "dsr1-fp8"
-  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
-  precision: "fp8"
-
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
+  precision: fp8
 resources:
-  gpu_type: "h100"
-  prefill_nodes: 2
+  gpu_type: h100
   prefill_workers: 1
-
-  decode_workers: 2
+  prefill_nodes: 2
+  decode_workers: 3  # 3 workers x tensor_parallel_size 16 = decode_nodes 6 x gpus_per_node 8
   decode_nodes: 6
-
   gpus_per_node: 8
-
 backend:
   type: trtllm
-
   prefill_environment:
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TRTLLM_ENABLE_PDL: "1"
-    UCX_RNDV_SCHEME: "put_zcopy"
-    UCX_MAX_RNDV_RAILS: "1"
-    UCX_MAX_RMA_RAILS: "1"
-
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean in YAML 1.1
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
   decode_environment:
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TRTLLM_ENABLE_PDL: "1"
-    UCX_RNDV_SCHEME: "put_zcopy"
-    UCX_MAX_RNDV_RAILS: "1"
-    UCX_MAX_RMA_RAILS: "1"
-
+    NCCL_NVLS_ENABLE: '0'
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean in YAML 1.1
   trtllm_config:
     prefill:
       max_batch_size: 2
@@ -60,7 +51,6 @@ backend:
       cache_transceiver_config:
         max_tokens_in_buffer: 8192
         backend: UCX
-
     decode:
       tensor_parallel_size: 16
       moe_expert_parallel_size: 16
@@ -90,23 +80,14 @@ backend:
         backend: UCX
       stream_interval: 100
       num_postprocess_workers: 4
-
-
 benchmark:
-  type: "sa-bench"
+  type: sa-bench
   isl: 1024
   osl: 1024
-  concurrencies: ['30']
-  req_rate: "inf"
-
+  concurrencies: '30'
+  req_rate: inf  # plain scalar: loads as the string "inf", not a float (.inf)
 frontend:
-  nginx_container: "nginx-sqsh"
-  type: "dynamo"
-
-
-health_check:
-  max_attempts: 360
-  interval_seconds: 10
-
+  type: dynamo
+  enable_multiple_frontends: false
 dynamo:
-  install: false
\ No newline at end of file
+  install: false
diff --git a/recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml
new file mode 100644
index 00000000..bb02cdd0
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml
@@ -0,0 +1,127 @@
+name: ctx2dep16_gen1dep16_batch256_eplb0_mtp0  # 1k/1k sa-bench recipe: 2 prefill workers + 1 decode worker, H100 FP8
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 2
+  prefill_nodes: 4  # 4 nodes x 8 GPUs = 2 workers x tensor_parallel_size 16
+  decode_workers: 1
+  decode_nodes: 2  # 2 nodes x 8 GPUs = 1 worker x tensor_parallel_size 16
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean in YAML 1.1
+  decode_environment:
+    UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean in YAML 1.1
+  trtllm_config:
+    prefill:
+      max_batch_size: 2
+      max_num_tokens: 2048
+      max_seq_len: 2048
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null  # explicit null: no CUDA graph settings on prefill
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: true
+      moe_config:
+        backend: WIDEEP
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 256
+      max_num_tokens: 256
+      max_seq_len: 2088  # isl 1024 + osl 1024 = 2048, plus 40 tokens of slack
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+        - 24
+        - 32
+        - 40
+        - 48
+        - 56
+        - 64
+        - 72
+        - 80
+        - 88
+        - 96
+        - 104
+        - 112
+        - 120
+        - 128
+        - 136
+        - 144
+        - 152
+        - 160
+        - 168
+        - 176
+        - 184
+        - 192
+        - 200
+        - 208
+        - 216
+        - 224
+        - 232
+        - 240
+        - 248
+        - 256  # largest captured size equals max_batch_size
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: WIDEEP
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8192
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+benchmark:
+  type: sa-bench
+  isl: 1024
+  osl: 1024
+  concurrencies: '4916'
+  req_rate: inf  # plain scalar: loads as the string "inf", not a float (.inf)
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml
similarity index 67%
rename from recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp3.yaml
rename to recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml
index 507d8f72..b78cb01a 100644
--- a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_dep16_batch16_eplb0_mtp3.yaml
+++ b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml
@@ -1,43 +1,36 @@
-name: ctx1_gen2_dep16_batch16_eplb0_mtp3
-
+name: h100_8k1k_ctx1dep16_gen1dep16_batch4_eplb0_mtp3  # 8k/1k sa-bench recipe: 1 prefill + 1 decode worker, MTP with 3 next-n layers
 model:
-  path: "dsr1-fp8"
-  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
-  precision: "fp8"
-
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
+  precision: fp8
 resources:
-  gpu_type: "h100"
-  prefill_nodes: 2
+  gpu_type: h100
   prefill_workers: 1
-
-  decode_nodes: 2
+  prefill_nodes: 2  # 2 nodes x 8 GPUs = 1 worker x tensor_parallel_size 16
   decode_workers: 1
-
+  decode_nodes: 2  # 2 nodes x 8 GPUs = 1 worker x tensor_parallel_size 16
   gpus_per_node: 8
-
 backend:
   type: trtllm
-
   prefill_environment:
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TRTLLM_ENABLE_PDL: "1"
-    UCX_RNDV_SCHEME: "put_zcopy"
-    UCX_MAX_RNDV_RAILS: "1"
-    UCX_MAX_RMA_RAILS: "1"
-
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean in YAML 1.1
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
   decode_environment:
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TRTLLM_ENABLE_PDL: "1"
-    UCX_RNDV_SCHEME: "put_zcopy"
-    UCX_MAX_RNDV_RAILS: "1"
-    UCX_MAX_RMA_RAILS: "1"
-
+    NCCL_NVLS_ENABLE: '0'
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean in YAML 1.1
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
   trtllm_config:
     prefill:
       max_batch_size: 1
@@ -50,6 +43,7 @@ backend:
       print_iter_log: true
       cuda_graph_config: null
       disable_overlap_scheduler: true
+      enable_chunked_prefill: true
       moe_config:
         backend: WIDEEP
         max_num_tokens: 16384
@@ -58,12 +52,11 @@ backend:
         free_gpu_memory_fraction: 0.3
         dtype: fp8
       cache_transceiver_config:
-        max_tokens_in_buffer: 8256
         backend: UCX
+        max_tokens_in_buffer: 8256
       speculative_config:
         decoding_type: MTP
         num_nextn_predict_layers: 3
-
     decode:
       tensor_parallel_size: 16
       moe_expert_parallel_size: 16
@@ -95,23 +88,14 @@ backend:
       speculative_config:
         decoding_type: MTP
         num_nextn_predict_layers: 3
-
-
 benchmark:
-  type: "sa-bench"
+  type: sa-bench
   isl: 8192
   osl: 1024
-  concurrencies: ['77']
-  req_rate: "inf"
-
+  concurrencies: '77'
+  req_rate: inf  # plain scalar: loads as the string "inf", not a float (.inf)
 frontend:
-  nginx_container: "nginx-sqsh"
-  type: "dynamo"
-
-
-health_check:
-  max_attempts: 360
-  interval_seconds: 10
-
+  type: dynamo
+  enable_multiple_frontends: false
 dynamo:
-  install: false
\ No newline at end of file
+  install: false
diff --git a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml
new file mode 100644
index 00000000..dd0ddda8
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml
@@ -0,0 +1,103 @@
+name: h100_8k1k_ctx1dep16_gen2tep16_batch32_eplb0_mtp3  # 8k/1k sa-bench recipe: 1 prefill + 2 decode workers, MTP with 3 next-n layers
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2  # 2 nodes x 8 GPUs = 1 worker x tensor_parallel_size 16
+  decode_workers: 2
+  decode_nodes: 4  # 4 nodes x 8 GPUs = 2 workers x tensor_parallel_size 16
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean in YAML 1.1
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+  decode_environment:
+    NCCL_NVLS_ENABLE: '0'
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean in YAML 1.1
+  trtllm_config:
+    prefill:
+      max_batch_size: 1
+      max_num_tokens: 8224  # isl 8192 plus 32 tokens of slack
+      max_seq_len: 8232  # isl 8192 plus 40 tokens of slack
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null  # explicit null: no CUDA graph settings on prefill
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: false
+      moe_config:
+        backend: WIDEEP
+        max_num_tokens: 16384
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.3
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8256
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: false  # decode turns attention DP off; prefill above keeps it on
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 32
+      max_num_tokens: 256
+      max_seq_len: 9256  # isl 8192 + osl 1024 = 9216, plus 40 tokens of slack
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+        - 24
+        - 32  # largest captured size equals max_batch_size
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8256
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+benchmark:
+  type: sa-bench
+  isl: 8192
+  osl: 1024
+  concurrencies: '78'
+  req_rate: inf  # plain scalar: loads as the string "inf", not a float (.inf)
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
new file mode 100644
index 00000000..2f0ef4e9
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
@@ -0,0 +1,99 @@
+name: h100_8k1k_ctx1dep16_gen3tep16_batch1_eplb0_mtp3  # 8k/1k sa-bench recipe: 1 prefill + 3 decode workers, MTP with 3 next-n layers
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2  # 2 nodes x 8 GPUs = 1 worker x tensor_parallel_size 16
+  decode_workers: 3
+  decode_nodes: 6  # 6 nodes x 8 GPUs = 3 workers x tensor_parallel_size 16
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean in YAML 1.1
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+  decode_environment:
+    NCCL_NVLS_ENABLE: '0'
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean in YAML 1.1
+  trtllm_config:
+    prefill:
+      max_batch_size: 1
+      max_num_tokens: 8224  # isl 8192 plus 32 tokens of slack
+      max_seq_len: 8232  # isl 8192 plus 40 tokens of slack
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null  # explicit null: no CUDA graph settings on prefill
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: true
+      moe_config:
+        backend: WIDEEP
+        max_num_tokens: 16384
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.3
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8256
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: false  # decode turns attention DP off; prefill above keeps it on
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 1
+      max_num_tokens: 256
+      max_seq_len: 9256  # isl 8192 + osl 1024 = 9216, plus 40 tokens of slack
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+        - 2  # NOTE(review): larger than max_batch_size 1 - confirm this entry is intended
+        - 4  # NOTE(review): larger than max_batch_size 1 - confirm this entry is intended
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8256
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+benchmark:
+  type: sa-bench
+  isl: 8192
+  osl: 1024
+  concurrencies: '6'
+  req_rate: inf  # plain scalar: loads as the string "inf", not a float (.inf)
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
new file mode 100644
index 00000000..be3fc74c
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
@@ -0,0 +1,99 @@
+name: h100_8k1k_ctx1dep16_gen3tep16_batch2_eplb0_mtp3
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2  # 2 nodes x 8 GPUs/node = 16 GPUs = 1 worker x tensor_parallel_size 16
+  decode_workers: 3
+  decode_nodes: 6  # 6 nodes x 8 GPUs/node = 48 GPUs = 3 workers x tensor_parallel_size 16
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean in YAML 1.1
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+  decode_environment:
+    NCCL_NVLS_ENABLE: '0'
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean in YAML 1.1
+  trtllm_config:
+    prefill:
+      max_batch_size: 1
+      max_num_tokens: 8224
+      max_seq_len: 8232  # >= benchmark isl (8192)
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: true
+      moe_config:
+        backend: WIDEEP
+        max_num_tokens: 16384
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.3
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8256
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 2
+      max_num_tokens: 256
+      max_seq_len: 9256  # >= benchmark isl + osl (8192 + 1024 = 9216)
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:  # NOTE(review): 4 exceeds max_batch_size: 2; confirm intended
+        - 1
+        - 2
+        - 4
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8256
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+benchmark:
+  type: sa-bench
+  isl: 8192
+  osl: 1024
+  concurrencies: '9'
+  req_rate: inf
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
new file mode 100644
index 00000000..6a710bbb
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
@@ -0,0 +1,100 @@
+name: h100_8k1k_ctx1dep16_gen3tep16_batch8_eplb0_mtp3
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2  # 2 nodes x 8 GPUs/node = 16 GPUs = 1 worker x tensor_parallel_size 16
+  decode_workers: 3
+  decode_nodes: 6  # 6 nodes x 8 GPUs/node = 48 GPUs = 3 workers x tensor_parallel_size 16
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean in YAML 1.1
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+  decode_environment:
+    NCCL_NVLS_ENABLE: '0'
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean in YAML 1.1
+  trtllm_config:
+    prefill:
+      max_batch_size: 1
+      max_num_tokens: 8224
+      max_seq_len: 8232  # >= benchmark isl (8192)
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: false
+      moe_config:
+        backend: WIDEEP
+        max_num_tokens: 16384
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.3
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8256
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 8
+      max_num_tokens: 256
+      max_seq_len: 9256  # >= benchmark isl + osl (8192 + 1024 = 9216)
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:  # largest entry equals max_batch_size: 8
+        - 1
+        - 2
+        - 4
+        - 8
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8256
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+benchmark:
+  type: sa-bench
+  isl: 8192
+  osl: 1024
+  concurrencies: '30'
+  req_rate: inf
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml
new file mode 100644
index 00000000..4d746af1
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml
@@ -0,0 +1,102 @@
+name: h100_8k1k_ctx2dep16_gen1dep16_batch8_eplb0_mtp3
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 2
+  prefill_nodes: 4  # 4 nodes x 8 GPUs/node = 32 GPUs = 2 workers x tensor_parallel_size 16
+  decode_workers: 1
+  decode_nodes: 2  # 2 nodes x 8 GPUs/node = 16 GPUs = 1 worker x tensor_parallel_size 16
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean in YAML 1.1
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+  decode_environment:
+    NCCL_NVLS_ENABLE: '0'
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean in YAML 1.1
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+  trtllm_config:
+    prefill:
+      max_batch_size: 1
+      max_num_tokens: 8224
+      max_seq_len: 8232  # >= benchmark isl (8192)
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: true
+      moe_config:
+        backend: WIDEEP
+        max_num_tokens: 16384
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.3
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8256
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      pipeline_parallel_size: 1
+      max_batch_size: 8
+      max_num_tokens: 128
+      max_seq_len: 9256  # >= benchmark isl + osl (8192 + 1024 = 9216)
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:  # largest entry equals max_batch_size: 8
+        - 1
+        - 2
+        - 4
+        - 8
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: WIDEEP
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8256
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+benchmark:
+  type: sa-bench
+  isl: 8192
+  osl: 1024
+  concurrencies: '154'
+  req_rate: inf
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml
new file mode 100644
index 00000000..2f630277
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml
@@ -0,0 +1,104 @@
+
+
+name: "h100_8k1k_ctx1dep16_gen2tep16_batch64_eplb0_mtp0"
+
+model:
+  path: "DeepSeek-R1-0528"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
+  precision: "fp8"
+
+resources:
+  gpu_type: "h100"
+  prefill_workers: 1
+  prefill_nodes: 2  # 2 nodes x 8 GPUs/node = 16 GPUs = 1 worker x tensor_parallel_size 16
+  decode_workers: 2
+  decode_nodes: 4  # 4 nodes x 8 GPUs/node = 32 GPUs = 2 workers x tensor_parallel_size 16
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    UCX_CUDA_IPC_ENABLE_MNNVL: "n"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: "1"
+    TRTLLM_FORCE_ALLTOALL_METHOD: "DeepEP"
+
+  decode_environment:
+    NCCL_NVLS_ENABLE: "0"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "n"
+
+  trtllm_config:
+    prefill:
+      max_batch_size: 1
+      max_num_tokens: 8224
+      max_seq_len: 8232  # >= benchmark isl (8192)
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: false
+      moe_config:
+        backend: WIDEEP
+        max_num_tokens: 16384
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.3
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8256
+
+
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 64
+      max_num_tokens: 256
+      max_seq_len: 9256  # >= benchmark isl + osl (8192 + 1024 = 9216)
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64]
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8256
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "154"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false  # Kept off: multiple frontends hit errors such as port 8080 collisions, among others.
+
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
new file mode 100644
index 00000000..9081201b
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
@@ -0,0 +1,94 @@
+name: h100_8k1k_ctx1dep16_gen3tep16_batch1_eplb0_mtp0
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 1
+  prefill_nodes: 2  # 2 nodes x 8 GPUs/node = 16 GPUs = 1 worker x tensor_parallel_size 16
+  decode_workers: 3
+  decode_nodes: 6  # 6 nodes x 8 GPUs/node = 48 GPUs = 3 workers x tensor_parallel_size 16
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean in YAML 1.1
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+  decode_environment:
+    NCCL_NVLS_ENABLE: '0'
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean in YAML 1.1
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+  trtllm_config:
+    prefill:
+      max_batch_size: 1
+      max_num_tokens: 8224
+      max_seq_len: 8232  # >= benchmark isl (8192)
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: false
+      moe_config:
+        backend: WIDEEP
+        max_num_tokens: 16384
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.3
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8256
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 1
+      max_num_tokens: 256
+      max_seq_len: 9256  # >= benchmark isl + osl (8192 + 1024 = 9216)
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:  # NOTE(review): 2 and 4 exceed max_batch_size: 1; confirm intended
+        - 1
+        - 2
+        - 4
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8256
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+benchmark:
+  type: sa-bench
+  isl: 8192
+  osl: 1024
+  concurrencies: '6'
+  req_rate: inf
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
new file mode 100644
index 00000000..938fd965
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
@@ -0,0 +1,104 @@
+
+
+name: "h100_8k1k_ctx1dep16_gen3tep16_batch2_eplb0_mtp0"
+
+model:
+  path: "DeepSeek-R1-0528"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
+  precision: "fp8"
+
+resources:
+  gpu_type: "h100"
+  prefill_workers: 1
+  prefill_nodes: 2  # 2 nodes x 8 GPUs/node = 16 GPUs = 1 worker x tensor_parallel_size 16
+  decode_workers: 3
+  decode_nodes: 6  # 6 nodes x 8 GPUs/node = 48 GPUs = 3 workers x tensor_parallel_size 16
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    UCX_CUDA_IPC_ENABLE_MNNVL: "n"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: "1"
+    TRTLLM_FORCE_ALLTOALL_METHOD: "DeepEP"
+
+  decode_environment:
+    NCCL_NVLS_ENABLE: "0"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "n"
+
+  trtllm_config:
+    prefill:
+      max_batch_size: 1
+      max_num_tokens: 8224
+      max_seq_len: 8232  # >= benchmark isl (8192)
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: false
+      moe_config:
+        backend: WIDEEP
+        max_num_tokens: 16384
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.3
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8256
+
+
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 2
+      max_num_tokens: 256
+      max_seq_len: 9256  # >= benchmark isl + osl (8192 + 1024 = 9216)
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes: [1, 2, 4]  # NOTE(review): 4 exceeds max_batch_size: 2; confirm intended
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8256
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "9"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false  # Kept off: multiple frontends hit errors such as port 8080 collisions, among others.
+
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
new file mode 100644
index 00000000..c1eb86c1
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
@@ -0,0 +1,104 @@
+
+
+name: "h100_8k1k_ctx1dep16_gen3tep16_batch8_eplb0_mtp0"
+
+model:
+  path: "DeepSeek-R1-0528"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
+  precision: "fp8"
+
+resources:
+  gpu_type: "h100"
+  prefill_workers: 1
+  prefill_nodes: 2  # 2 nodes x 8 GPUs/node = 16 GPUs = 1 worker x tensor_parallel_size 16
+  decode_workers: 3
+  decode_nodes: 6  # 6 nodes x 8 GPUs/node = 48 GPUs = 3 workers x tensor_parallel_size 16
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    UCX_CUDA_IPC_ENABLE_MNNVL: "n"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: "1"
+    TRTLLM_FORCE_ALLTOALL_METHOD: "DeepEP"
+
+  decode_environment:
+    NCCL_NVLS_ENABLE: "0"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "n"
+
+  trtllm_config:
+    prefill:
+      max_batch_size: 1
+      max_num_tokens: 8224
+      max_seq_len: 8232  # >= benchmark isl (8192)
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: true
+      moe_config:
+        backend: WIDEEP
+        max_num_tokens: 16384
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.3
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8256
+
+
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 8
+      max_num_tokens: 256
+      max_seq_len: 9256  # >= benchmark isl + osl (8192 + 1024 = 9216)
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes: [1, 2, 4, 8]
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8256
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "30"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false  # Kept off: multiple frontends hit errors such as port 8080 collisions, among others.
+
+dynamo:
+  install: false
diff --git a/recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml b/recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml
new file mode 100644
index 00000000..40c84770
--- /dev/null
+++ b/recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml
@@ -0,0 +1,97 @@
+name: h100_8k1k_ctx2dep16_gen1dep16_batch16_eplb0_mtp0
+model:
+  path: DeepSeek-R1-0528
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3"
+  precision: fp8
+resources:
+  gpu_type: h100
+  prefill_workers: 2
+  prefill_nodes: 4  # 4 nodes x 8 GPUs/node = 32 GPUs = 2 workers x tensor_parallel_size 16
+  decode_workers: 1
+  decode_nodes: 2  # 2 nodes x 8 GPUs/node = 16 GPUs = 1 worker x tensor_parallel_size 16
+  gpus_per_node: 8
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean in YAML 1.1
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+  decode_environment:
+    NCCL_NVLS_ENABLE: '0'
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'n'  # quoted: a bare n is a boolean in YAML 1.1
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1'
+    TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP
+  trtllm_config:
+    prefill:
+      max_batch_size: 1
+      max_num_tokens: 8224
+      max_seq_len: 8232  # >= benchmark isl (8192)
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      enable_chunked_prefill: false
+      moe_config:
+        backend: WIDEEP
+        max_num_tokens: 16384
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.3
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8256
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 16
+      max_num_tokens: 128
+      max_seq_len: 9256  # >= benchmark isl + osl (8192 + 1024 = 9216)
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:  # largest entry equals max_batch_size: 16
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      moe_config:
+        backend: WIDEEP
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8256
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+benchmark:
+  type: sa-bench
+  isl: 8192
+  osl: 1024
+  concurrencies: '308'
+  req_rate: inf
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+dynamo:
+  install: false