From 1a3b4171a4d0ae0e8cba12ddeda67b5d4cf5580c Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sat, 2 May 2026 22:01:18 -0700
Subject: [PATCH 1/4] update dsv4 trt fused mhc image

---
 .github/configs/nvidia-master.yaml          | 2 +-
 benchmarks/single_node/dsv4_fp4_b300_trt.sh | 7 ++-----
 perf-changelog.yaml                         | 7 +++++++
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 97665ca53..71c420f04 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2707,7 +2707,7 @@ dsv4-fp4-b300-vllm:
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
 
 dsv4-fp4-b300-trt:
-  image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-4999884
+  image: ghcr.io#semianalysisai/trtllm-deepseek-v4:fix-mhc7168-eb20e9e
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b300
diff --git a/benchmarks/single_node/dsv4_fp4_b300_trt.sh b/benchmarks/single_node/dsv4_fp4_b300_trt.sh
index 1356ecbac..754846912 100644
--- a/benchmarks/single_node/dsv4_fp4_b300_trt.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_trt.sh
@@ -1,7 +1,7 @@
 #!/usr/bin/env bash
 
 # DeepSeek-V4-Pro single-node TRTLLM recipe for B300. The configured image
-# already contains NVIDIA/TensorRT-LLM@feat/deepseek_v4; do not build TRTLLM at
+# already contains a TensorRT-LLM DeepSeek-V4 build; do not build TRTLLM at
 # runtime from this benchmark path.
 
 source "$(dirname "$0")/../benchmark_lib.sh"
@@ -101,10 +101,7 @@ if [ "${EVAL_ONLY}" = "true" ]; then
     MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN"
 fi
 
-# DeepSeek-V4-Pro has hidden size 7168. The current TRTLLM fused-HC MHC
-# path corrupts eval generations for this shape; keep eval servers on the
-# unfused path until the fused kernel is guarded or supports 7168.
-export TRTLLM_MHC_ENABLE_FUSED_HC=0
+export TRTLLM_MHC_ENABLE_FUSED_HC="${TRTLLM_MHC_ENABLE_FUSED_HC:-1}"
 echo "TRTLLM_MHC_ENABLE_FUSED_HC: $TRTLLM_MHC_ENABLE_FUSED_HC"
 
 start_gpu_monitor --output "$PWD/gpu_metrics.csv"
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 4098a580a..df1a22cff 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -2148,3 +2148,10 @@
     - "Disable TRTLLM fused MHC hyper-connection for eval servers via TRTLLM_MHC_ENABLE_FUSED_HC=0 because the current fused kernel corrupts DeepSeek-V4-Pro hidden size 7168 generations"
     - "Keep this as eval-only PR validation until the TensorRT-LLM fused MHC kernel is guarded or supports hidden size 7168"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1233
+
+- config-keys:
+    - dsv4-fp4-b300-trt
+  description:
+    - "Update the TensorRT-LLM DeepSeek-V4-Pro image to ghcr.io/semianalysisai/trtllm-deepseek-v4:fix-mhc7168-eb20e9e"
+    - "Enable TRTLLM fused MHC by default now that the image includes the hidden-size 7168 fused-HC fix"
+  pr-link: TBD

From 1ce0e2294e2524dc87add60a4a0320ba3f6629f6 Mon Sep 17 00:00:00 2001
From: Bryan Shan <58582368+Oseltamivir@users.noreply.github.com>
Date: Sat, 2 May 2026 22:08:28 -0700
Subject: [PATCH 2/4] Update perf-changelog.yaml: set pr-link and mark evals-only

---
 perf-changelog.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index df1a22cff..a97ec6533 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -2154,4 +2154,5 @@
   description:
     - "Update the TensorRT-LLM DeepSeek-V4-Pro image to ghcr.io/semianalysisai/trtllm-deepseek-v4:fix-mhc7168-eb20e9e"
     - "Enable TRTLLM fused MHC by default now that the image includes the hidden-size 7168 fused-HC fix"
-  pr-link: TBD
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1270
+  evals-only: true

From 660a3b787d84a69894f9657c23ab56aa3a437817 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sat, 2 May 2026 22:37:12 -0700
Subject: [PATCH 3/4] remove evals-only flag from dsv4 trt changelog entry

---
 perf-changelog.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index a97ec6533..5d02725da 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -2155,4 +2155,3 @@
     - "Update the TensorRT-LLM DeepSeek-V4-Pro image to ghcr.io/semianalysisai/trtllm-deepseek-v4:fix-mhc7168-eb20e9e"
     - "Enable TRTLLM fused MHC by default now that the image includes the hidden-size 7168 fused-HC fix"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1270
-  evals-only: true

From 8804f7bff0f2461ba6b210ec2ee87f58d60ddfac Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 5 May 2026 13:28:36 -0700
Subject: [PATCH 4/4] switch dsv4 trt image to feat-deepseek_v4-9aa3715 (merged PR)

---
 .github/configs/nvidia-master.yaml | 2 +-
 perf-changelog.yaml                | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 71c420f04..9e0561607 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2707,7 +2707,7 @@ dsv4-fp4-b300-vllm:
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
 
 dsv4-fp4-b300-trt:
-  image: ghcr.io#semianalysisai/trtllm-deepseek-v4:fix-mhc7168-eb20e9e
+  image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b300
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 5d02725da..8fb77e84c 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -2152,6 +2152,6 @@
 - config-keys:
     - dsv4-fp4-b300-trt
   description:
-    - "Update the TensorRT-LLM DeepSeek-V4-Pro image to ghcr.io/semianalysisai/trtllm-deepseek-v4:fix-mhc7168-eb20e9e"
-    - "Enable TRTLLM fused MHC by default now that the image includes the hidden-size 7168 fused-HC fix"
+    - "Update the TensorRT-LLM DeepSeek-V4-Pro image to ghcr.io/semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715"
+    - "Enable TRTLLM fused MHC by default with the DeepSeek-V4 feature image"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1270