diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index c362604f1..3eb2d4449 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -338,7 +338,7 @@ kimik2.5-int4-mi325x-vllm: - { tp: 8, conc-start: 4, conc-end: 64 } kimik2.5-fp4-mi355x-vllm: - image: vllm/vllm-openai-rocm:v0.16.0 + image: vllm/vllm-openai-rocm:v0.18.0 model: amd/Kimi-K2.5-MXFP4 model-prefix: kimik2.5 runner: mi355x @@ -350,14 +350,18 @@ kimik2.5-fp4-mi355x-vllm: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } minimaxm2.5-fp8-mi355x-vllm: image: vllm/vllm-openai-rocm:v0.18.0 diff --git a/benchmarks/single_node/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/kimik2.5_fp4_mi355x.sh index bb522b396..a8bd01442 100755 --- a/benchmarks/single_node/kimik2.5_fp4_mi355x.sh +++ b/benchmarks/single_node/kimik2.5_fp4_mi355x.sh @@ -31,9 +31,29 @@ fi SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} -# do not enable aiter due to Aiter MLA not currently supporting num_heads=8 -# https://github.com/vllm-project/vllm/issues/35641 -# export VLLM_ROCM_USE_AITER=1 +# If the machine runs a MEC FW older than 177, RCCL +# cannot reclaim some memory. +# Disable that feature to avoid crashes.
+# This is related to the changes in the driver at: +# https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates +version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'` +if [[ "$version" == "" || $version -lt 177 ]]; then + export HSA_NO_SCRATCH_RECLAIM=1 +fi + +export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 + +# Disable AITER RMSNorm for TP < 8 due to accuracy issues +if [ "${TP}" -lt 8 ]; then + export VLLM_ROCM_USE_AITER_RMSNORM=0 +fi + +if [ "${EP_SIZE:-0}" -gt 1 ]; then + EP=" --enable-expert-parallel" +else + EP=" " +fi # following AMD andy luo's recipe # https://x.com/linluo77/status/2017024513595301985 @@ -44,10 +64,11 @@ start_gpu_monitor set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ ---gpu-memory-utilization 0.95 \ +$EP \ +--gpu-memory-utilization 0.90 \ --max-model-len $MAX_MODEL_LEN \ ---block-size=64 \ ---disable-log-requests \ +--block-size=1 \ +--no-enable-prefix-caching \ --trust-remote-code \ --mm-encoder-tp-mode data > $SERVER_LOG 2>&1 & diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 03fb6e082..2648e746e 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1068,3 +1068,12 @@ - "dsr1-fp8-h200-sglang: v0.5.9-cu129-amd64 → v0.5.9-cu130" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/943 +- config-keys: + - kimik2.5-fp4-mi355x-vllm + description: + - "Upgrade vLLM ROCm image from v0.16.0 to v0.18.0" + - "Enable AITER with INT4 quick reduce; disable AITER RMSNorm for TP < 8 (accuracy)" + - "Add expert parallel, TP4, and TP4/EP4 search spaces" + - "Switch block-size from 64 to 1 and gpu-memory-utilization from 0.95 to 0.90" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/936 + \ No newline at end of file