diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance.yaml
deleted file mode 100644
index d64bac8763b..00000000000
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance.yaml
+++ /dev/null
@@ -1,46 +0,0 @@
-model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
-hostname: localhost
-port: 8000
-backend: "pytorch"
-cuda_graph_config: null
-free_gpu_memory_fraction: 0.1
-disable_overlap_scheduler: True
-enable_autotuner: False
-context_servers:
-  num_instances: 2
-  router:
-    type: kv_cache_aware
-  max_batch_size: 16
-  max_num_tokens: 3000
-  max_seq_len: 4096
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  kv_cache_config:
-    enable_block_reuse: True
-    enable_partial_reuse: False
-    event_buffer_max_size: 1024
-    free_gpu_memory_fraction: 0.1
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8001"
-    - "localhost:8002"
-generation_servers:
-  num_instances: 2
-  router:
-    type: kv_cache_aware
-  max_batch_size: 256
-  max_num_tokens: 4096
-  max_seq_len: 4096
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  cache_transceiver_config:
-    backend: DEFAULT
-  kv_cache_config:
-    enable_block_reuse: True
-    enable_partial_reuse: False
-    event_buffer_max_size: 1024
-    free_gpu_memory_fraction: 0.1
-  urls:
-    - "localhost:8003"
-    - "localhost:8004"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance_deepseek_v3.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance_deepseek_v3.yaml
deleted file mode 100644
index fe15f70085c..00000000000
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance_deepseek_v3.yaml
+++ /dev/null
@@ -1,39 +0,0 @@
-hostname: localhost
-port: 8000
-model: DeepSeek-V3-Lite/bf16
-backend: "pytorch"
-cuda_graph_config: null
-disable_overlap_scheduler: True
-enable_autotuner: False
-context_servers:
-  num_instances: 2
-  router:
-    type: kv_cache_aware
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  kv_cache_config:
-    enable_block_reuse: True
-    enable_partial_reuse: True
-    event_buffer_max_size: 1024
-    free_gpu_memory_fraction: 0.1
-  cache_transceiver_config:
-    backend: "DEFAULT"
-  urls:
-    - "localhost:8001"
-    - "localhost:8002"
-generation_servers:
-  num_instances: 2
-  router:
-    type: kv_cache_aware
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  kv_cache_config:
-    enable_block_reuse: True
-    enable_partial_reuse: True
-    event_buffer_max_size: 1024
-    free_gpu_memory_fraction: 0.1
-  cache_transceiver_config:
-    backend: "DEFAULT"
-  urls:
-    - "localhost:8003"
-    - "localhost:8004"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse.yaml
deleted file mode 100644
index 26444b1ab23..00000000000
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse.yaml
+++ /dev/null
@@ -1,35 +0,0 @@
-hostname: localhost
-port: 8000
-model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
-backend: "pytorch"
-cuda_graph_config: null
-disable_overlap_scheduler: True
-enable_autotuner: False
-context_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  kv_cache_config:
-    enable_block_reuse: True
-    enable_partial_reuse: True
-    event_buffer_max_size: 1024
-    free_gpu_memory_fraction: 0.15
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8001"
-generation_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  router:
-    type: kv_cache_aware
-  kv_cache_config:
-    enable_block_reuse: True
-    enable_partial_reuse: True
-    event_buffer_max_size: 1024
-    free_gpu_memory_fraction: 0.05
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse_deepseek_v3.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse_deepseek_v3.yaml
deleted file mode 100644
index 06a4c154b46..00000000000
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse_deepseek_v3.yaml
+++ /dev/null
@@ -1,35 +0,0 @@
-hostname: localhost
-port: 8000
-model: DeepSeek-V3-Lite/bf16
-free_gpu_memory_fraction: 0.15
-backend: "pytorch"
-cuda_graph_config: null
-disable_overlap_scheduler: True
-enable_autotuner: False
-context_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  kv_cache_config:
-    enable_block_reuse: True
-    enable_partial_reuse: True
-    event_buffer_max_size: 1024
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8001"
-generation_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  router:
-    type: kv_cache_aware
-  kv_cache_config:
-    enable_block_reuse: True
-    enable_partial_reuse: True
-    event_buffer_max_size: 1024
-    free_gpu_memory_fraction: 0.05
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_conditional.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_conditional.yaml
deleted file mode 100644
index 28816380fe4..00000000000
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_conditional.yaml
+++ /dev/null
@@ -1,38 +0,0 @@
-model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
-hostname: localhost
-port: 8000
-backend: "pytorch"
-cuda_graph_config: null
-free_gpu_memory_fraction: 0.15
-conditional_disagg_config:
-  max_local_prefill_length: 100
-disable_overlap_scheduler: True
-enable_autotuner: False
-context_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  kv_cache_config:
-    enable_block_reuse: True
-    enable_partial_reuse: True
-    event_buffer_max_size: 1024
-    free_gpu_memory_fraction: 0.15
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8001"
-generation_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  router:
-    type: kv_cache_aware
-  kv_cache_config:
-    enable_block_reuse: True
-    enable_partial_reuse: True
-    event_buffer_max_size: 1024
-    free_gpu_memory_fraction: 0.15
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_conditional_deepseek_v3.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_conditional_deepseek_v3.yaml
deleted file mode 100644
index b7f34202724..00000000000
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_conditional_deepseek_v3.yaml
+++ /dev/null
@@ -1,38 +0,0 @@
-hostname: localhost
-port: 8000
-model: DeepSeek-V3-Lite/bf16
-backend: "pytorch"
-cuda_graph_config: null
-free_gpu_memory_fraction: 0.15
-conditional_disagg_config:
-  max_local_prefill_length: 100
-disable_overlap_scheduler: True
-enable_autotuner: False
-context_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  kv_cache_config:
-    enable_block_reuse: True
-    enable_partial_reuse: True
-    event_buffer_max_size: 1024
-    free_gpu_memory_fraction: 0.15
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8001"
-generation_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  router:
-    type: kv_cache_aware
-  kv_cache_config:
-    enable_block_reuse: True
-    enable_partial_reuse: True
-    event_buffer_max_size: 1024
-    free_gpu_memory_fraction: 0.15
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxpp2_genpp2.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxpp2_genpp2.yaml
deleted file mode 100644
index b7f03c0f9f5..00000000000
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxpp2_genpp2.yaml
+++ /dev/null
@@ -1,38 +0,0 @@
-model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
-hostname: localhost
-port: 8000
-backend: "pytorch"
-cuda_graph_config: null
-free_gpu_memory_fraction: 0.2
-context_servers:
-  num_instances: 1
-  max_batch_size: 1
-  max_num_tokens: 3000
-  max_seq_len: 4096
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 2
-  kv_cache_config:
-    free_gpu_memory_fraction: 0.2
-    enable_partial_reuse: False
-    enable_block_reuse: False
-  disable_overlap_scheduler: True
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8001"
-generation_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 2
-  max_batch_size: 256
-  max_num_tokens: 4096
-  max_seq_len: 4096
-  kv_cache_config:
-    free_gpu_memory_fraction: 0.2
-    enable_partial_reuse: False
-    enable_block_reuse: False
-  disable_overlap_scheduler: True
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxpp2_gentp2.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxpp2_gentp2.yaml
deleted file mode 100644
index 892b4e8b31f..00000000000
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxpp2_gentp2.yaml
+++ /dev/null
@@ -1,37 +0,0 @@
-model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
-hostname: localhost
-port: 8000
-backend: "pytorch"
-cuda_graph_config: null
-free_gpu_memory_fraction: 0.2
-context_servers:
-  num_instances: 1
-  max_batch_size: 1
-  max_num_tokens: 3000
-  max_seq_len: 4096
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 2
-  kv_cache_config:
-    free_gpu_memory_fraction: 0.2
-    enable_partial_reuse: False
-    enable_block_reuse: False
-  disable_overlap_scheduler: True
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8001"
-generation_servers:
-  num_instances: 1
-  tensor_parallel_size: 2
-  pipeline_parallel_size: 1
-  max_batch_size: 256
-  max_num_tokens: 4096
-  max_seq_len: 4096
-  kv_cache_config:
-    free_gpu_memory_fraction: 0.2
-    enable_partial_reuse: False
-  disable_overlap_scheduler: True
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxpp4_genpp4.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxpp4_genpp4.yaml
deleted file mode 100644
index 2c7a67e1cbf..00000000000
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxpp4_genpp4.yaml
+++ /dev/null
@@ -1,38 +0,0 @@
-model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
-hostname: localhost
-port: 8000
-backend: "pytorch"
-cuda_graph_config: null
-free_gpu_memory_fraction: 0.2
-context_servers:
-  num_instances: 1
-  max_batch_size: 1
-  max_num_tokens: 3000
-  max_seq_len: 4096
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 4
-  kv_cache_config:
-    free_gpu_memory_fraction: 0.2
-    enable_partial_reuse: False
-    enable_block_reuse: False
-  disable_overlap_scheduler: True
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8001"
-generation_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 4
-  max_batch_size: 256
-  max_num_tokens: 4096
-  max_seq_len: 4096
-  kv_cache_config:
-    free_gpu_memory_fraction: 0.2
-    enable_partial_reuse: False
-    enable_block_reuse: False
-  disable_overlap_scheduler: True
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxpp4_gentp4.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxpp4_gentp4.yaml
deleted file mode 100644
index a1e4ad50a9c..00000000000
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxpp4_gentp4.yaml
+++ /dev/null
@@ -1,36 +0,0 @@
-model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
-hostname: localhost
-port: 8000
-backend: "pytorch"
-cuda_graph_config: null
-free_gpu_memory_fraction: 0.2
-context_servers:
-  num_instances: 1
-  max_batch_size: 1
-  max_num_tokens: 3000
-  max_seq_len: 4096
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 4
-  kv_cache_config:
-    free_gpu_memory_fraction: 0.2
-    enable_partial_reuse: False
-  disable_overlap_scheduler: True
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8001"
-generation_servers:
-  num_instances: 1
-  tensor_parallel_size: 4
-  pipeline_parallel_size: 1
-  max_batch_size: 256
-  max_num_tokens: 4096
-  max_seq_len: 4096
-  kv_cache_config:
-    free_gpu_memory_fraction: 0.2
-    enable_partial_reuse: False
-  disable_overlap_scheduler: True
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite.yaml
deleted file mode 100644
index 83f9b3a3e87..00000000000
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite.yaml
+++ /dev/null
@@ -1,23 +0,0 @@
-hostname: localhost
-port: 8000
-model: DeepSeek-V3-Lite/fp8
-free_gpu_memory_fraction: 0.1
-backend: "pytorch"
-cuda_graph_config: null
-disable_overlap_scheduler: True
-context_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8001"
-generation_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp.yaml
deleted file mode 100644
index 57eb4ea0041..00000000000
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp.yaml
+++ /dev/null
@@ -1,28 +0,0 @@
-hostname: localhost
-port: 8000
-model: DeepSeek-V3-Lite/fp8
-free_gpu_memory_fraction: 0.1
-backend: "pytorch"
-cuda_graph_config: null
-disable_overlap_scheduler: True
-speculative_config:
-  decoding_type: MTP
-  num_nextn_predict_layers: 1
-context_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  enable_attention_dp: true
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8001"
-generation_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  enable_attention_dp: false
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_attention_dp_overlap.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_attention_dp_overlap.yaml
deleted file mode 100644
index 4343850c77f..00000000000
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_attention_dp_overlap.yaml
+++ /dev/null
@@ -1,29 +0,0 @@
-hostname: localhost
-port: 8000
-model: DeepSeek-V3-Lite/fp8
-free_gpu_memory_fraction: 0.1
-backend: "pytorch"
-cuda_graph_config: null
-speculative_config:
-  decoding_type: MTP
-  num_nextn_predict_layers: 1
-context_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  enable_attention_dp: true
-  disable_overlap_scheduler: True
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8001"
-generation_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  enable_attention_dp: true
-  disable_overlap_scheduler: False
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_ctxpp2_gentp2.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_ctxpp2_gentp2.yaml
deleted file mode 100644
index 4a61497e94e..00000000000
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_ctxpp2_gentp2.yaml
+++ /dev/null
@@ -1,32 +0,0 @@
-hostname: localhost
-port: 8000
-model: DeepSeek-V3-Lite/fp8
-free_gpu_memory_fraction: 0.1
-backend: "pytorch"
-cuda_graph_config: null
-disable_overlap_scheduler: True
-
-context_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 2
-  enable_attention_dp: false
-  speculative_config:
-    decoding_type: MTP
-    num_nextn_predict_layers: 1
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8001"
-generation_servers:
-  num_instances: 1
-  tensor_parallel_size: 2
-  pipeline_parallel_size: 1
-  enable_attention_dp: false
-  speculative_config:
-    decoding_type: MTP
-    num_nextn_predict_layers: 1
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
    - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_two_mtp.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_two_mtp.yaml
deleted file mode 100644
index 837e5df8e33..00000000000
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_two_mtp.yaml
+++ /dev/null
@@ -1,28 +0,0 @@
-hostname: localhost
-port: 8000
-model: DeepSeek-V3-Lite/fp8
-free_gpu_memory_fraction: 0.1
-backend: "pytorch"
-cuda_graph_config: null
-disable_overlap_scheduler: True
-speculative_config:
-  decoding_type: MTP
-  num_nextn_predict_layers: 2
-context_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  enable_attention_dp: true
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8001"
-generation_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  enable_attention_dp: false
-  urls:
-    - "localhost:8002"
-  cache_transceiver_config:
-    backend: DEFAULT
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_genpp2.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_genpp2.yaml
deleted file mode 100644
index ce53fd4626b..00000000000
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_genpp2.yaml
+++ /dev/null
@@ -1,36 +0,0 @@
-model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
-hostname: localhost
-port: 8000
-backend: "pytorch"
-cuda_graph_config: null
-free_gpu_memory_fraction: 0.2
-context_servers:
-  num_instances: 1
-  max_batch_size: 1
-  max_num_tokens: 3000
-  max_seq_len: 4096
-  tensor_parallel_size: 2
-  pipeline_parallel_size: 1
-  kv_cache_config:
-    free_gpu_memory_fraction: 0.2
-    enable_partial_reuse: False
-  disable_overlap_scheduler: True
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8001"
-generation_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 2
-  max_batch_size: 256
-  max_num_tokens: 4096
-  max_seq_len: 4096
-  kv_cache_config:
-    free_gpu_memory_fraction: 0.2
-    enable_partial_reuse: False
-  disable_overlap_scheduler: True
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1.yaml
deleted file mode 100644
index 1335d63adfe..00000000000
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1.yaml
+++ /dev/null
@@ -1,24 +0,0 @@
-hostname: localhost
-port: 8000
-model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
-free_gpu_memory_fraction: 0.25
-backend: "pytorch"
-cuda_graph_config: null
-disable_overlap_scheduler: True
-context_servers:
-  num_instances: 1
-  tensor_parallel_size: 2
-  pipeline_parallel_size: 1
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8001"
-generation_servers:
-  num_instances: 2
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8002"
-    - "localhost:8003"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1_trt_backend.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1_trt_backend.yaml
deleted file mode 100644
index fa5dffa518b..00000000000
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1_trt_backend.yaml
+++ /dev/null
@@ -1,22 +0,0 @@
-hostname: localhost
-port: 8000
-model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
-free_gpu_memory_fraction: 0.25
-backend: "trt"
-context_servers:
-  num_instances: 1
-  tensor_parallel_size: 2
-  pipeline_parallel_size: 1
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8001"
-generation_servers:
-  num_instances: 2
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8002"
-    - "localhost:8003"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite.yaml
deleted file mode 100644
index 6b22665e9f1..00000000000
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite.yaml +++ /dev/null @@ -1,23 +0,0 @@ -hostname: localhost -port: 8000 -model: DeepSeek-V3-Lite/fp8 -free_gpu_memory_fraction: 0.25 -backend: "pytorch" -cuda_graph_config: null -disable_overlap_scheduler: True -context_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8001" -generation_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp.yaml deleted file mode 100644 index 80a1a3636a8..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp.yaml +++ /dev/null @@ -1,25 +0,0 @@ -hostname: localhost -port: 8000 -model: DeepSeek-V3-Lite/fp8 -free_gpu_memory_fraction: 0.25 -backend: "pytorch" -cuda_graph_config: null -disable_overlap_scheduler: True -context_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - enable_attention_dp: True - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8001" -generation_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - enable_attention_dp: True - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one.yaml deleted file mode 100644 index 9dfb092151a..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one.yaml +++ /dev/null @@ -1,25 +0,0 @@ -hostname: localhost -port: 8000 -model: DeepSeek-V3-Lite/fp8 -free_gpu_memory_fraction: 0.25 -backend: "pytorch" -cuda_graph_config: null -disable_overlap_scheduler: True -context_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - enable_attention_dp: true - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8001" -generation_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - enable_attention_dp: false - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one_mtp.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one_mtp.yaml deleted file mode 100644 index 4b6bc571dab..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one_mtp.yaml +++ /dev/null @@ -1,29 +0,0 @@ -hostname: localhost -port: 8000 -model: DeepSeek-V3-Lite/fp8 -free_gpu_memory_fraction: 0.25 -backend: "pytorch" -cuda_graph_config: null -disable_overlap_scheduler: True -speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 1 -context_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - enable_attention_dp: true - cache_transceiver_config: - 
backend: DEFAULT - urls: - - "localhost:8001" -generation_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - enable_attention_dp: false - cache_transceiver_config: - backend: DEFAULT - - urls: - - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap.yaml deleted file mode 100644 index 26218586f49..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap.yaml +++ /dev/null @@ -1,26 +0,0 @@ -hostname: localhost -port: 8000 -model: DeepSeek-V3-Lite/fp8 -backend: "pytorch" -cuda_graph_config: null -free_gpu_memory_fraction: 0.2 -context_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - enable_attention_dp: True - disable_overlap_scheduler: True - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8001" -generation_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - enable_attention_dp: True - disable_overlap_scheduler: False - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml deleted file mode 100644 index 99034f8a1a3..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml +++ /dev/null @@ -1,27 +0,0 @@ -hostname: localhost -port: 8000 -model: DeepSeek-V3-Lite/fp8 -free_gpu_memory_fraction: 0.25 -backend: "pytorch" -context_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - enable_attention_dp: true - disable_overlap_scheduler: True - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8001" -generation_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - enable_attention_dp: true - cuda_graph_config: - enable_padding: False - disable_overlap_scheduler: False - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_mpi.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_mpi.yaml deleted file mode 100644 index 4cfe18ebaf6..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_mpi.yaml +++ /dev/null @@ -1,22 +0,0 @@ -hostname: localhost -port: 8000 -model: DeepSeek-V3-Lite/fp8 -free_gpu_memory_fraction: 0.25 -backend: "pytorch" -disable_overlap_scheduler: True -context_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - cache_transceiver_config: - backend: "MPI" - urls: - - "localhost:8001" -generation_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - cache_transceiver_config: - backend: "MPI" - urls: - - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_nixl.yaml 
b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_nixl.yaml deleted file mode 100644 index 3b1aa8fc0e3..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_nixl.yaml +++ /dev/null @@ -1,22 +0,0 @@ -hostname: localhost -port: 8000 -model: DeepSeek-V3-Lite/fp8 -free_gpu_memory_fraction: 0.25 -backend: "pytorch" -disable_overlap_scheduler: True -context_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - cache_transceiver_config: - backend: "NIXL" - urls: - - "localhost:8001" -generation_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - cache_transceiver_config: - backend: "NIXL" - urls: - - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml deleted file mode 100644 index 4c601fbb868..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml +++ /dev/null @@ -1,25 +0,0 @@ -hostname: localhost -port: 8000 -model: DeepSeek-V3-Lite/fp8 -free_gpu_memory_fraction: 0.25 -backend: "pytorch" -context_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - disable_overlap_scheduler: True - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8001" -generation_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - cuda_graph_config: - enable_padding: False - disable_overlap_scheduler: False - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_ucx.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_ucx.yaml deleted file mode 100644 index d3395938cae..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_ucx.yaml +++ /dev/null @@ -1,22 +0,0 @@ -hostname: localhost -port: 8000 -model: DeepSeek-V3-Lite/fp8 -free_gpu_memory_fraction: 0.25 -backend: "pytorch" -disable_overlap_scheduler: True -context_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - cache_transceiver_config: - backend: "UCX" - urls: - - "localhost:8001" -generation_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - cache_transceiver_config: - backend: "UCX" - urls: - - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2pp2_gentp2pp2.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2pp2_gentp2pp2.yaml deleted file mode 100644 index ce47009aaad..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2pp2_gentp2pp2.yaml +++ /dev/null @@ -1,38 +0,0 @@ -model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 -hostname: localhost -port: 8000 -backend: "pytorch" -cuda_graph_config: null -free_gpu_memory_fraction: 0.2 -context_servers: - num_instances: 1 - max_batch_size: 1 - max_num_tokens: 3000 - max_seq_len: 4096 - tensor_parallel_size: 2 - pipeline_parallel_size: 2 - kv_cache_config: - free_gpu_memory_fraction: 0.2 - enable_partial_reuse: False - enable_block_reuse: False - disable_overlap_scheduler: True - 
cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8001" -generation_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 2 - max_batch_size: 256 - max_num_tokens: 4096 - max_seq_len: 4096 - kv_cache_config: - free_gpu_memory_fraction: 0.2 - enable_partial_reuse: False - enable_block_reuse: False - disable_overlap_scheduler: True - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml deleted file mode 100644 index 56db3df7697..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml +++ /dev/null @@ -1,39 +0,0 @@ -model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 -hostname: localhost -port: 8000 -backend: "pytorch" -context_servers: - num_instances: 1 - max_batch_size: 1 - max_num_tokens: 3000 - max_seq_len: 4096 - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - kv_cache_config: - free_gpu_memory_fraction: 0.2 - enable_partial_reuse: False - cuda_graph_config: - batch_sizes: [1,3000] - disable_overlap_scheduler: True - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8001" -generation_servers: - num_instances: 1 - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - max_batch_size: 256 - max_num_tokens: 4096 - max_seq_len: 4096 - kv_cache_config: - free_gpu_memory_fraction: 0.2 - enable_partial_reuse: False - cuda_graph_config: - enable_padding: True - batch_sizes: [1,4,8,16,24,32] - disable_overlap_scheduler: True - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_diff_max_tokens.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_diff_max_tokens.yaml deleted file mode 100644 index 26d1f6b6c15..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_diff_max_tokens.yaml +++ /dev/null @@ -1,23 +0,0 @@ -hostname: localhost -port: 8000 -model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 -free_gpu_memory_fraction: 0.25 -backend: "pytorch" -cuda_graph_config: null -disable_overlap_scheduler: True -context_servers: - num_instances: 1 - max_num_tokens: 512 - max_batch_size: 64 - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8001" -generation_servers: - num_instances: 1 - max_num_tokens: 256 - max_batch_size: 32 - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only.yaml deleted file mode 100644 index 92b13837644..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only.yaml +++ /dev/null @@ -1,21 +0,0 @@ -hostname: localhost -port: 8000 -model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 -backend: "pytorch" -cuda_graph_config: null -context_servers: - num_instances: 0 -generation_servers: - num_instances: 2 - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - kv_cache_config: - free_gpu_memory_fraction: 0.2 - enable_block_reuse: False - enable_partial_reuse: False - cache_transceiver_config: - backend: DEFAULT - print_iter_log: True - urls: - - "localhost:8002" - - "localhost:8003" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only_bs1.yaml 
b/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only_bs1.yaml deleted file mode 100644 index 19d1eca714f..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only_bs1.yaml +++ /dev/null @@ -1,37 +0,0 @@ -model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 -hostname: localhost -port: 8000 -backend: "pytorch" -cuda_graph_config: null -free_gpu_memory_fraction: 0.2 -context_servers: - num_instances: 1 - max_batch_size: 1 - max_num_tokens: 3000 - max_seq_len: 4096 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - enable_attention_dp: true - kv_cache_config: - free_gpu_memory_fraction: 0.2 - enable_partial_reuse: False - disable_overlap_scheduler: True - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8001" -generation_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - enable_attention_dp: true - max_batch_size: 1 - max_num_tokens: 4096 - max_seq_len: 4096 - kv_cache_config: - free_gpu_memory_fraction: 0.2 - enable_partial_reuse: False - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only_trt_backend.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only_trt_backend.yaml deleted file mode 100644 index ad706f8bf1f..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only_trt_backend.yaml +++ /dev/null @@ -1,19 +0,0 @@ -hostname: localhost -port: 8000 -model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 -backend: "trt" -context_servers: - num_instances: 0 -generation_servers: - num_instances: 2 - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - kv_cache_config: - free_gpu_memory_fraction: 0.2 - enable_block_reuse: False - enable_partial_reuse: False - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8002" - - "localhost:8003" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_load_balance.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_load_balance.yaml deleted file mode 100644 index f0593d9ef60..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_load_balance.yaml +++ /dev/null @@ -1,44 +0,0 @@ -model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 -hostname: localhost -port: 8000 -backend: "pytorch" -cuda_graph_config: null -free_gpu_memory_fraction: 0.15 -context_servers: - num_instances: 2 - router: - type: load_balancing - use_tokens: True - max_batch_size: 1 - max_num_tokens: 3000 - max_seq_len: 4096 - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - kv_cache_config: - free_gpu_memory_fraction: 0.15 - enable_partial_reuse: False - disable_overlap_scheduler: True - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8001" - - "localhost:8002" -generation_servers: - num_instances: 2 - router: - type: load_balancing - use_tokens: False - max_batch_size: 256 - max_num_tokens: 4096 - max_seq_len: 4096 - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - kv_cache_config: - free_gpu_memory_fraction: 0.15 - enable_partial_reuse: False - disable_overlap_scheduler: False - cache_transceiver_config: - backend: "DEFAULT" - urls: - - "localhost:8003" - - "localhost:8004" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_metrics.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_metrics.yaml deleted file mode 100644 index 6d566aa4f99..00000000000 --- 
a/tests/integration/defs/disaggregated/test_configs/disagg_config_metrics.yaml +++ /dev/null @@ -1,28 +0,0 @@ -hostname: localhost -port: 8000 -model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 -free_gpu_memory_fraction: 0.25 -backend: "pytorch" -cuda_graph_config: null -disable_overlap_scheduler: True -perf_metrics_max_requests: 1000 -context_servers: - num_instances: 1 - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - return_perf_metrics: True - perf_metrics_max_requests: 1000 - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8001" -generation_servers: - num_instances: 1 - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - return_perf_metrics: True - perf_metrics_max_requests: 1000 - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_mixed.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_mixed.yaml deleted file mode 100644 index 27d7ec4ee82..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_mixed.yaml +++ /dev/null @@ -1,24 +0,0 @@ -hostname: localhost -port: 8000 -model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 -free_gpu_memory_fraction: 0.25 -backend: "pytorch" -cuda_graph_config: null -disable_overlap_scheduler: True -context_servers: - num_instances: 1 - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8001" -generation_servers: - num_instances: 2 - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8001" - - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ngram.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ngram.yaml deleted file mode 100644 index 4e3417c732a..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ngram.yaml +++ /dev/null @@ -1,29 +0,0 @@ -hostname: localhost -port: 8000 -model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 -free_gpu_memory_fraction: 0.1 -backend: pytorch -disable_overlap_scheduler: True -context_servers: - num_instances: 1 - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - cache_transceiver_config: - backend: "DEFAULT" - urls: - - "localhost:8001" -generation_servers: - num_instances: 1 - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - cache_transceiver_config: - backend: "DEFAULT" - urls: - - "localhost:8002" - speculative_config: - decoding_type: NGram - max_draft_len: 4 - max_matching_ngram_size: 4 - is_keep_all: True - is_use_oldest: True - is_public_pool: True diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_overlap.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_overlap.yaml deleted file mode 100644 index 55990bbaa62..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_overlap.yaml +++ /dev/null @@ -1,36 +0,0 @@ -model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 -hostname: localhost -port: 8000 -backend: "pytorch" -cuda_graph_config: null -free_gpu_memory_fraction: 0.2 -context_servers: - num_instances: 1 - max_batch_size: 1 - max_num_tokens: 3000 - max_seq_len: 4096 - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - kv_cache_config: - free_gpu_memory_fraction: 0.2 - enable_partial_reuse: False - disable_overlap_scheduler: True - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8001" -generation_servers: - num_instances: 1 - 
tensor_parallel_size: 1 - pipeline_parallel_size: 1 - max_batch_size: 256 - max_num_tokens: 4096 - max_seq_len: 4096 - kv_cache_config: - free_gpu_memory_fraction: 0.2 - enable_partial_reuse: False - disable_overlap_scheduler: False - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_trt_backend.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_trt_backend.yaml deleted file mode 100644 index 3eb275c87e0..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_trt_backend.yaml +++ /dev/null @@ -1,23 +0,0 @@ -hostname: localhost -port: 8000 -model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 -free_gpu_memory_fraction: 0.25 -backend: "trt" -context_servers: - num_instances: 1 - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - kv_cache_config: - free_gpu_memory_fraction: 0.2 - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8001" -generation_servers: - num_instances: 1 - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_trtllm_sampler.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_trtllm_sampler.yaml deleted file mode 100644 index 287d1103a4f..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_trtllm_sampler.yaml +++ /dev/null @@ -1,38 +0,0 @@ -model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 -hostname: localhost -port: 8000 -backend: "pytorch" -cuda_graph_config: null -free_gpu_memory_fraction: 0.2 -context_servers: - num_instances: 1 - max_batch_size: 1 - max_num_tokens: 3000 - max_seq_len: 4096 - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - sampler_type: "TRTLLMSampler" - kv_cache_config: - free_gpu_memory_fraction: 0.2 - enable_partial_reuse: False - cache_transceiver_config: - backend: "DEFAULT" - disable_overlap_scheduler: True - urls: - - "localhost:8001" -generation_servers: - num_instances: 1 - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - max_batch_size: 256 - max_num_tokens: 4096 - max_seq_len: 4096 - sampler_type: "TRTLLMSampler" - kv_cache_config: - free_gpu_memory_fraction: 0.2 - enable_partial_reuse: False - cache_transceiver_config: - backend: "DEFAULT" - disable_overlap_scheduler: False - urls: - - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_disaggregated.py b/tests/integration/defs/disaggregated/test_disaggregated.py deleted file mode 100644 index 720da1acbdc..00000000000 --- a/tests/integration/defs/disaggregated/test_disaggregated.py +++ /dev/null @@ -1,1720 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import contextlib -import os -import re -import subprocess -import tempfile -from typing import Callable - -import pytest -import yaml -from defs.common import wait_for_server -from defs.conftest import (get_sm_version, llm_models_root, skip_arm, - skip_no_hopper) -from defs.trt_test_alternative import check_call, check_output, popen - -from tensorrt_llm._utils import mpi_disabled -from tensorrt_llm.logger import logger - - -def cleanup_output_files(): - """Clean up output files from previous runs.""" - for file in ['output.json', 'output_streaming.json']: - try: - os.remove(file) - except FileNotFoundError: - pass - - -def validate_timing_metrics(perf_metrics_item, request_context=""): - """ - Helper function to validate timing metrics relationships. - - Args: - perf_metrics_item: A single performance metrics item from the /perf_metrics endpoint - request_context: String context for error messages (e.g., "request 1", "streaming") - """ - # Validate basic structure - required_keys = [ - "ctx_server", "gen_server", "ctx_perf_metrics", "gen_perf_metrics", - "disagg_server_arrival_time", "disagg_server_first_token_time" - ] - for key in required_keys: - assert key in perf_metrics_item, f"Missing key: {key} in {request_context}" - - assert perf_metrics_item["ctx_perf_metrics"][ - "ctx_request_id"] == perf_metrics_item["gen_perf_metrics"][ - "ctx_request_id"] - - # Extract timing metrics - ctx_metrics = perf_metrics_item["ctx_perf_metrics"]["perf_metrics"][ - "timing_metrics"] - gen_metrics = perf_metrics_item["gen_perf_metrics"]["perf_metrics"][ - "timing_metrics"] - disagg_arrival = perf_metrics_item["disagg_server_arrival_time"] - disagg_first_token = perf_metrics_item["disagg_server_first_token_time"] - - # Validate disaggregated server timing metrics - assert disagg_arrival is not None, f"disagg_server_arrival_time is None in {request_context}" - assert disagg_first_token is not None, f"disagg_server_first_token_time is None in {request_context}" - assert isinstance( - disagg_arrival, - (int, float - )), f"disagg_server_arrival_time is not numeric in {request_context}" - assert isinstance( - disagg_first_token, (int, float) - ), f"disagg_server_first_token_time is not numeric in {request_context}" - assert disagg_arrival > 0, f"disagg_server_arrival_time is not positive in {request_context}" - assert disagg_first_token > 0, f"disagg_server_first_token_time is not positive in {request_context}" - assert disagg_arrival <= disagg_first_token, f"disagg_server_arrival_time > disagg_server_first_token_time in {request_context}" - - # Validate server-level timing metrics for context server - ctx_server_arrival = ctx_metrics.get("server_arrival_time") - ctx_server_first_token = ctx_metrics.get("server_first_token_time") - assert ctx_server_arrival is not None, f"ctx server_arrival_time is None in {request_context}" - assert ctx_server_first_token is not None, f"ctx server_first_token_time is None in {request_context}" - assert isinstance( - ctx_server_arrival, - (int, - float)), f"ctx server_arrival_time is not numeric in {request_context}" - assert isinstance( - ctx_server_first_token, - (int, float - )), f"ctx server_first_token_time is not numeric in {request_context}" - assert ctx_server_arrival <= ctx_server_first_token, f"ctx server_arrival_time > server_first_token_time in {request_context}" - assert ctx_metrics["last_token_time"] - ctx_server_first_token < 1e-3 - - # Validate server-level timing metrics for generation server - gen_server_arrival = gen_metrics.get("server_arrival_time") 
- gen_server_first_token = gen_metrics.get("server_first_token_time") - assert gen_server_arrival is not None, f"gen server_arrival_time is None in {request_context}" - assert gen_server_first_token is not None, f"gen server_first_token_time is None in {request_context}" - assert isinstance( - gen_server_arrival, - (int, - float)), f"gen server_arrival_time is not numeric in {request_context}" - assert isinstance( - gen_server_first_token, - (int, float - )), f"gen server_first_token_time is not numeric in {request_context}" - assert gen_server_arrival <= gen_server_first_token, f"gen server_arrival_time > server_first_token_time in {request_context}" - - # Validate timing relationships between different levels - # Disaggregated server should receive request before individual servers - assert disagg_arrival <= ctx_server_arrival, f"disagg_arrival > ctx_server_arrival in {request_context}" - assert disagg_arrival <= gen_server_arrival, f"disagg_arrival > gen_server_arrival in {request_context}" - - # Context should complete before generation starts - assert ctx_server_first_token <= gen_server_arrival, f"ctx_server_first_token > gen_server_arrival in {request_context}" - - # Validate internal timing consistency - ctx_arrival_time = ctx_metrics["arrival_time"] - ctx_first_token_time = ctx_metrics["first_token_time"] - gen_arrival_time = gen_metrics["arrival_time"] - gen_first_token_time = gen_metrics["first_token_time"] - - assert ctx_arrival_time <= ctx_first_token_time, f"ctx arrival_time > first_token_time in {request_context}" - assert gen_arrival_time <= gen_first_token_time, f"gen arrival_time > first_token_time in {request_context}" - - # Test KV cache transfer timing (if present) - if "kv_cache_transfer_start" in gen_metrics and "kv_cache_transfer_end" in gen_metrics: - kv_start = gen_metrics["kv_cache_transfer_start"] - kv_end = gen_metrics["kv_cache_transfer_end"] - assert gen_metrics["kv_cache_size"] > 0 - assert kv_start <= kv_end, f"kv_cache_transfer_start > kv_cache_transfer_end in {request_context}" - assert gen_arrival_time <= kv_start, f"gen_arrival_time > kv_cache_transfer_start in {request_context}" - assert kv_end <= gen_metrics[ - "first_scheduled_time"], f"kv_cache_transfer_end > first_scheduled_time in {request_context}" - - return True - - -def get_disagg_server_url_from_cfg(config_file: str) -> str: - with open(config_file, 'r') as file: - config = yaml.safe_load(file) - server_host = config.get('hostname', 'localhost') - server_port = config.get('port', 8000) - return f"http://{server_host}:{server_port}" - - -def get_test_config(test_desc, example_dir, test_root): - """Get test configuration based on test description.""" - test_configs_root = f"{test_root}/test_configs" - config_map = { - "2_ranks_diff_max_tokens": - (2, f"{test_configs_root}/disagg_config_diff_max_tokens.yaml"), - "2_ranks": (2, f"{example_dir}/disagg_config.yaml"), - "2_ranks_trt_backend": - (2, f"{test_configs_root}/disagg_config_trt_backend.yaml"), - "gen_only": (2, f"{test_configs_root}/disagg_config_gen_only.yaml"), - "gen_only_trt_backend": - (2, f"{test_configs_root}/disagg_config_gen_only_trt_backend.yaml"), - "gen_only_bs1": - (4, f"{test_configs_root}/disagg_config_gen_only_bs1.yaml"), - "4_ranks": (4, f"{test_configs_root}/disagg_config_ctxtp2_gentp1.yaml"), - "4_ranks_trt_backend": - (4, - f"{test_configs_root}/disagg_config_ctxtp2_gentp1_trt_backend.yaml"), - "cuda_graph": - (2, f"{test_configs_root}/disagg_config_cuda_graph_padding.yaml"), - "mixed": (2, 
f"{test_configs_root}/disagg_config_mixed.yaml"), - "overlap": (2, f"{test_configs_root}/disagg_config_overlap.yaml"), - "perf_metrics": (2, f"{test_configs_root}/disagg_config_metrics.yaml"), - "trtllm_sampler": - (2, f"{test_configs_root}/disagg_config_trtllm_sampler.yaml"), - "load_balance": - (4, f"{test_configs_root}/disagg_config_load_balance.yaml"), - "cache_aware_balance": - (4, f"{test_configs_root}/disagg_config_cache_aware_balance.yaml"), - "conditional": (2, - f"{test_configs_root}/disagg_config_conditional.yaml"), - "ngram": (2, f"{test_configs_root}/disagg_config_ngram.yaml"), - "ctxpp2_genpp2": - (4, f"{test_configs_root}/disagg_config_ctxpp2_genpp2.yaml"), - "ctxtp2_genpp2": - (4, f"{test_configs_root}/disagg_config_ctxtp2_genpp2.yaml"), - "ctxpp2_gentp2": - (4, f"{test_configs_root}/disagg_config_ctxpp2_gentp2.yaml"), - "ctxtp2pp2_gentp2pp2": - (8, f"{test_configs_root}/disagg_config_ctxtp2pp2_gentp2pp2.yaml"), - "ctxpp4_genpp4": - (8, f"{test_configs_root}/disagg_config_ctxpp4_genpp4.yaml"), - "ctxpp4_gentp4": - (8, f"{test_configs_root}/disagg_config_ctxpp4_gentp4.yaml"), - "deepseek_v3_lite_fp8_mpi": - (4, - f"{test_configs_root}/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_mpi.yaml" - ), - "deepseek_v3_lite_fp8_ucx": - (4, - f"{test_configs_root}/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_ucx.yaml" - ), - "deepseek_v3_lite_fp8_nixl": - (4, - f"{test_configs_root}/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_nixl.yaml" - ), - "deepseek_v3_lite_fp8_tp1": - (2, - f"{test_configs_root}/disagg_config_ctxtp1_gentp1_deepseek_v3_lite.yaml" - ), - "deepseek_v3_lite_fp8_tp1_mtp": - (2, - f"{test_configs_root}/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp.yaml" - ), - "deepseek_v3_lite_fp_8_overlap_dp": - (2, - f"{test_configs_root}/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_overlap_dp.yaml" - ), - "deepseek_v3_lite_fp8_attention_dp": - (4, - f"{test_configs_root}/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp.yaml" - ), - "deepseek_v3_lite_fp_8_attention_dp_overlap": - (4, - f"{test_configs_root}/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap.yaml" - ), - "deepseek_v3_lite_fp8_attention_dp_overlap_cuda_graph": - (4, - f"{test_configs_root}/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml" - ), - "deepseek_v3_lite_fp8_overlap_cuda_graph": - (4, - f"{test_configs_root}/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml" - ), - "deepseek_v3_lite_fp8_attention_dp_one": - (4, - f"{test_configs_root}/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one.yaml" - ), - "deepseek_v3_lite_fp8_attention_dp_one_mtp": - (4, - f"{test_configs_root}/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one_mtp.yaml" - ), - "deepseek_v3_lite_fp8_tp1_attention_dp_overlap_one_mtp": - (2, - f"{test_configs_root}/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_attention_dp_overlap.yaml" - ), - "deepseek_v3_lite_bf16_cache_aware_balance": - (4, - f"{test_configs_root}/disagg_config_cache_aware_balance_deepseek_v3.yaml" - ), - "deepseek_v3_lite_bf16_conditional": - (2, f"{test_configs_root}/disagg_config_conditional_deepseek_v3.yaml"), - "deepseek_v3_lite_fp8_tp1_two_mtp": - (2, - f"{test_configs_root}/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_two_mtp.yaml" - ), - "deepseek_v3_lite_fp8_ctxpp2_gentp2_one_mtp": - (4, - f"{test_configs_root}/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_ctxpp2_gentp2.yaml" - ), - } - - if test_desc not in config_map: - raise ValueError(f"Invalid test 
description: {test_desc}, " - f"valid descriptions are: {config_map.keys()}") - - return config_map[test_desc] - - -def get_extra_llm_config(config, suffix, cwd): - extra_llm_config = { - 'orchestrator_type': 'ray', - } - for key, value in config.items(): - if key not in ['num_instances', 'urls']: - extra_llm_config[key] = value - - temp_fd, extra_config_file = tempfile.mkstemp(suffix='_%s.yaml' % suffix, - dir=cwd) - with os.fdopen(temp_fd, 'w') as f: - yaml.dump(extra_llm_config, f) - - return extra_config_file - - -def generate_worker_commands(model_path, config, server_config, - extra_config_file, server_role): - worker_commands = [] - - assert model_path, "model path is required." - - for url in server_config['urls']: - host, port = url.split(':') - cmd = [ - 'trtllm-serve', model_path, '--host', host, '--port', port, - '--backend', config['backend'], '--extra_llm_api_options', - extra_config_file, '--server_role', server_role - ] - worker_commands.append(cmd) - return worker_commands - - -def run_client_tests(example_dir, - config_file, - test_desc, - num_iters, - env, - server_start_timeout, - prompt_file, - extra_endpoints_test, - server_url, - workers_proc, - server_proc, - use_ray=False): - """Run client tests against the disaggregated server.""" - client_dir = f"{example_dir}/clients" - for _ in range(num_iters): - client_cmd = [ - 'python3', f'{client_dir}/disagg_client.py', '-c', f'{config_file}', - '-p', f'{client_dir}/{prompt_file}', '--ignore-eos', - '--server-start-timeout', - str(server_start_timeout) - ] - if prompt_file == "long_prompts.json": - # Use max_tokens 4 for long prompts to reduce test time - client_cmd.extend(['--max-tokens', '4']) - - # Prepare poll processes - worker_processes = [] - if use_ray: - for proc_cm in workers_proc: - worker_processes.append(proc_cm.__enter__()) - else: - worker_processes = [workers_proc] - - poll_procs = worker_processes + [server_proc] - check_call(client_cmd, env=env, poll_procs=poll_procs) - - # Streaming client run - streaming_client_cmd = client_cmd + [ - '--streaming', '-o', 'output_streaming.json' - ] - check_call(streaming_client_cmd, env=env, poll_procs=poll_procs) - - # Run the chat completion endpoint test only for TinyLlama - if test_desc == "overlap" or test_desc == "trtllm_sampler": - chat_client_cmd = client_cmd + [ - '-e', 'chat', '-o', 'output_chat.json' - ] - check_call(chat_client_cmd, env=env, poll_procs=poll_procs) - - streaming_chat_client_cmd = chat_client_cmd + [ - '--streaming', '-o', 'output_streaming_chat.json' - ] - check_call(streaming_chat_client_cmd, - env=env, - poll_procs=poll_procs) - - # Skip output verification for long prompts test - if prompt_file == "long_prompts.json": - continue - - if extra_endpoints_test is not None: - extra_endpoints_test(server_url) - - # Verify outputs - not_expected_strings = ["Berlin Berlin"] - - output_files = ['output.json', 'output_streaming.json'] - if test_desc == "overlap" or test_desc == "trtllm_sampler": - # Disable streaming chat completion for overlap test - # due to bug - output_files.extend(['output_chat.json']) - - if test_desc.startswith("gen_only"): - continue - - for output_file in output_files: - with open(output_file, 'r') as f: - content = f.read() - if "deepseek_v3_lite" in test_desc or output_file == "output_chat.json": - expected_strings = [ - "Berlin", ["Asyncio is a", "Asyncio module in"] - ] - else: - expected_strings = [ - "The capital of Germany is Berlin", - "Asyncio is a Python library" - ] - for expected_string in expected_strings: - if 
isinstance(expected_string, list): - # At least one of the strings in the list should be found in the content - assert any( - string in content for string in expected_string - ), f"None of the strings in {expected_string} found in {output_file}" - else: - assert expected_string in content, f"Expected string '{expected_string}' not found in {output_file}" - for not_expected_string in not_expected_strings: - assert not_expected_string not in content, f"Unexpected string '{not_expected_string}' found in {output_file}" - - -def run_disaggregated_test(example_dir, - test_desc, - num_iters=5, - env=None, - cwd=None, - prompt_file="prompts.json", - extra_endpoints_test: Callable[[str], None] = None, - model_path=None): - """Run disaggregated test with given configuration.""" - cleanup_output_files() - run_env = env.copy() - run_env["UCX_TLS"] = "^ib" - - num_ranks, config_file = get_test_config(test_desc, example_dir, - os.path.dirname(__file__)) - - use_ray = mpi_disabled() - if not use_ray: - workers_cmd = [ - 'mpirun', '--allow-run-as-root', '--oversubscribe', '-n', - str(num_ranks), 'trtllm-serve', 'disaggregated_mpi_worker', '-c', - config_file - ] - else: - pytest.skip( - "https://nvbugs/5584607 Ray orchestrator is not supported with NIXL(DEFAULT) cache transceiver backend." - ) - with open(config_file, 'r') as f: - config = yaml.safe_load(f) - - if config['backend'] != "pytorch": - pytest.skip( - "Ray orchestrator is only supported with pytorch backend.") - - extra_config_files = [] - workers_cmds = [] - subprocess.run(['ray', 'start', '--head', '--disable-usage-stats'], - check=True) - - # Generate ctx and gen server worker commands - ctx_extra_config_file = get_extra_llm_config(config['context_servers'], - "ctx", cwd) - extra_config_files.append(ctx_extra_config_file) - workers_cmds.extend( - generate_worker_commands(model_path, config, - config['context_servers'], - ctx_extra_config_file, 'context')) - - gen_extra_config_file = get_extra_llm_config( - config['generation_servers'], "gen", cwd) - extra_config_files.append(gen_extra_config_file) - workers_cmds.extend( - generate_worker_commands(model_path, config, - config['generation_servers'], - gen_extra_config_file, 'generation')) - - server_start_timeout = 1200 - server_cmd = [ - 'trtllm-serve', 'disaggregated', '--server_start_timeout', - str(server_start_timeout), '-c', config_file - ] - server_url = get_disagg_server_url_from_cfg(config_file) - - try: - if not use_ray: - with ( # Start workers - open('output_workers.log', 'w') as output_workers, - popen(workers_cmd, - stdout=output_workers, - stderr=subprocess.STDOUT, - env=run_env, - cwd=cwd) as workers_proc, - # Start server - open('output_disagg.log', 'w') as output_disagg, - popen(server_cmd, - stdout=output_disagg, - stderr=subprocess.STDOUT, - env=run_env, - cwd=cwd) as server_proc): - run_client_tests(example_dir, - config_file, - test_desc, - num_iters, - env, - server_start_timeout, - prompt_file, - extra_endpoints_test, - server_url, - workers_proc, - server_proc, - use_ray=False) - - else: - workers_proc = [] - with contextlib.ExitStack() as stack: - workers_log = stack.enter_context( - open('output_workers.log', 'w')) - - for cmd in workers_cmds: - proc = stack.enter_context( - popen( - cmd, - stdout=workers_log, - stderr=subprocess.STDOUT, - env=run_env, - cwd=cwd, - )) - workers_proc.append(proc) - - output_disagg = stack.enter_context( - open('output_disagg.log', 'w')) - server_proc = stack.enter_context( - popen(server_cmd, - stdout=output_disagg, - 
stderr=subprocess.STDOUT, - env=run_env, - cwd=cwd)) - - if not wait_for_server("localhost", - 8000, - timeout_seconds=server_start_timeout): - raise RuntimeError( - f"Disaggregated server failed to start within {server_start_timeout} seconds" - ) - - run_client_tests(example_dir, - config_file, - test_desc, - num_iters, - env, - server_start_timeout, - prompt_file, - extra_endpoints_test, - server_url, - workers_proc, - server_proc, - use_ray=True) - except Exception: - # Print outputs on error - logger.error("-------- Workers output --------") - with open('output_workers.log', 'r') as f: - logger.error(f.read()) - - logger.error("-------- Disagg server output --------") - with open('output_disagg.log', 'r') as f: - logger.error(f.read()) - raise - finally: - if use_ray: - subprocess.run(['ray', 'stop', '--force'], check=False) - for extra_file in extra_config_files: - if os.path.exists(extra_file): - os.remove(extra_file) - elif 'server_proc' in locals() and 'workers_proc' in locals(): - server_proc.terminate() - workers_proc.terminate() - server_proc.wait() - workers_proc.wait() - - -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_diff_max_tokens(disaggregated_test_root, - disaggregated_example_root, llm_venv, - llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "2_ranks_diff_max_tokens", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory(), - prompt_file="long_prompts.json") - - -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_single_gpu_with_mpirun(disaggregated_test_root, - disaggregated_example_root, - llm_venv, llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "2_ranks", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_single_gpu_with_mpirun_trt_backend( - disaggregated_test_root, disaggregated_example_root, llm_venv, - llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "2_ranks_trt_backend", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_benchmark_gen_only(disaggregated_test_root, - disaggregated_example_root, llm_venv, - llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - 
os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - env = llm_venv._new_env.copy() - env['TRTLLM_DISAGG_BENCHMARK_GEN_ONLY'] = '1' - run_disaggregated_test(disaggregated_example_root, - "gen_only", - env=env, - cwd=llm_venv.get_working_directory()) - - -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_benchmark_gen_only_trt_backend( - disaggregated_test_root, disaggregated_example_root, llm_venv, - llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - env = llm_venv._new_env.copy() - env['TRTLLM_DISAGG_BENCHMARK_GEN_ONLY'] = '1' - run_disaggregated_test(disaggregated_example_root, - "gen_only_trt_backend", - env=env, - cwd=llm_venv.get_working_directory()) - - -@pytest.mark.skip_less_device(4) -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_genbs1(disaggregated_test_root, - disaggregated_example_root, llm_venv, - llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - env = llm_venv._new_env.copy() - env['TRTLLM_DISAGG_BENCHMARK_GEN_ONLY'] = '1' - run_disaggregated_test(disaggregated_example_root, - "gen_only_bs1", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@pytest.mark.skip_less_device(2) -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_multi_gpu_with_mpirun(disaggregated_test_root, - disaggregated_example_root, - llm_venv, llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "4_ranks", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@pytest.mark.skip_less_device(2) -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_multi_gpu_with_mpirun_trt_backend( - disaggregated_test_root, disaggregated_example_root, llm_venv, - llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "4_ranks_trt_backend", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_cuda_graph(disaggregated_test_root, llm_venv, - disaggregated_example_root, llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in 
src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "cuda_graph", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_mixed(disaggregated_test_root, llm_venv, - disaggregated_example_root, llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "mixed", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_overlap(disaggregated_test_root, llm_venv, - disaggregated_example_root, llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "overlap", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_perf_metrics(disaggregated_test_root, llm_venv, - disaggregated_example_root, - llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - def extra_endpoints_test(server_url: str): - import json - import urllib.request - - with urllib.request.urlopen(f"{server_url}/perf_metrics", - timeout=10) as resp: - assert resp.status == 200 - perf_metrics = json.load(resp) - assert len(perf_metrics) > 0 - item = perf_metrics[0] - - # Use helper function to validate all timing metrics comprehensively - validate_timing_metrics(item, "perf_metrics test") - - run_disaggregated_test(disaggregated_example_root, - "perf_metrics", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory(), - extra_endpoints_test=extra_endpoints_test) - - -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_kv_cache_time_output(disaggregated_test_root, llm_venv, - disaggregated_example_root, - llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - output_path = os.path.join(llm_venv.get_working_directory(), "cache_time") - run_disaggregated_test(disaggregated_example_root, - "perf_metrics", - env=llm_venv._new_env - | {"TRTLLM_KVCACHE_TIME_OUTPUT_PATH": output_path}, - cwd=llm_venv.get_working_directory()) - assert os.path.isdir(output_path) - send_file = os.path.join(output_path, "rank_0_send.csv") - recv_file = 
os.path.join(output_path, "rank_1_recv.csv") - assert os.path.exists(send_file) - assert os.path.exists(recv_file) - with open(send_file, "r") as f: - lines = f.readlines() - assert len(lines) > 1 - assert lines[0].startswith( - "RequestID,RequestInfo,Preparation,Preprocess,Transmissions,Postprocess" - ) - assert ",Delay,Duration,Bandwidth(Gbps)" in lines[0] - # get a send sample and match the recv - sample = lines[1].split(',') - assert len(sample) >= 9 - with open(recv_file, "r") as f: - lines = f.readlines() - assert len(lines) > 1 - matched = False - for line in lines: - sample_recv = line.split(',') - if sample_recv[0] == sample[0]: - matched = True - assert float(sample_recv[1]) <= float(sample[1]) - break - assert matched - - -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_trtllm_sampler(disaggregated_test_root, llm_venv, - disaggregated_example_root, - llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "trtllm_sampler", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_load_balance(disaggregated_test_root, llm_venv, - disaggregated_example_root, - llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "load_balance", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_cache_aware_balance(disaggregated_test_root, llm_venv, - disaggregated_example_root, - llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "cache_aware_balance", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_conditional(disaggregated_test_root, llm_venv, - disaggregated_example_root, - llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "conditional", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_ngram(disaggregated_test_root, llm_venv, - disaggregated_example_root, llama_model_root): 
- src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - run_disaggregated_test(disaggregated_example_root, - "ngram", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@pytest.mark.skip_less_device(4) -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_ctxpp2_genpp2(disaggregated_test_root, llm_venv, - disaggregated_example_root, - llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - run_disaggregated_test(disaggregated_example_root, - "ctxpp2_genpp2", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory(), - model_path=llama_model_root) - - -@pytest.mark.skip_less_device(4) -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_ctxtp2_genpp2(disaggregated_test_root, llm_venv, - disaggregated_example_root, - llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - run_disaggregated_test(disaggregated_example_root, - "ctxtp2_genpp2", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory(), - model_path=llama_model_root) - - -@pytest.mark.skip_less_device(4) -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_ctxpp2_gentp2(disaggregated_test_root, llm_venv, - disaggregated_example_root, - llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - run_disaggregated_test(disaggregated_example_root, - "ctxpp2_gentp2", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory(), - model_path=llama_model_root) - - -@pytest.mark.skip_less_device(8) -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_ctxtp2pp2_gentp2pp2(disaggregated_test_root, llm_venv, - disaggregated_example_root, - llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - run_disaggregated_test(disaggregated_example_root, - "ctxtp2pp2_gentp2pp2", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@pytest.mark.skip_less_device(8) -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_ctxpp4_genpp4(disaggregated_test_root, llm_venv, - disaggregated_example_root, - llama_model_root): - src_dst_dict = { - llama_model_root: - 
f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - run_disaggregated_test(disaggregated_example_root, - "ctxpp4_genpp4", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -#tiny llama pp4 will have uneven layer per pp. pp4 -@pytest.mark.skip_less_device(8) -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_ctxpp4_gentp4(disaggregated_test_root, llm_venv, - disaggregated_example_root, - llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - run_disaggregated_test(disaggregated_example_root, - "ctxpp4_gentp4", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory(), - model_path=llama_model_root) - - -@skip_no_hopper -@pytest.mark.skip_less_device(4) -@pytest.mark.parametrize("deepseek_v3_model_root", ['DeepSeek-V3-Lite-fp8'], - indirect=True) -def test_disaggregated_deepseek_v3_lite_fp8_mpi(disaggregated_test_root, - disaggregated_example_root, - llm_venv, - deepseek_v3_model_root): - src_dst_dict = { - deepseek_v3_model_root: - f"{llm_venv.get_working_directory()}/DeepSeek-V3-Lite/fp8", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - env = llm_venv._new_env.copy() - env["TRTLLM_USE_MPI_KVCACHE"] = "1" - run_disaggregated_test(disaggregated_example_root, - "deepseek_v3_lite_fp8_mpi", - env=env, - cwd=llm_venv.get_working_directory()) - - -@skip_no_hopper -@pytest.mark.parametrize("deepseek_v3_model_root", ['DeepSeek-V3-Lite-fp8'], - indirect=True) -def test_disaggregated_deepseek_v3_lite_fp8_tp1_single_gpu( - disaggregated_test_root, disaggregated_example_root, llm_venv, - deepseek_v3_model_root): - src_dst_dict = { - deepseek_v3_model_root: - f"{llm_venv.get_working_directory()}/DeepSeek-V3-Lite/fp8", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "deepseek_v3_lite_fp8_tp1", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@skip_no_hopper -@pytest.mark.parametrize("deepseek_v3_model_root", ['DeepSeek-V3-Lite-fp8'], - indirect=True) -def test_disaggregated_deepseek_v3_lite_fp8_tp1_single_gpu_mtp( - disaggregated_test_root, disaggregated_example_root, llm_venv, - deepseek_v3_model_root): - src_dst_dict = { - deepseek_v3_model_root: - f"{llm_venv.get_working_directory()}/DeepSeek-V3-Lite/fp8", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "deepseek_v3_lite_fp8_tp1_mtp", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@pytest.mark.skip_less_device(4) -@skip_no_hopper -@pytest.mark.parametrize("deepseek_v3_model_root", ['DeepSeek-V3-Lite-fp8'], - indirect=True) -def test_disaggregated_deepseek_v3_lite_fp8_ctxpp2_gentp2_one_mtp( 
- disaggregated_test_root, disaggregated_example_root, llm_venv, - deepseek_v3_model_root): - #add one mtp layer, pp rank0 will have 15 layer, pp rank 1 will have 16 layers. - src_dst_dict = { - deepseek_v3_model_root: - f"{llm_venv.get_working_directory()}/DeepSeek-V3-Lite/fp8", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "deepseek_v3_lite_fp8_ctxpp2_gentp2_one_mtp", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory(), - model_path=deepseek_v3_model_root) - - -@skip_no_hopper -@skip_arm -@pytest.mark.skip_less_device(4) -@pytest.mark.parametrize("deepseek_v3_model_root", ['DeepSeek-V3-Lite-fp8'], - indirect=True) -def test_disaggregated_deepseek_v3_lite_fp8_ucx(disaggregated_test_root, - disaggregated_example_root, - llm_venv, - deepseek_v3_model_root): - - src_dst_dict = { - deepseek_v3_model_root: - f"{llm_venv.get_working_directory()}/DeepSeek-V3-Lite/fp8", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - env = llm_venv._new_env.copy() - env["TRTLLM_USE_UCX_KVCACHE"] = "1" - env["UCX_TLS"] = "^ib" - run_disaggregated_test(disaggregated_example_root, - "deepseek_v3_lite_fp8_ucx", - env=env, - cwd=llm_venv.get_working_directory(), - model_path=deepseek_v3_model_root) - - -@skip_no_hopper -@skip_arm -@pytest.mark.parametrize("deepseek_v3_model_root", ['DeepSeek-V3-Lite-fp8'], - indirect=True) -def test_disaggregated_deepseek_v3_lite_fp8_nixl(disaggregated_test_root, - disaggregated_example_root, - llm_venv, - deepseek_v3_model_root): - - src_dst_dict = { - deepseek_v3_model_root: - f"{llm_venv.get_working_directory()}/DeepSeek-V3-Lite/fp8", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - env = llm_venv._new_env.copy() - env["TRTLLM_USE_NIXL_KVCACHE"] = "1" - env["UCX_TLS"] = "^ib" - run_disaggregated_test(disaggregated_example_root, - "deepseek_v3_lite_fp8_nixl", - env=env, - cwd=llm_venv.get_working_directory(), - model_path=deepseek_v3_model_root) - - -@skip_no_hopper -@skip_arm -@pytest.mark.parametrize("deepseek_v3_model_root", ['DeepSeek-V3-Lite-fp8'], - indirect=True) -def test_disaggregated_deepseek_v3_lite_fp8_ucx_tp1_single_gpu( - disaggregated_test_root, disaggregated_example_root, llm_venv, - deepseek_v3_model_root): - src_dst_dict = { - deepseek_v3_model_root: - f"{llm_venv.get_working_directory()}/DeepSeek-V3-Lite/fp8", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - env = llm_venv._new_env.copy() - env["TRTLLM_USE_UCX_KVCACHE"] = "1" - env["UCX_TLS"] = "^ib" - - run_disaggregated_test(disaggregated_example_root, - "deepseek_v3_lite_fp8_tp1", - env=env, - cwd=llm_venv.get_working_directory()) - - -@skip_no_hopper -@pytest.mark.skip_less_device(4) -@pytest.mark.parametrize("deepseek_v3_model_root", ['DeepSeek-V3-Lite-fp8'], - indirect=True) -def test_disaggregated_deepseek_v3_lite_fp8_attention_dp( - disaggregated_test_root, disaggregated_example_root, llm_venv, - deepseek_v3_model_root): - src_dst_dict = { - deepseek_v3_model_root: - f"{llm_venv.get_working_directory()}/DeepSeek-V3-Lite/fp8", - } - - for 
src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "deepseek_v3_lite_fp8_attention_dp", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@skip_no_hopper -@pytest.mark.skip_less_device(4) -@pytest.mark.parametrize("deepseek_v3_model_root", ['DeepSeek-V3-Lite-fp8'], - indirect=True) -def test_disaggregated_deepseek_v3_lite_fp8_attention_dp_overlap( - disaggregated_test_root, llm_venv, disaggregated_example_root, - deepseek_v3_model_root): - src_dst_dict = { - deepseek_v3_model_root: - f"{llm_venv.get_working_directory()}/DeepSeek-V3-Lite/fp8", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "deepseek_v3_lite_fp_8_attention_dp_overlap", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@skip_no_hopper -@pytest.mark.skip_less_device(4) -@pytest.mark.parametrize("deepseek_v3_model_root", ['DeepSeek-V3-Lite-fp8'], - indirect=True) -def test_disaggregated_deepseek_v3_lite_fp8_attention_dp_overlap_cuda_graph( - disaggregated_test_root, disaggregated_example_root, llm_venv, - deepseek_v3_model_root): - src_dst_dict = { - deepseek_v3_model_root: - f"{llm_venv.get_working_directory()}/DeepSeek-V3-Lite/fp8", - } - - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test( - disaggregated_example_root, - "deepseek_v3_lite_fp8_attention_dp_overlap_cuda_graph", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@skip_no_hopper -@pytest.mark.skip_less_device(4) -@pytest.mark.parametrize("deepseek_v3_model_root", ['DeepSeek-V3-Lite-fp8'], - indirect=True) -def test_disaggregated_deepseek_v3_lite_fp8_overlap_cuda_graph( - disaggregated_test_root, disaggregated_example_root, llm_venv, - deepseek_v3_model_root): - src_dst_dict = { - deepseek_v3_model_root: - f"{llm_venv.get_working_directory()}/DeepSeek-V3-Lite/fp8", - } - - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "deepseek_v3_lite_fp8_overlap_cuda_graph", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@skip_no_hopper -@pytest.mark.skip_less_device(4) -@pytest.mark.parametrize("deepseek_v3_model_root", ['DeepSeek-V3-Lite-fp8'], - indirect=True) -def test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one( - disaggregated_test_root, disaggregated_example_root, llm_venv, - deepseek_v3_model_root): - src_dst_dict = { - deepseek_v3_model_root: - f"{llm_venv.get_working_directory()}/DeepSeek-V3-Lite/fp8", - } - - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "deepseek_v3_lite_fp8_attention_dp_one", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@skip_no_hopper -@pytest.mark.skip_less_device(4) -@pytest.mark.parametrize("deepseek_v3_model_root", ['DeepSeek-V3-Lite-fp8'], - indirect=True) -def 
test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one_mtp( - disaggregated_test_root, disaggregated_example_root, llm_venv, - deepseek_v3_model_root): - src_dst_dict = { - deepseek_v3_model_root: - f"{llm_venv.get_working_directory()}/DeepSeek-V3-Lite/fp8", - } - - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "deepseek_v3_lite_fp8_attention_dp_one_mtp", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@skip_no_hopper -@pytest.mark.skip_less_device(4) -@pytest.mark.parametrize("deepseek_v3_model_root", ['DeepSeek-V3-Lite-fp8'], - indirect=True) -def test_disaggregated_deepseek_v3_lite_fp8_tp1_attention_dp_overlap_one_mtp( - disaggregated_test_root, disaggregated_example_root, llm_venv, - deepseek_v3_model_root): - - src_dst_dict = { - deepseek_v3_model_root: - f"{llm_venv.get_working_directory()}/DeepSeek-V3-Lite/fp8", - } - - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test( - disaggregated_example_root, - "deepseek_v3_lite_fp8_tp1_attention_dp_overlap_one_mtp", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory(), - model_path=deepseek_v3_model_root) - - -@skip_no_hopper -@pytest.mark.parametrize("deepseek_v3_model_root", ['DeepSeek-V3-Lite-bf16'], - indirect=True) -def test_disaggregated_deepseek_v3_lite_bf16_cache_aware_balance( - disaggregated_test_root, disaggregated_example_root, llm_venv, - deepseek_v3_model_root): - src_dst_dict = { - deepseek_v3_model_root: - f"{llm_venv.get_working_directory()}/DeepSeek-V3-Lite/bf16", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "deepseek_v3_lite_bf16_cache_aware_balance", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@skip_no_hopper -@pytest.mark.parametrize("deepseek_v3_model_root", ['DeepSeek-V3-Lite-bf16'], - indirect=True) -def test_disaggregated_deepseek_v3_lite_bf16_conditional( - disaggregated_test_root, disaggregated_example_root, llm_venv, - deepseek_v3_model_root): - src_dst_dict = { - deepseek_v3_model_root: - f"{llm_venv.get_working_directory()}/DeepSeek-V3-Lite/bf16", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "deepseek_v3_lite_bf16_conditional", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@skip_no_hopper -@pytest.mark.parametrize("deepseek_v3_model_root", ['DeepSeek-V3-Lite-fp8'], - indirect=True) -def test_disaggregated_deepseek_v3_lite_fp8_tp1_two_mtp( - disaggregated_test_root, disaggregated_example_root, llm_venv, - deepseek_v3_model_root): - src_dst_dict = { - deepseek_v3_model_root: - f"{llm_venv.get_working_directory()}/DeepSeek-V3-Lite/fp8", - } - - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "deepseek_v3_lite_fp8_tp1_two_mtp", - env=llm_venv._new_env, - 
cwd=llm_venv.get_working_directory()) - - -@pytest.fixture(scope="module") -def benchmark_root(): - llm_root = os.getenv("LLM_ROOT") - return os.path.join(llm_root, "tensorrt_llm", "serve", "scripts") - - -@pytest.fixture(scope="module") -def shared_gpt_path(): - DEFAULT_LLM_MODEL_ROOT = os.path.join("/scratch.trt_llm_data", "llm-models") - LLM_MODELS_ROOT = os.environ.get("LLM_MODELS_ROOT", DEFAULT_LLM_MODEL_ROOT) - return os.path.join(LLM_MODELS_ROOT, "datasets", - "ShareGPT_V3_unfiltered_cleaned_split.json") - - -@pytest.fixture(scope="function") -def benchmark_model_root(request): - models_root = llm_models_root() - if (request.param == "DeepSeek-V3-Lite-fp8"): - model_path = os.path.join(models_root, "DeepSeek-V3-Lite", "fp8") - elif (request.param == "DeepSeek-V3-Lite-bf16"): - model_path = os.path.join(models_root, "DeepSeek-V3-Lite", "bf16") - elif request.param == "llama-v3-8b-hf": - model_path = os.path.join(models_root, "llama-models-v3", "8B") - elif request.param == "llama-3.1-8b-instruct-hf-fp8": - model_path = os.path.join(models_root, "llama-3.1-model", - "Llama-3.1-8B-Instruct-FP8") - else: - raise ValueError(f"Failed to find the model: {request.param}") - return model_path - - -def run_disaggregated_benchmark(example_dir, - config_file, - benchmark_root, - benchmark_model_root, - shared_gpt_path, - env=None, - cwd=None): - """Run disaggregated test with given configuration.""" - run_env = env.copy() - run_env["UCX_TLS"] = "^ib" - num_rank = 2 - workers_cmd = [ - 'mpirun', '--allow-run-as-root', '--oversubscribe', '-n', - str(num_rank), 'trtllm-serve', 'disaggregated_mpi_worker', '-c', - config_file - ] - - server_start_timeout = 900 - server_cmd = [ - 'trtllm-serve', 'disaggregated', '--server_start_timeout', - str(server_start_timeout), '-c', config_file - ] - try: - with ( # Start workers - open('output_workers.log', 'w') as output_workers, - popen(workers_cmd, - stdout=output_workers, - stderr=subprocess.STDOUT, - env=run_env, - cwd=cwd) as workers_proc, - # Start server - open('output_disagg.log', 'w') as output_disagg, - popen(server_cmd, - stdout=output_disagg, - stderr=subprocess.STDOUT, - env=run_env, - cwd=cwd) as server_proc): - # Ensure the sever has started - client_dir = f"{example_dir}/clients" - client_cmd = [ - 'python3', f'{client_dir}/disagg_client.py', '-c', - f'{example_dir}/disagg_config.yaml', '-p', - f'{client_dir}/prompts.json', '--ignore-eos', - '--server-start-timeout', - str(server_start_timeout) - ] - # Warm up - check_call(client_cmd, - env=env, - poll_procs=[workers_proc, server_proc]) - # Start Benchmark - benchmark_script = os.path.join(benchmark_root, - "benchmark_serving.py") - benchmark_cmd = [ - 'python3', - benchmark_script, - '--model', - benchmark_model_root, - '--tokenizer', - benchmark_model_root, - '--dataset-name', - 'random', - '--dataset-path', - shared_gpt_path, - '--random-input-len', - '256', - '--random-output-len', - '64', - '--random-prefix-len', - '0', - '--num-prompts', - '320', - '--max-concurrency', - '32', - '--host', - 'localhost', - '--port', - '8000', - '--ignore-eos', - '--no-test-input', - '--percentile-metrics', - 'e2el,ttft', - ] - # warm up - check_call(benchmark_cmd, env=env) - output = check_output(benchmark_cmd, env=env) - e2el_pattern = r"Median E2EL \(ms\):\s*(\d+\.?\d*)" - ttft_pattern = r"Median TTFT \(ms\):\s*(\d+\.?\d*)" - e2el_match = re.search(e2el_pattern, output) - ttft_match = re.search(ttft_pattern, output) - if e2el_match and ttft_match: - median_e2el = float(e2el_match.group(1)) - median_ttft 
= float(ttft_match.group(1)) - return median_e2el, median_ttft - else: - raise ValueError("No benchmark result found") - - except Exception: - # Print outputs on error - logger.error("-------- Workers output --------") - with open('output_workers.log', 'r') as f: - logger.error(f.read()) - - logger.error("-------- Disagg server output --------") - with open('output_disagg.log', 'r') as f: - logger.error(f.read()) - raise - finally: - server_proc.terminate() - workers_proc.terminate() - server_proc.wait() - workers_proc.wait() - - -def get_config_for_benchmark(model_root, backend): - serve_config = { - "model": model_root, - "hostname": "localhost", - "port": 8000, - "backend": "pytorch", - "context_servers": { - "num_instances": 1, - "max_batch_size": 2, - "max_num_tokens": 384, - "max_seq_len": 384, - "tensor_parallel_size": 1, - "pipeline_parallel_size": 1, - "disable_overlap_scheduler": True, - "cache_transceiver_config": { - "backend": backend, - "max_tokens_in_buffer": 512, - }, - "urls": ["localhost:8001"] - }, - "generation_servers": { - "num_instances": 1, - "tensor_parallel_size": 1, - "pipeline_parallel_size": 1, - "max_batch_size": 2, - "max_num_tokens": 384, - "max_seq_len": 384, - "cache_transceiver_config": { - "backend": backend, - "max_tokens_in_buffer": 512, - }, - "urls": ["localhost:8002"] - } - } - return serve_config - - -@pytest.mark.parametrize("benchmark_model_root", [ - 'DeepSeek-V3-Lite-fp8', 'DeepSeek-V3-Lite-bf16', 'llama-v3-8b-hf', - 'llama-3.1-8b-instruct-hf-fp8' -], - indirect=True) -def test_disaggregated_benchmark_on_diff_backends( - disaggregated_test_root, disaggregated_example_root, llm_venv, - benchmark_model_root, benchmark_root, shared_gpt_path): - if "DeepSeek-V3-Lite" in benchmark_model_root and "fp8" in benchmark_model_root and get_sm_version( - ) != 90: - pytest.skip("The test should only run on Hopper") - nixl_config = get_config_for_benchmark(benchmark_model_root, "NIXL") - ucx_config = get_config_for_benchmark(benchmark_model_root, "UCX") - temp_dir = tempfile.TemporaryDirectory() - nixl_config_path = os.path.join(temp_dir.name, "nixl_config.yaml") - ucx_config_path = os.path.join(temp_dir.name, "ucx_config.yaml") - with open(nixl_config_path, 'w', encoding='utf-8') as f: - yaml.dump(nixl_config, f) - with open(ucx_config_path, 'w', encoding='utf-8') as f: - yaml.dump(ucx_config, f) - - env = llm_venv._new_env.copy() - nixl_e2el, nixl_ttft = run_disaggregated_benchmark( - disaggregated_example_root, - nixl_config_path, - benchmark_root, - benchmark_model_root, - shared_gpt_path, - env=env, - cwd=llm_venv.get_working_directory()) - ucx_e2el, ucx_ttft = run_disaggregated_benchmark( - disaggregated_example_root, - ucx_config_path, - benchmark_root, - benchmark_model_root, - shared_gpt_path, - env=env, - cwd=llm_venv.get_working_directory()) - print(f"Nixl E2EL: {nixl_e2el} ms, UCX E2EL: {ucx_e2el} ms") - print(f"Nixl TTFT: {nixl_ttft} ms, UCX TTFT: {ucx_ttft} ms") - - assert ucx_e2el > 0 and nixl_e2el > 0 and nixl_e2el < 1.05 * ucx_e2el - assert ucx_ttft > 0 and nixl_ttft > 0 and nixl_ttft < 1.05 * ucx_ttft diff --git a/tests/integration/defs/disaggregated/test_disaggregated_benchmark.py b/tests/integration/defs/disaggregated/test_disaggregated_benchmark.py new file mode 100644 index 00000000000..3f36b638003 --- /dev/null +++ b/tests/integration/defs/disaggregated/test_disaggregated_benchmark.py @@ -0,0 +1,255 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import re +import subprocess +import tempfile + +import pytest +import yaml +from defs.conftest import get_sm_version, llm_models_root +from defs.trt_test_alternative import check_call, check_output, popen + +from tensorrt_llm.logger import logger + + +@pytest.fixture(scope="module") +def benchmark_root(): + llm_root = os.getenv("LLM_ROOT") + return os.path.join(llm_root, "tensorrt_llm", "serve", "scripts") + + +@pytest.fixture(scope="module") +def shared_gpt_path(): + DEFAULT_LLM_MODEL_ROOT = os.path.join("/scratch.trt_llm_data", "llm-models") + LLM_MODELS_ROOT = os.environ.get("LLM_MODELS_ROOT", DEFAULT_LLM_MODEL_ROOT) + return os.path.join(LLM_MODELS_ROOT, "datasets", + "ShareGPT_V3_unfiltered_cleaned_split.json") + + +@pytest.fixture(scope="function") +def benchmark_model_root(request): + models_root = llm_models_root() + if (request.param == "DeepSeek-V3-Lite-fp8"): + model_path = os.path.join(models_root, "DeepSeek-V3-Lite", "fp8") + elif (request.param == "DeepSeek-V3-Lite-bf16"): + model_path = os.path.join(models_root, "DeepSeek-V3-Lite", "bf16") + elif request.param == "llama-v3-8b-hf": + model_path = os.path.join(models_root, "llama-models-v3", "8B") + elif request.param == "llama-3.1-8b-instruct-hf-fp8": + model_path = os.path.join(models_root, "llama-3.1-model", + "Llama-3.1-8B-Instruct-FP8") + else: + raise ValueError(f"Failed to find the model: {request.param}") + return model_path + + +def run_disaggregated_benchmark(example_dir, + config_file, + benchmark_root, + benchmark_model_root, + shared_gpt_path, + env=None, + cwd=None): + """Run disaggregated benchmark with given configuration.""" + run_env = env.copy() + run_env["UCX_TLS"] = "^ib" + num_rank = 2 + workers_cmd = [ + 'mpirun', '--allow-run-as-root', '--oversubscribe', '-n', + str(num_rank), 'trtllm-serve', 'disaggregated_mpi_worker', '-c', + config_file + ] + + server_start_timeout = 900 + server_cmd = [ + 'trtllm-serve', 'disaggregated', '--server_start_timeout', + str(server_start_timeout), '-c', config_file + ] + try: + with ( # Start workers + open('output_workers.log', 'w') as output_workers, + popen(workers_cmd, + stdout=output_workers, + stderr=subprocess.STDOUT, + env=run_env, + cwd=cwd) as workers_proc, + # Start server + open('output_disagg.log', 'w') as output_disagg, + popen(server_cmd, + stdout=output_disagg, + stderr=subprocess.STDOUT, + env=run_env, + cwd=cwd) as server_proc): + # Ensure the server has started + client_dir = f"{example_dir}/clients" + client_cmd = [ + 'python3', f'{client_dir}/disagg_client.py', '-c', + f'{example_dir}/disagg_config.yaml', '-p', + f'{client_dir}/prompts.json', '--ignore-eos', + '--server-start-timeout', + str(server_start_timeout) + ] + # Warm up + check_call(client_cmd, + env=env, + poll_procs=[workers_proc, server_proc]) + # Start Benchmark + benchmark_script = os.path.join(benchmark_root, + "benchmark_serving.py") + benchmark_cmd = [ + 'python3', + benchmark_script, + 
'--model', + benchmark_model_root, + '--tokenizer', + benchmark_model_root, + '--dataset-name', + 'random', + '--dataset-path', + shared_gpt_path, + '--random-input-len', + '256', + '--random-output-len', + '64', + '--random-prefix-len', + '0', + '--num-prompts', + '320', + '--max-concurrency', + '32', + '--host', + 'localhost', + '--port', + '8000', + '--ignore-eos', + '--no-test-input', + '--percentile-metrics', + 'e2el,ttft', + ] + # warm up + check_call(benchmark_cmd, env=env) + output = check_output(benchmark_cmd, env=env) + e2el_pattern = r"Median E2EL \(ms\):\s*(\d+\.?\d*)" + ttft_pattern = r"Median TTFT \(ms\):\s*(\d+\.?\d*)" + e2el_match = re.search(e2el_pattern, output) + ttft_match = re.search(ttft_pattern, output) + if e2el_match and ttft_match: + median_e2el = float(e2el_match.group(1)) + median_ttft = float(ttft_match.group(1)) + return median_e2el, median_ttft + else: + raise ValueError("No benchmark result found") + + except Exception: + # Print outputs on error + logger.error("-------- Workers output --------") + with open('output_workers.log', 'r') as f: + logger.error(f.read()) + + logger.error("-------- Disagg server output --------") + with open('output_disagg.log', 'r') as f: + logger.error(f.read()) + raise + finally: + server_proc.terminate() + workers_proc.terminate() + server_proc.wait() + workers_proc.wait() + + +def get_config_for_benchmark(model_root, backend): + """Generate config for benchmark test.""" + serve_config = { + "model": model_root, + "hostname": "localhost", + "port": 8000, + "backend": "pytorch", + "context_servers": { + "num_instances": 1, + "max_batch_size": 2, + "max_num_tokens": 384, + "max_seq_len": 384, + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "disable_overlap_scheduler": True, + "cache_transceiver_config": { + "backend": backend, + "max_tokens_in_buffer": 512, + }, + "urls": ["localhost:8001"] + }, + "generation_servers": { + "num_instances": 1, + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "max_batch_size": 2, + "max_num_tokens": 384, + "max_seq_len": 384, + "cache_transceiver_config": { + "backend": backend, + "max_tokens_in_buffer": 512, + }, + "urls": ["localhost:8002"] + } + } + return serve_config + + +@pytest.mark.parametrize("benchmark_model_root", [ + 'DeepSeek-V3-Lite-fp8', 'DeepSeek-V3-Lite-bf16', 'llama-v3-8b-hf', + 'llama-3.1-8b-instruct-hf-fp8' +], + indirect=True) +def test_disaggregated_benchmark_on_diff_backends( + disaggregated_test_root, disaggregated_example_root, llm_venv, + benchmark_model_root, benchmark_root, shared_gpt_path): + """Benchmark test comparing NIXL vs UCX cache transceiver backends.""" + if "DeepSeek-V3-Lite" in benchmark_model_root and "fp8" in benchmark_model_root and get_sm_version( + ) != 90: + pytest.skip("The test should only run on Hopper") + nixl_config = get_config_for_benchmark(benchmark_model_root, "NIXL") + ucx_config = get_config_for_benchmark(benchmark_model_root, "UCX") + temp_dir = tempfile.TemporaryDirectory() + nixl_config_path = os.path.join(temp_dir.name, "nixl_config.yaml") + ucx_config_path = os.path.join(temp_dir.name, "ucx_config.yaml") + with open(nixl_config_path, 'w', encoding='utf-8') as f: + yaml.dump(nixl_config, f) + with open(ucx_config_path, 'w', encoding='utf-8') as f: + yaml.dump(ucx_config, f) + + env = llm_venv._new_env.copy() + nixl_e2el, nixl_ttft = run_disaggregated_benchmark( + disaggregated_example_root, + nixl_config_path, + benchmark_root, + benchmark_model_root, + shared_gpt_path, + env=env, + 
cwd=llm_venv.get_working_directory())
+    ucx_e2el, ucx_ttft = run_disaggregated_benchmark(
+        disaggregated_example_root,
+        ucx_config_path,
+        benchmark_root,
+        benchmark_model_root,
+        shared_gpt_path,
+        env=env,
+        cwd=llm_venv.get_working_directory())
+    print(f"Nixl E2EL: {nixl_e2el} ms, UCX E2EL: {ucx_e2el} ms")
+    print(f"Nixl TTFT: {nixl_ttft} ms, UCX TTFT: {ucx_ttft} ms")
+
+    assert ucx_e2el > 0 and nixl_e2el > 0 and nixl_e2el < 1.05 * ucx_e2el
+    assert ucx_ttft > 0 and nixl_ttft > 0 and nixl_ttft < 1.05 * ucx_ttft
diff --git a/tests/integration/defs/disaggregated/test_disaggregated_parametrized.py b/tests/integration/defs/disaggregated/test_disaggregated_parametrized.py
new file mode 100644
index 00000000000..217f7a37620
--- /dev/null
+++ b/tests/integration/defs/disaggregated/test_disaggregated_parametrized.py
@@ -0,0 +1,1464 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from copy import deepcopy
+from dataclasses import dataclass, field
+from typing import Dict, Optional
+
+import pytest
+import yaml
+from defs.conftest import skip_arm, skip_no_hopper
+from defs.trt_test_alternative import check_call
+
+
+# Utility functions for disaggregated tests
+def cleanup_output_files():
+    """Clean up output files from previous runs."""
+    for file in ['output.json', 'output_streaming.json']:
+        try:
+            os.remove(file)
+        except FileNotFoundError:
+            pass
+
+
+def get_disagg_server_url_from_cfg(config_file: str) -> str:
+    """Extract server URL from configuration file.
+
+    Args:
+        config_file: Path to the YAML configuration file.
+
+    Returns:
+        Server URL in format "http://hostname:port"
+    """
+    with open(config_file, 'r') as file:
+        config = yaml.safe_load(file)
+    server_host = config.get('hostname', 'localhost')
+    server_port = config.get('port', 8000)
+    return f"http://{server_host}:{server_port}"
+
+
+def validate_timing_metrics(perf_metrics_item, request_context=""):
+    """
+    Helper function to validate timing metrics relationships.
+ + Args: + perf_metrics_item: A single performance metrics item from the /perf_metrics endpoint + request_context: String context for error messages (e.g., "request 1", "streaming") + """ + # Validate basic structure + required_keys = [ + "ctx_server", "gen_server", "ctx_perf_metrics", "gen_perf_metrics", + "disagg_server_arrival_time", "disagg_server_first_token_time" + ] + for key in required_keys: + assert key in perf_metrics_item, f"Missing key: {key} in {request_context}" + + assert perf_metrics_item["ctx_perf_metrics"][ + "ctx_request_id"] == perf_metrics_item["gen_perf_metrics"][ + "ctx_request_id"] + + # Extract timing metrics + ctx_metrics = perf_metrics_item["ctx_perf_metrics"]["perf_metrics"][ + "timing_metrics"] + gen_metrics = perf_metrics_item["gen_perf_metrics"]["perf_metrics"][ + "timing_metrics"] + disagg_arrival = perf_metrics_item["disagg_server_arrival_time"] + disagg_first_token = perf_metrics_item["disagg_server_first_token_time"] + + # Validate disaggregated server timing metrics + assert disagg_arrival is not None, f"disagg_server_arrival_time is None in {request_context}" + assert disagg_first_token is not None, f"disagg_server_first_token_time is None in {request_context}" + assert isinstance( + disagg_arrival, + (int, float + )), f"disagg_server_arrival_time is not numeric in {request_context}" + assert isinstance( + disagg_first_token, (int, float) + ), f"disagg_server_first_token_time is not numeric in {request_context}" + assert disagg_arrival > 0, f"disagg_server_arrival_time is not positive in {request_context}" + assert disagg_first_token > 0, f"disagg_server_first_token_time is not positive in {request_context}" + assert disagg_arrival <= disagg_first_token, f"disagg_server_arrival_time > disagg_server_first_token_time in {request_context}" + + # Validate server-level timing metrics for context server + ctx_server_arrival = ctx_metrics.get("server_arrival_time") + ctx_server_first_token = ctx_metrics.get("server_first_token_time") + assert ctx_server_arrival is not None, f"ctx server_arrival_time is None in {request_context}" + assert ctx_server_first_token is not None, f"ctx server_first_token_time is None in {request_context}" + assert isinstance( + ctx_server_arrival, + (int, + float)), f"ctx server_arrival_time is not numeric in {request_context}" + assert isinstance( + ctx_server_first_token, + (int, float + )), f"ctx server_first_token_time is not numeric in {request_context}" + assert ctx_server_arrival <= ctx_server_first_token, f"ctx server_arrival_time > server_first_token_time in {request_context}" + assert ctx_metrics["last_token_time"] - ctx_server_first_token < 1e-3 + + # Validate server-level timing metrics for generation server + gen_server_arrival = gen_metrics.get("server_arrival_time") + gen_server_first_token = gen_metrics.get("server_first_token_time") + assert gen_server_arrival is not None, f"gen server_arrival_time is None in {request_context}" + assert gen_server_first_token is not None, f"gen server_first_token_time is None in {request_context}" + assert isinstance( + gen_server_arrival, + (int, + float)), f"gen server_arrival_time is not numeric in {request_context}" + assert isinstance( + gen_server_first_token, + (int, float + )), f"gen server_first_token_time is not numeric in {request_context}" + assert gen_server_arrival <= gen_server_first_token, f"gen server_arrival_time > server_first_token_time in {request_context}" + + # Validate timing relationships between different levels + # Disaggregated server should receive 
request before individual servers + assert disagg_arrival <= ctx_server_arrival, f"disagg_arrival > ctx_server_arrival in {request_context}" + assert disagg_arrival <= gen_server_arrival, f"disagg_arrival > gen_server_arrival in {request_context}" + + # Context should complete before generation starts + assert ctx_server_first_token <= gen_server_arrival, f"ctx_server_first_token > gen_server_arrival in {request_context}" + + # Validate internal timing consistency + ctx_arrival_time = ctx_metrics["arrival_time"] + ctx_first_token_time = ctx_metrics["first_token_time"] + gen_arrival_time = gen_metrics["arrival_time"] + gen_first_token_time = gen_metrics["first_token_time"] + + assert ctx_arrival_time <= ctx_first_token_time, f"ctx arrival_time > first_token_time in {request_context}" + assert gen_arrival_time <= gen_first_token_time, f"gen arrival_time > first_token_time in {request_context}" + + # Test KV cache transfer timing (if present) + if "kv_cache_transfer_start" in gen_metrics and "kv_cache_transfer_end" in gen_metrics: + kv_start = gen_metrics["kv_cache_transfer_start"] + kv_end = gen_metrics["kv_cache_transfer_end"] + assert gen_metrics["kv_cache_size"] > 0 + assert kv_start <= kv_end, f"kv_cache_transfer_start > kv_cache_transfer_end in {request_context}" + assert gen_arrival_time <= kv_start, f"gen_arrival_time > kv_cache_transfer_start in {request_context}" + assert kv_end <= gen_metrics[ + "first_scheduled_time"], f"kv_cache_transfer_end > first_scheduled_time in {request_context}" + + return True + + +def run_client_tests(example_dir, + config_file, + test_desc, + num_iters, + env, + server_start_timeout, + prompt_file, + extra_endpoints_test, + server_url, + workers_proc, + server_proc, + use_ray=False): + """Run client tests against the disaggregated server. 
+ + Args: + example_dir: Path to the examples directory + config_file: Path to the configuration file + test_desc: Test description/name + num_iters: Number of iterations to run + env: Environment variables + server_start_timeout: Timeout for server startup + prompt_file: Name of the prompt file to use + extra_endpoints_test: Optional callback for extra endpoint tests + server_url: URL of the disaggregated server + workers_proc: Worker process(es) + server_proc: Server process + use_ray: Whether Ray orchestrator is being used + """ + client_dir = f"{example_dir}/clients" + for _ in range(num_iters): + client_cmd = [ + 'python3', f'{client_dir}/disagg_client.py', '-c', f'{config_file}', + '-p', f'{client_dir}/{prompt_file}', '--ignore-eos', + '--server-start-timeout', + str(server_start_timeout) + ] + if prompt_file == "long_prompts.json": + # Use max_tokens 4 for long prompts to reduce test time + client_cmd.extend(['--max-tokens', '4']) + + # Prepare poll processes + worker_processes = [] + if use_ray: + for proc_cm in workers_proc: + worker_processes.append(proc_cm.__enter__()) + else: + worker_processes = [workers_proc] + + poll_procs = worker_processes + [server_proc] + check_call(client_cmd, env=env, poll_procs=poll_procs) + + # Streaming client run + streaming_client_cmd = client_cmd + [ + '--streaming', '-o', 'output_streaming.json' + ] + check_call(streaming_client_cmd, env=env, poll_procs=poll_procs) + + # Run the chat completion endpoint test only for TinyLlama + if test_desc == "overlap" or test_desc == "trtllm_sampler": + chat_client_cmd = client_cmd + [ + '-e', 'chat', '-o', 'output_chat.json' + ] + check_call(chat_client_cmd, env=env, poll_procs=poll_procs) + + streaming_chat_client_cmd = chat_client_cmd + [ + '--streaming', '-o', 'output_streaming_chat.json' + ] + check_call(streaming_chat_client_cmd, + env=env, + poll_procs=poll_procs) + + # Skip output verification for long prompts test + if prompt_file == "long_prompts.json": + continue + + if extra_endpoints_test is not None: + extra_endpoints_test(server_url) + + # Verify outputs + not_expected_strings = ["Berlin Berlin"] + + output_files = ['output.json', 'output_streaming.json'] + if test_desc == "overlap" or test_desc == "trtllm_sampler": + # Disable streaming chat completion for overlap test + # due to bug + output_files.extend(['output_chat.json']) + + if test_desc.startswith("gen_only"): + continue + + for output_file in output_files: + with open(output_file, 'r') as f: + content = f.read() + if "ds_v3_lite" in test_desc or output_file == "output_chat.json": + expected_strings = [ + "Berlin", ["Asyncio is a", "Asyncio module in"] + ] + else: + expected_strings = [ + "The capital of Germany is Berlin", + "Asyncio is a Python library" + ] + for expected_string in expected_strings: + if isinstance(expected_string, list): + # At least one of the strings in the list should be found in the content + assert any( + string in content for string in expected_string + ), f"None of the strings in {expected_string} found in {output_file}" + else: + assert expected_string in content, f"Expected string '{expected_string}' not found in {output_file}" + for not_expected_string in not_expected_strings: + assert not_expected_string not in content, f"Unexpected string '{not_expected_string}' found in {output_file}" + + +@dataclass +class DisaggregatedTestConfig: + """Complete configuration for a disaggregated test.""" + test_name: str + model_root: str + + # Global config + global_config: dict = field(default_factory=dict) + # Ctx 
config + ctx_config: dict = field(default_factory=dict) + # Gen config + gen_config: dict = field(default_factory=dict) + + # Test specific settings + skip_device_count: Optional[int] = None + skip_hopper: bool = False + skip_arm_arch: bool = False + env_vars: Optional[Dict[str, str]] = None + prompt_file: str = "prompts.json" + num_iters: int = 5 + extra_validation: Optional[ + str] = None # Special validation type: 'perf_metrics', 'kv_cache_time' + + @staticmethod + def _deep_merge_dicts(base: dict, override: dict) -> dict: + """Deep merge two dictionaries. + + Args: + base: Base dictionary to start from + override: Dictionary with values to override/add + + Returns: + New dictionary with merged values + """ + result = deepcopy(base) + for key, value in override.items(): + if key in result and isinstance(result[key], dict) and isinstance( + value, dict): + result[key] = DisaggregatedTestConfig._deep_merge_dicts( + result[key], value) + else: + result[key] = deepcopy(value) + return result + + @classmethod + def from_base( + cls, + base: 'DisaggregatedTestConfig', + test_name: str, + model_root: Optional[str] = None, + global_config: Optional[dict] = None, + ctx_config: Optional[dict] = None, + gen_config: Optional[dict] = None, + skip_device_count: Optional[int] = None, + skip_hopper: Optional[bool] = None, + skip_arm_arch: Optional[bool] = None, + env_vars: Optional[Dict[str, str]] = None, + prompt_file: Optional[str] = None, + num_iters: Optional[int] = None, + extra_validation: Optional[str] = None + ) -> 'DisaggregatedTestConfig': + """Create a new config based on an existing one with selective overrides. + + Args: + base: Base configuration to inherit from + test_name: Name for the new test (required) + model_root: Override model root (if None, uses base.model_root) + global_config: Dictionary to merge into base global_config + ctx_config: Dictionary to merge into base ctx_config + gen_config: Dictionary to merge into base gen_config + skip_device_count: Override skip_device_count (if None, uses base value) + skip_hopper: Override skip_hopper (if None, uses base value) + skip_arm_arch: Override skip_arm_arch (if None, uses base value) + env_vars: Override or merge with base env_vars + prompt_file: Override prompt_file (if None, uses base value) + num_iters: Override num_iters (if None, uses base value) + extra_validation: Override extra_validation (if None, uses base value) + + Returns: + New DisaggregatedTestConfig instance + """ + # Deep copy base configs + new_global_config = deepcopy(base.global_config) + new_ctx_config = deepcopy(base.ctx_config) + new_gen_config = deepcopy(base.gen_config) + new_env_vars = deepcopy(base.env_vars) if base.env_vars else None + + # Merge provided overrides + + # Remove any parameters from global_config that are already specified in ctx_config or gen_config + for key in list(new_global_config.keys()): + if (ctx_config is not None + and key in ctx_config) or (gen_config is not None + and key in gen_config): + new_global_config.pop(key, None) + + if global_config: + new_global_config = cls._deep_merge_dicts(new_global_config, + global_config) + if ctx_config: + new_ctx_config = cls._deep_merge_dicts(new_ctx_config, ctx_config) + if gen_config: + new_gen_config = cls._deep_merge_dicts(new_gen_config, gen_config) + if env_vars: + if new_env_vars: + new_env_vars = {**new_env_vars, **env_vars} + else: + new_env_vars = env_vars.copy() + + return cls( + test_name=test_name, + model_root=model_root + if model_root is not None else base.model_root, + 
global_config=new_global_config, + ctx_config=new_ctx_config, + gen_config=new_gen_config, + skip_device_count=skip_device_count + if skip_device_count is not None else base.skip_device_count, + skip_hopper=skip_hopper + if skip_hopper is not None else base.skip_hopper, + skip_arm_arch=skip_arm_arch + if skip_arm_arch is not None else base.skip_arm_arch, + env_vars=new_env_vars, + prompt_file=prompt_file + if prompt_file is not None else base.prompt_file, + num_iters=num_iters if num_iters is not None else base.num_iters, + extra_validation=extra_validation + if extra_validation is not None else base.extra_validation, + ) + + def get_num_ranks(self) -> int: + """Calculate total number of ranks needed.""" + ctx_tp = self.ctx_config.get('tensor_parallel_size', 1) + ctx_pp = self.ctx_config.get('pipeline_parallel_size', 1) + ctx_num_instances = self.ctx_config.get('num_instances', 1) + + gen_tp = self.gen_config.get('tensor_parallel_size', 1) + gen_pp = self.gen_config.get('pipeline_parallel_size', 1) + gen_num_instances = self.gen_config.get('num_instances', 1) + + ctx_ranks = ctx_tp * ctx_pp * ctx_num_instances + gen_ranks = gen_tp * gen_pp * gen_num_instances + return ctx_ranks + gen_ranks + + def generate_yaml_config(self, temp_dir: str) -> str: + """Generate a yaml config file from the parameters.""" + config = self.global_config.copy() + config["model"] = self.model_root + + # Add default cache_transceiver_config if not present + if "cache_transceiver_config" not in config: + config["cache_transceiver_config"] = {"backend": "DEFAULT"} + + if "backend" in config and config["backend"] == "trt": + # Use pop() so TRT configs that never set these PyTorch-only keys don't raise KeyError + config.pop("disable_overlap_scheduler", None) + config.pop("cuda_graph_config", None) + + # Build context servers config + context_servers = self.ctx_config.copy() + + ctx_num_instances = self.ctx_config.get('num_instances', 1) + context_servers["num_instances"] = ctx_num_instances + + ctx_urls = [] + base_port = 8001 + for i in range(ctx_num_instances): + ctx_urls.append(f"localhost:{base_port + i}") + context_servers["urls"] = ctx_urls + config["context_servers"] = context_servers + + # Build generation servers config + gen_servers = self.gen_config.copy() + + gen_num_instances = self.gen_config.get('num_instances', 1) + gen_servers["num_instances"] = gen_num_instances + + gen_urls = [] + base_port = 8001 + ctx_num_instances + for i in range(gen_num_instances): + gen_urls.append(f"localhost:{base_port + i}") + gen_servers["urls"] = gen_urls + + # Special handling for gen-only mode + if ctx_num_instances == 0 and "backend" in config and config[ + "backend"] == "pytorch": + gen_servers["print_iter_log"] = True + + config["generation_servers"] = gen_servers + + # Write to temporary file + config_path = os.path.join(temp_dir, f"{self.test_name}.yaml") + with open(config_path, 'w') as f: + yaml.dump(config, f, default_flow_style=False) + + return config_path + + +# Define all test configurations +# +# Usage: You can create test configs from scratch or use from_base() to inherit from existing configs: +# +# Example 1 - Create base config: +# base_config = DisaggregatedTestConfig( +# test_name="base", +# model_root="TinyLlama/TinyLlama-1.1B-Chat-v1.0", +# global_config={"backend": "pytorch"} +# ) +# +# Example 2 - Create variation with different backend: +# trt_config = DisaggregatedTestConfig.from_base( +# base_config, +# test_name="trt_variant", +# global_config={"backend": "trt"} # This merges/overrides into base +# ) +# +# Example 3 - Create variation with additional nested config: +# perf_config = 
DisaggregatedTestConfig.from_base( +# base_config, +# test_name="perf_variant", +# ctx_config={"return_perf_metrics": True}, # Merges with base ctx_config +# extra_validation="perf_metrics" +# ) + +# Store some base configs for reuse +_tiny_llama_cfg = DisaggregatedTestConfig( + test_name="2_ranks", + model_root="TinyLlama/TinyLlama-1.1B-Chat-v1.0", + global_config={ + "backend": "pytorch", + "kv_cache_config": { + "free_gpu_memory_fraction": 0.2, + "enable_partial_reuse": False + }, + "disable_overlap_scheduler": True, + "cuda_graph_config": None, + }) + +_tiny_llama_multi_gpus_cfg = DisaggregatedTestConfig.from_base( + _tiny_llama_cfg, + test_name="multi_gpus", + global_config={ + "kv_cache_config": { + "free_gpu_memory_fraction": 0.2, + "enable_partial_reuse": False, + "enable_block_reuse": False, + }, + }, + ctx_config={ + "max_batch_size": 1, + "max_num_tokens": 3000, + "max_seq_len": 4096, + }, + gen_config={ + "max_batch_size": 256, + "max_num_tokens": 4096, + "max_seq_len": 4096, + }, + skip_device_count=4, +) + +_ds_v3_lite_tp1_cfg = DisaggregatedTestConfig( + test_name="ds_v3_lite_tp1", + model_root="DeepSeek-V3-Lite/fp8", + global_config={ + "backend": "pytorch", + "free_gpu_memory_fraction": 0.1, + }, + ctx_config={ + "disable_overlap_scheduler": True, + }, + gen_config={ + "disable_overlap_scheduler": False, + }, + skip_hopper=True, +) + +_ds_v3_lite_4_gpus_cfg = DisaggregatedTestConfig( + test_name="ds_v3_lite", + model_root="DeepSeek-V3-Lite/fp8", + global_config={ + "backend": "pytorch", + "free_gpu_memory_fraction": 0.7, + }, + ctx_config={ + "tensor_parallel_size": 2, + "disable_overlap_scheduler": True, + }, + gen_config={ + "tensor_parallel_size": 2, + "disable_overlap_scheduler": False, + }, + skip_device_count=4, + skip_hopper=True, +) + +TEST_CONFIGS = [ + # TinyLlama tests - basic + _tiny_llama_cfg, + # Performance metrics variant - extends base with metrics config + DisaggregatedTestConfig.from_base( + _tiny_llama_cfg, + test_name="perf_metrics", + global_config={"perf_metrics_max_requests": 1000}, + ctx_config={ + "return_perf_metrics": True, + "perf_metrics_max_requests": 1000 + }, + gen_config={ + "return_perf_metrics": True, + "perf_metrics_max_requests": 1000 + }, + extra_validation="perf_metrics", + ), + # KV cache time variant - same as perf_metrics but different validation + DisaggregatedTestConfig.from_base( + _tiny_llama_cfg, + test_name="kv_cache_time_output", + global_config={"perf_metrics_max_requests": 1000}, + ctx_config={ + "return_perf_metrics": True, + "perf_metrics_max_requests": 1000 + }, + gen_config={ + "return_perf_metrics": True, + "perf_metrics_max_requests": 1000 + }, + extra_validation="kv_cache_time", + ), + # Create TRT variant from base - only need to override backend + DisaggregatedTestConfig(test_name="trt_backend", + model_root="TinyLlama/TinyLlama-1.1B-Chat-v1.0", + global_config={ + "backend": "trt", + "kv_cache_config": { + "free_gpu_memory_fraction": 0.2, + "enable_partial_reuse": False + }, + }), + DisaggregatedTestConfig.from_base( + _tiny_llama_cfg, + test_name="diff_max_tokens", + prompt_file="long_prompts.json", + ctx_config={ + "max_num_tokens": 512, + "max_batch_size": 64 + }, + gen_config={ + "max_num_tokens": 256, + "max_batch_size": 32 + }, + ), + + # TinyLlama - CUDA graph + DisaggregatedTestConfig.from_base( + _tiny_llama_cfg, + test_name="cuda_graph", + ctx_config={"cuda_graph_config": { + "batch_sizes": [1, 3000] + }}, + gen_config={ + "cuda_graph_config": { + "enable_padding": True, + "batch_sizes": [1, 4, 8, 16, 
24, 32] + }, + "max_batch_size": 256, + "max_num_tokens": 4096, + "max_seq_len": 4096, + }, + ), + + # TinyLlama - overlap + DisaggregatedTestConfig.from_base( + _tiny_llama_cfg, + test_name="overlap", + ctx_config={ + "max_num_tokens": 3000, + "max_seq_len": 4096, + "disable_overlap_scheduler": True, + }, + gen_config={ + "max_batch_size": 256, + "max_num_tokens": 4096, + "max_seq_len": 4096, + "disable_overlap_scheduler": False, + }, + ), + + # TinyLlama - mixed + DisaggregatedTestConfig.from_base( + _tiny_llama_cfg, + test_name="mixed", + gen_config={"num_instances": 2}, + ), + + # TinyLlama - trtllm sampler + DisaggregatedTestConfig.from_base( + _tiny_llama_cfg, + test_name="trtllm_sampler", + ctx_config={ + "max_batch_size": 1, + "max_num_tokens": 3000, + "max_seq_len": 4096, + "sampler_type": "TRTLLMSampler", + "disable_overlap_scheduler": True, + }, + gen_config={ + "max_batch_size": 256, + "max_num_tokens": 4096, + "max_seq_len": 4096, + "sampler_type": "TRTLLMSampler", + "disable_overlap_scheduler": False, + }, + ), + + # TinyLlama - load balance + DisaggregatedTestConfig.from_base( + _tiny_llama_cfg, + test_name="load_balance", + global_config={ + "kv_cache_config": { + "free_gpu_memory_fraction": 0.15, + "enable_partial_reuse": False + }, + }, + ctx_config={ + "num_instances": 2, + "router": { + "type": "load_balancing", + "use_tokens": True + }, + "max_num_tokens": 3000, + "max_seq_len": 4096, + "disable_overlap_scheduler": True, + }, + gen_config={ + "num_instances": 2, + "router": { + "type": "load_balancing", + "use_tokens": False + }, + "max_batch_size": 256, + "max_num_tokens": 4096, + "max_seq_len": 4096, + "disable_overlap_scheduler": False, + }, + ), + + # TinyLlama - cache aware balance + DisaggregatedTestConfig.from_base( + _tiny_llama_cfg, + test_name="cache_aware_balance", + global_config={ + "free_gpu_memory_fraction": 0.1, + "enable_autotuner": False, + "kv_cache_config": { + "enable_block_reuse": True, + "enable_partial_reuse": False, + "event_buffer_max_size": 1024, + "free_gpu_memory_fraction": 0.1 + }, + }, + ctx_config={ + "num_instances": 2, + "router": { + "type": "kv_cache_aware" + }, + "max_batch_size": 16, + "max_num_tokens": 3000, + "max_seq_len": 4096, + }, + gen_config={ + "num_instances": 2, + "router": { + "type": "kv_cache_aware" + }, + "max_batch_size": 256, + "max_num_tokens": 4096, + "max_seq_len": 4096, + }, + ), + + # TinyLlama - conditional + DisaggregatedTestConfig.from_base( + _tiny_llama_cfg, + test_name="conditional", + model_root="TinyLlama/TinyLlama-1.1B-Chat-v1.0", + global_config={ + "free_gpu_memory_fraction": 0.15, + "conditional_disagg_config": { + "max_local_prefill_length": 100 + }, + "enable_autotuner": False, + "kv_cache_config": { + "enable_block_reuse": True, + "enable_partial_reuse": True, + "event_buffer_max_size": 1024, + "free_gpu_memory_fraction": 0.15 + }, + }, + gen_config={ + "router": { + "type": "kv_cache_aware" + }, + }, + ), + + # TinyLlama - ngram + DisaggregatedTestConfig.from_base( + _tiny_llama_cfg, + test_name="ngram", + global_config={ + "free_gpu_memory_fraction": 0.1, + }, + gen_config={ + "speculative_config": { + "decoding_type": "NGram", + "max_draft_len": 4, + "max_matching_ngram_size": 4, + "is_keep_all": True, + "is_use_oldest": True, + "is_public_pool": True + }, + }, + ), + DisaggregatedTestConfig( + test_name="gen_only_bs1", + model_root="TinyLlama/TinyLlama-1.1B-Chat-v1.0", + env_vars={"TRTLLM_DISAGG_BENCHMARK_GEN_ONLY": "1"}, + global_config={ + "backend": "pytorch", + "cuda_graph_config": None, 
+ "kv_cache_config": { + "free_gpu_memory_fraction": 0.2, + "enable_partial_reuse": False, + }, + "enable_attention_dp": True, + }, + ctx_config={ + "tensor_parallel_size": 2, + "max_batch_size": 1, + "max_num_tokens": 3000, + "max_seq_len": 4096, + "disable_overlap_scheduler": True, + }, + gen_config={ + "tensor_parallel_size": 2, + "max_batch_size": 1, + "max_num_tokens": 4096, + "max_seq_len": 4096, + "disable_overlap_scheduler": False, + }, + skip_device_count=4, + ), + + # TinyLlama - TP variations + DisaggregatedTestConfig.from_base( + _tiny_llama_multi_gpus_cfg, + test_name="1ctx_tp2pp1_2gen_tp1pp1", + ctx_config={ + "tensor_parallel_size": 2, + }, + gen_config={ + "num_instances": 2, + }, + skip_device_count=4, + ), + DisaggregatedTestConfig.from_base( + _tiny_llama_multi_gpus_cfg, + test_name="1ctx_tp2pp1_2gen_tp1pp1_trt", + global_config={ + "backend": "trt", + }, + ctx_config={ + "tensor_parallel_size": 2, + }, + gen_config={ + "num_instances": 2, + }, + skip_device_count=4, + ), + DisaggregatedTestConfig.from_base( + _tiny_llama_multi_gpus_cfg, + test_name="1ctx_tp1pp2_1gen_tp1pp2", + ctx_config={ + "pipeline_parallel_size": 2, + }, + gen_config={ + "pipeline_parallel_size": 2, + }, + skip_device_count=4, + ), + DisaggregatedTestConfig.from_base( + _tiny_llama_multi_gpus_cfg, + test_name="1ctx_tp2pp1_1gen_tp1pp2", + ctx_config={ + "tensor_parallel_size": 2, + }, + gen_config={ + "pipeline_parallel_size": 2, + }, + skip_device_count=4, + ), + DisaggregatedTestConfig.from_base( + _tiny_llama_multi_gpus_cfg, + test_name="1ctx_tp1pp2_1gen_tp2pp1", + ctx_config={ + "pipeline_parallel_size": 2, + }, + gen_config={ + "tensor_parallel_size": 2, + }, + skip_device_count=4, + ), + DisaggregatedTestConfig.from_base( + _tiny_llama_multi_gpus_cfg, + test_name="1ctx_tp2pp2_1gen_tp2pp2", + ctx_config={ + "tensor_parallel_size": 2, + "pipeline_parallel_size": 2, + }, + gen_config={ + "tensor_parallel_size": 2, + "pipeline_parallel_size": 2, + }, + skip_device_count=8, + ), + DisaggregatedTestConfig.from_base( + _tiny_llama_multi_gpus_cfg, + test_name="1ctx_tp1pp4_1gen_tp1pp4", + ctx_config={ + "pipeline_parallel_size": 4, + }, + gen_config={ + "pipeline_parallel_size": 4, + }, + skip_device_count=8, + ), + DisaggregatedTestConfig.from_base( + _tiny_llama_multi_gpus_cfg, + test_name="1ctx_tp1pp4_1gen_tp4pp1", + ctx_config={ + "pipeline_parallel_size": 4, + }, + gen_config={ + "tensor_parallel_size": 4, + }, + skip_device_count=8, + ), + # DeepSeek V3 Lite tests + + # TP1 tests + _ds_v3_lite_tp1_cfg, + DisaggregatedTestConfig.from_base( + _ds_v3_lite_tp1_cfg, + test_name="ds_v3_lite_tp1_mtp", + global_config={ + "speculative_config": { + "decoding_type": "MTP", + "num_nextn_predict_layers": 1 + }, + }, + ctx_config={ + "enable_attention_dp": True, + }, + ), + DisaggregatedTestConfig.from_base( + _ds_v3_lite_tp1_cfg, + test_name="ds_v3_lite_tp1_mtp_adp_overlap", + global_config={ + "speculative_config": { + "decoding_type": "MTP", + "num_nextn_predict_layers": 1 + }, + "enable_attention_dp": True, + }, + ctx_config={ + "disable_overlap_scheduler": True, + }, + gen_config={ + "disable_overlap_scheduler": False, + }, + ), + DisaggregatedTestConfig.from_base( + _ds_v3_lite_tp1_cfg, + test_name="ds_v3_lite_tp1_mtp2", + global_config={ + "speculative_config": { + "decoding_type": "MTP", + "num_nextn_predict_layers": 2 + }, + }, + ctx_config={ + "enable_attention_dp": True, + }, + ), + DisaggregatedTestConfig.from_base( + _ds_v3_lite_tp1_cfg, + test_name="ds_v3_lite_tp1_cache_aware_balance", + 
global_config={ + "enable_autotuner": False, + "kv_cache_config": { + "enable_block_reuse": True + } + }, + ctx_config={ + "num_instances": 2, + "router": { + "type": "kv_cache_aware" + }, + }, + gen_config={ + "num_instances": 2, + "router": { + "type": "kv_cache_aware" + }, + }, + skip_hopper=True, + ), + DisaggregatedTestConfig.from_base( + _ds_v3_lite_tp1_cfg, + test_name="ds_v3_lite_tp1_conditional", + global_config={ + "enable_autotuner": False, + "conditional_disagg_config": { + "enable_conditional_generation": True + }, + "kv_cache_config": { + "event_buffer_max_size": 1024, + "free_gpu_memory_fraction": 0.15, + }, + }, + ctx_config={ + "router": { + "type": "kv_cache_aware" + }, + }, + gen_config={ + "router": { + "type": "kv_cache_aware" + }, + }, + ), + + # 4 ranks different backends + DisaggregatedTestConfig.from_base( + _ds_v3_lite_4_gpus_cfg, + test_name="ds_v3_lite_mpi", + global_config={ + "cache_transceiver_config": { + "backend": "MPI", + }, + }, + skip_arm_arch=True, + env_vars={ + "TRTLLM_USE_MPI_KVCACHE": "1", + }, + ), + DisaggregatedTestConfig.from_base( + _ds_v3_lite_4_gpus_cfg, + test_name="ds_v3_lite_ucx", + global_config={ + "cache_transceiver_config": { + "backend": "UCX", + }, + }, + skip_arm_arch=True, + env_vars={ + "TRTLLM_USE_UCX_KVCACHE": "1", + "UCX_TLS": "^ib" + }, + ), + DisaggregatedTestConfig.from_base( + _ds_v3_lite_4_gpus_cfg, + test_name="ds_v3_lite_nixl", + global_config={ + "cache_transceiver_config": { + "backend": "NIXL", + }, + }, + skip_arm_arch=True, + env_vars={ + "TRTLLM_USE_NIXL_KVCACHE": "1", + "UCX_TLS": "^ib" + }, + ), + # 4 ranks + _ds_v3_lite_4_gpus_cfg, + DisaggregatedTestConfig.from_base( + _ds_v3_lite_4_gpus_cfg, + test_name="ds_v3_lite_adp", + global_config={ + "enable_attention_dp": True, + }, + ), + DisaggregatedTestConfig.from_base( + _ds_v3_lite_4_gpus_cfg, + test_name="ds_v3_lite_adp_overlap", + global_config={ + "enable_attention_dp": True, + }, + ctx_config={ + "disable_overlap_scheduler": True, + }, + gen_config={ + "disable_overlap_scheduler": False, + }, + ), + DisaggregatedTestConfig.from_base( + _ds_v3_lite_4_gpus_cfg, + test_name="ds_v3_lite_adp_overlap_cuda_graph", + global_config={ + "enable_attention_dp": True, + }, + ctx_config={ + "disable_overlap_scheduler": True, + }, + gen_config={ + "cuda_graph_config": { + "enable_padding": False + }, + "disable_overlap_scheduler": False, + }, + ), + DisaggregatedTestConfig.from_base( + _ds_v3_lite_4_gpus_cfg, + test_name="ds_v3_lite_overlap_cuda_graph", + gen_config={ + "cuda_graph_config": { + "enable_padding": False + }, + "disable_overlap_scheduler": False, + }, + ), + DisaggregatedTestConfig.from_base( + _ds_v3_lite_4_gpus_cfg, + test_name="ds_v3_lite_adp_mtp", + global_config={ + "speculative_config": { + "decoding_type": "MTP", + "num_nextn_predict_layers": 1 + }, + "enable_attention_dp": True, + }, + ), + DisaggregatedTestConfig.from_base( + _ds_v3_lite_4_gpus_cfg, + test_name="ds_v3_lite_mtp", + global_config={ + "speculative_config": { + "decoding_type": "MTP", + "num_nextn_predict_layers": 1 + }, + }, + ), +] + + +def get_test_id(config: DisaggregatedTestConfig) -> str: + """Generate test ID from config.""" + return config.test_name + + +def apply_skip_marks(config: DisaggregatedTestConfig): + """Apply skip markers based on configuration.""" + markers = [] + + if config.skip_device_count is not None: + markers.append(pytest.mark.skip_less_device(config.skip_device_count)) + + return markers + + +def pytest_generate_tests(metafunc): + """Generate test cases 
dynamically based on TEST_CONFIGS.""" + if "config" in metafunc.fixturenames: + configs = TEST_CONFIGS + + # Apply marks + marked_configs = [] + for config in configs: + marks = apply_skip_marks(config) + if marks: + marked_configs.append( + pytest.param(config, marks=marks, id=get_test_id(config))) + else: + marked_configs.append( + pytest.param(config, id=get_test_id(config))) + + metafunc.parametrize("config", marked_configs) + + +def run_disaggregated_test_parametrized(example_dir, + config: DisaggregatedTestConfig, + env=None, + cwd=None, + extra_endpoints_test=None): + """Run disaggregated test with parametrized configuration. + + Args: + example_dir: Path to the examples directory + config: DisaggregatedTestConfig with all test parameters + env: Environment variables + cwd: Working directory for test execution + extra_endpoints_test: Optional callback for additional endpoint validation + """ + import subprocess + + from defs.trt_test_alternative import popen + + from tensorrt_llm._utils import mpi_disabled + from tensorrt_llm.logger import logger + + cleanup_output_files() + run_env = env.copy() + run_env["UCX_TLS"] = "^ib" + + # Generate config file + config_path = config.generate_yaml_config(cwd) + + # Print generated config for debugging + print(f"\n{'='*80}") + print(f"Generated YAML config for test: {config.test_name}") + print(f"Config file: {config_path}") + print(f"{'='*80}") + with open(config_path, 'r') as f: + print(f.read()) + print(f"{'='*80}\n") + + try: + num_ranks = config.get_num_ranks() + use_ray = mpi_disabled() + + if not use_ray: + workers_cmd = [ + 'mpirun', '--allow-run-as-root', '--oversubscribe', '-n', + str(num_ranks), 'trtllm-serve', 'disaggregated_mpi_worker', + '-c', config_path + ] + else: + pytest.skip( + "https://nvbugs/5584607 Ray orchestrator is not supported with NIXL(DEFAULT) cache transceiver backend." 
+ ) + # Check backend compatibility + backend = config.global_config.get('backend', 'pytorch') + if backend != "pytorch": + pytest.skip( + "Ray orchestrator is only supported with pytorch backend.") + + # Generate extra config files for Ray workers + def get_extra_llm_config(server_config, suffix): + extra_config = { + 'orchestrator_type': 'ray', + } + for key, value in server_config.items(): + if key not in ['num_instances', 'urls']: + extra_config[key] = value + return extra_config + + extra_config_files = [] + workers_cmds = [] + + # Create config for context servers + ctx_num_instances = config.ctx_config.get('num_instances', 1) + for i in range(ctx_num_instances): + extra_llm_config = get_extra_llm_config(config.ctx_config, + f'ctx_{i}') + extra_file = os.path.join(cwd, + f'{config.test_name}_ctx_{i}.yaml') + with open(extra_file, 'w') as f: + yaml.dump(extra_llm_config, f, default_flow_style=False) + extra_config_files.append(extra_file) + workers_cmds.append([ + 'trtllm-serve', 'disaggregated_ray_worker', '-c', + extra_file, '--model', config.model_root + ]) + + # Create config for generation servers + gen_num_instances = config.gen_config.get('num_instances', 1) + for i in range(gen_num_instances): + extra_llm_config = get_extra_llm_config(config.gen_config, + f'gen_{i}') + extra_file = os.path.join(cwd, + f'{config.test_name}_gen_{i}.yaml') + with open(extra_file, 'w') as f: + yaml.dump(extra_llm_config, f, default_flow_style=False) + extra_config_files.append(extra_file) + workers_cmds.append([ + 'trtllm-serve', 'disaggregated_ray_worker', '-c', + extra_file, '--model', config.model_root + ]) + + server_start_timeout = 1200 + server_cmd = [ + 'trtllm-serve', 'disaggregated', '--server_start_timeout', + str(server_start_timeout), '-c', config_path + ] + server_url = get_disagg_server_url_from_cfg(config_path) + + try: + if not use_ray: + with (open('output_workers.log', 'w') as output_workers, + popen(workers_cmd, + stdout=output_workers, + stderr=subprocess.STDOUT, + env=run_env, + cwd=cwd) as + workers_proc, open('output_disagg.log', + 'w') as output_disagg, + popen(server_cmd, + stdout=output_disagg, + stderr=subprocess.STDOUT, + env=run_env, + cwd=cwd) as server_proc): + run_client_tests(example_dir, + config_path, + config.test_name, + config.num_iters, + env, + server_start_timeout, + config.prompt_file, + extra_endpoints_test, + server_url, + workers_proc, + server_proc, + use_ray=False) + else: + # Ray orchestrator path + workers_proc = [] + for worker_cmd in workers_cmds: + workers_proc.append( + popen(worker_cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + env=run_env, + cwd=cwd)) + + # Enter all worker contexts + for proc_cm in workers_proc: + proc_cm.__enter__() + + with (open('output_disagg.log', 'w') as output_disagg, + popen(server_cmd, + stdout=output_disagg, + stderr=subprocess.STDOUT, + env=run_env, + cwd=cwd) as server_proc): + run_client_tests(example_dir, + config_path, + config.test_name, + config.num_iters, + env, + server_start_timeout, + config.prompt_file, + extra_endpoints_test, + server_url, + workers_proc, + server_proc, + use_ray=True) + except Exception: + logger.error("-------- Workers output --------") + if not use_ray and os.path.exists('output_workers.log'): + with open('output_workers.log', 'r') as f: + logger.error(f.read()) + + logger.error("-------- Disagg server output --------") + if os.path.exists('output_disagg.log'): + with open('output_disagg.log', 'r') as f: + logger.error(f.read()) + raise + finally: + if use_ray: + 
subprocess.run(['ray', 'stop', '--force'], check=False) + for extra_file in extra_config_files: + if os.path.exists(extra_file): + os.remove(extra_file) + else: + if 'server_proc' in locals() and 'workers_proc' in locals(): + server_proc.terminate() + workers_proc.terminate() + server_proc.wait() + workers_proc.wait() + finally: + # Cleanup generated config + if os.path.exists(config_path): + os.remove(config_path) + + +@pytest.fixture(scope="function") +def model_root_fixture(config, llm_venv, request): + """Fixture that provides the correct model root based on config.""" + from defs.conftest import llm_models_root + + models_root = llm_models_root() + + print("Running model root fixture for config: ", config.test_name) + if config.model_root == "TinyLlama/TinyLlama-1.1B-Chat-v1.0": + src_root = os.path.join(models_root, "llama-models-v2", + "TinyLlama-1.1B-Chat-v1.0") + else: + src_root = os.path.join(models_root, config.model_root) + + dst_root = f"{llm_venv.get_working_directory()}/{config.model_root}" + + # Create symlink + if not os.path.exists(dst_root) and not os.path.islink(dst_root): + os.makedirs(os.path.dirname(dst_root), exist_ok=True) + os.symlink(src_root, dst_root, target_is_directory=True) + + return src_root + + +def test_disagg( + config: DisaggregatedTestConfig, + disaggregated_test_root, + disaggregated_example_root, + llm_venv, + model_root_fixture, +): + """Parametrized test for all disaggregated configurations.""" + # Apply skip conditions that can't be marks + if config.skip_hopper: + skip_no_hopper() + + if config.skip_arm_arch: + skip_arm() + + # Setup environment + env = llm_venv._new_env.copy() + + # Handle special validation cases + extra_endpoints_test = None + kv_cache_output_path = None + + if config.extra_validation == "perf_metrics": + # Test /perf_metrics endpoint + def extra_endpoints_test(server_url: str): + import json + import urllib.request + + with urllib.request.urlopen(f"{server_url}/perf_metrics", + timeout=10) as resp: + assert resp.status == 200 + perf_metrics = json.load(resp) + assert len(perf_metrics) > 0 + item = perf_metrics[0] + + # Use helper function to validate all timing metrics comprehensively + validate_timing_metrics(item, "perf_metrics test") + + elif config.extra_validation == "kv_cache_time": + # Test KV cache time output files + kv_cache_output_path = os.path.join(llm_venv.get_working_directory(), + "cache_time") + env["TRTLLM_KVCACHE_TIME_OUTPUT_PATH"] = kv_cache_output_path + + # Apply test-specific environment variables + if config.env_vars: + env.update(config.env_vars) + + # Run the test + run_disaggregated_test_parametrized( + disaggregated_example_root, + config, + env=env, + cwd=llm_venv.get_working_directory(), + extra_endpoints_test=extra_endpoints_test) + + # Post-test validation for kv_cache_time + if config.extra_validation == "kv_cache_time": + assert os.path.isdir(kv_cache_output_path) + send_file = os.path.join(kv_cache_output_path, "rank_0_send.csv") + recv_file = os.path.join(kv_cache_output_path, "rank_1_recv.csv") + assert os.path.exists(send_file) + assert os.path.exists(recv_file) + with open(send_file, "r") as f: + lines = f.readlines() + assert len(lines) > 1 + assert lines[0].startswith( + "RequestID,RequestInfo,Preparation,Preprocess,Transmissions,Postprocess" + ) + assert ",Delay,Duration,Bandwidth(Gbps)" in lines[0] + # get a send sample and match the recv + sample = lines[1].split(',') + assert len(sample) >= 9 + with open(recv_file, "r") as f: + lines = f.readlines() + assert len(lines) > 1 + 
matched = False + for line in lines: + sample_recv = line.split(',') + if sample_recv[0] == sample[0]: + matched = True + break + assert matched diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt index 22bffa4c426..dce78c4e65c 100644 --- a/tests/integration/test_lists/qa/llm_function_core.txt +++ b/tests/integration/test_lists/qa/llm_function_core.txt @@ -763,26 +763,25 @@ examples/serve/test_serve_negative.py::test_extremely_large_batch # PyTorch flow disaggregated tests -disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_multi_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun_trt_backend[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_mpi[DeepSeek-V3-Lite-fp8] -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8] -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8] -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp[DeepSeek-V3-Lite-fp8] -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one[DeepSeek-V3-Lite-fp8] -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one_mtp[DeepSeek-V3-Lite-fp8] -disaggregated/test_disaggregated.py::test_disaggregated_load_balance[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_trtllm_sampler[TinyLlama-1.1B-Chat-v1.0] +disaggregated/test_disaggregated_parametrized.py::test_disagg[2_ranks] +disaggregated/test_disaggregated_parametrized.py::test_disagg[perf_metrics] +disaggregated/test_disaggregated_parametrized.py::test_disagg[kv_cache_time_output] +disaggregated/test_disaggregated_parametrized.py::test_disagg[trt_backend] +disaggregated/test_disaggregated_parametrized.py::test_disagg[cuda_graph] +disaggregated/test_disaggregated_parametrized.py::test_disagg[load_balance] +disaggregated/test_disaggregated_parametrized.py::test_disagg[cache_aware_balance] +disaggregated/test_disaggregated_parametrized.py::test_disagg[trtllm_sampler] +disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_mpi] +disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_ucx] +disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_nixl] +disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_adp] +disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_mtp] +disaggregated/test_disaggregated_parametrized.py::test_disagg[1ctx_tp2pp2_1gen_tp2pp2] +disaggregated/test_disaggregated_parametrized.py::test_disagg[1ctx_tp1pp4_1gen_tp1pp4] disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[False-False-Qwen3-8B-FP8] disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[False-True-Qwen3-8B-FP8] disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[True-False-Qwen3-8B-FP8] disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[True-True-Qwen3-8B-FP8] 
-disaggregated/test_disaggregated.py::test_disaggregated_ctxtp2pp2_gentp2pp2[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_ctxpp4_genpp4[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_kv_cache_time_output[TinyLlama-1.1B-Chat-v1.0] disaggregated/test_workers.py::test_workers_conditional_disaggregation[TinyLlama-1.1B-Chat-v1.0] disaggregated/test_workers.py::test_workers_kv_cache_events[TinyLlama-1.1B-Chat-v1.0] disaggregated/test_workers.py::test_workers_kv_cache_aware_router[TinyLlama-1.1B-Chat-v1.0] diff --git a/tests/integration/test_lists/qa/llm_function_core_sanity.txt b/tests/integration/test_lists/qa/llm_function_core_sanity.txt index e6eb445bf88..38c1740f180 100644 --- a/tests/integration/test_lists/qa/llm_function_core_sanity.txt +++ b/tests/integration/test_lists/qa/llm_function_core_sanity.txt @@ -194,20 +194,17 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency] accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[fp8-latency] accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[mxfp8-latency] accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype -disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one_mtp[DeepSeek-V3-Lite-fp8] -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one[DeepSeek-V3-Lite-fp8] -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp[DeepSeek-V3-Lite-fp8] -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_mpi[DeepSeek-V3-Lite-fp8] -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8] -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8] -disaggregated/test_disaggregated.py::test_disaggregated_load_balance[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_multi_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun_trt_backend[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_trtllm_sampler[TinyLlama-1.1B-Chat-v1.0] +disaggregated/test_disaggregated_parametrized.py::test_disagg[2_ranks] +disaggregated/test_disaggregated_parametrized.py::test_disagg[cache_aware_balance] +disaggregated/test_disaggregated_parametrized.py::test_disagg[load_balance] +disaggregated/test_disaggregated_parametrized.py::test_disagg[cuda_graph] +disaggregated/test_disaggregated_parametrized.py::test_disagg[trt_backend] +disaggregated/test_disaggregated_parametrized.py::test_disagg[trtllm_sampler] +disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_adp_mtp] +disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_adp] +disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_mpi] +disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_nixl] 
+disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_ucx] disaggregated/test_workers.py::test_workers_conditional_disaggregation[TinyLlama-1.1B-Chat-v1.0] disaggregated/test_workers.py::test_workers_kv_cache_aware_router_eviction[TinyLlama-1.1B-Chat-v1.0] disaggregated/test_workers.py::test_workers_kv_cache_aware_router[TinyLlama-1.1B-Chat-v1.0] diff --git a/tests/integration/test_lists/test-db/l0_a10.yml b/tests/integration/test_lists/test-db/l0_a10.yml index 5fc56bd938f..3aa78d5e9d2 100644 --- a/tests/integration/test_lists/test-db/l0_a10.yml +++ b/tests/integration/test_lists/test-db/l0_a10.yml @@ -28,17 +28,18 @@ l0_a10: - unittest/disaggregated/test_remoteDictionary.py - unittest/disaggregated/test_disagg_cluster_manager_worker.py - unittest/disaggregated/test_cluster_storage.py - - disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] - - disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun_trt_backend[TinyLlama-1.1B-Chat-v1.0] - - disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0] - - disaggregated/test_disaggregated.py::test_disaggregated_mixed[TinyLlama-1.1B-Chat-v1.0] - - disaggregated/test_disaggregated.py::test_disaggregated_overlap[TinyLlama-1.1B-Chat-v1.0] - - disaggregated/test_disaggregated.py::test_disaggregated_diff_max_tokens[TinyLlama-1.1B-Chat-v1.0] - - disaggregated/test_disaggregated.py::test_disaggregated_kv_cache_time_output[TinyLlama-1.1B-Chat-v1.0] - - disaggregated/test_disaggregated.py::test_disaggregated_perf_metrics[TinyLlama-1.1B-Chat-v1.0] - - disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0] - - disaggregated/test_disaggregated.py::test_disaggregated_conditional[TinyLlama-1.1B-Chat-v1.0] - - disaggregated/test_disaggregated.py::test_disaggregated_ngram[TinyLlama-1.1B-Chat-v1.0] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[2_ranks] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[trt_backend] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[cuda_graph] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[mixed] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[overlap] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[diff_max_tokens] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[kv_cache_time_output] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[perf_metrics] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[cache_aware_balance] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[conditional] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[ngram] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[load_balance] - disaggregated/test_workers.py::test_workers_conditional_disaggregation[TinyLlama-1.1B-Chat-v1.0] - disaggregated/test_workers.py::test_workers_kv_cache_events[TinyLlama-1.1B-Chat-v1.0] - disaggregated/test_workers.py::test_workers_kv_cache_aware_router[TinyLlama-1.1B-Chat-v1.0] diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml index 0e216d4acce..2b35fe5e75e 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml @@ -41,7 +41,8 @@ l0_dgx_b200: - 
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=True] ISOLATION - accuracy/test_llm_api_pytorch.py::TestQwen3NextThinking::test_auto_dtype[tp4ep4] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_ucx] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_nixl] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-trtllm-auto] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass-auto] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-triton-auto] @@ -49,9 +50,8 @@ l0_dgx_b200: - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm-fp8] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-auto] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16] - - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8] + - disaggregated/test_disaggregated_benchmark.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16] + - disaggregated/test_disaggregated_benchmark.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8] - accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4 - accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp2pp2 - accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4 @@ -73,10 +73,10 @@ l0_dgx_b200: - unittest/llmapi/test_llm_multi_gpu_pytorch.py -m "gpu4" - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=False-attn_backend=TRTLLM-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp2pp2-fp8kv=False-attn_backend=TRTLLM-torch_compile=False] - - disaggregated/test_disaggregated.py::test_disaggregated_ctxpp2_genpp2[TinyLlama-1.1B-Chat-v1.0] - - disaggregated/test_disaggregated.py::test_disaggregated_ctxtp2_genpp2[TinyLlama-1.1B-Chat-v1.0] - - disaggregated/test_disaggregated.py::test_disaggregated_ctxpp2_gentp2[TinyLlama-1.1B-Chat-v1.0] - - disaggregated/test_disaggregated.py::test_disaggregated_ctxpp4_gentp4[TinyLlama-1.1B-Chat-v1.0] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[1ctx_tp1pp2_1gen_tp1pp2] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[1ctx_tp2pp1_1gen_tp1pp2] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[1ctx_tp1pp2_1gen_tp2pp1] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[1ctx_tp1pp4_1gen_tp4pp1] - examples/test_ray.py::test_llm_inference_distributed_ray[tp2pp2] - examples/test_ray.py::test_ray_disaggregated_serving[tp2] - condition: @@ -186,4 +186,4 @@ l0_dgx_b200: - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm-fp8] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton-auto] - - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] + - 
disaggregated/test_disaggregated_benchmark.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] diff --git a/tests/integration/test_lists/test-db/l0_dgx_b300.yml b/tests/integration/test_lists/test-db/l0_dgx_b300.yml index 82f78afbf98..b405261ce3a 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_b300.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_b300.yml @@ -59,18 +59,18 @@ l0_dgx_b300: - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_trtllm-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=True] - accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_ucx] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_nixl] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-fp8] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-auto] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm-auto] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton-auto] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-auto] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16] - - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] - - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-fp8] + - disaggregated/test_disaggregated_benchmark.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16] + - disaggregated/test_disaggregated_benchmark.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] + - disaggregated/test_disaggregated_benchmark.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8] + - disaggregated/test_disaggregated_benchmark.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-fp8] - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp4] TIMEOUT (180) diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml index 1518613c1d0..730427e8890 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml @@ -74,13 +74,13 @@ l0_dgx_h100: - test_e2e.py::test_ptp_quickstart_advanced_bs1 - test_e2e.py::test_ptp_quickstart_advanced_deepseek_v3_lite_4gpus_adp_balance[DeepSeek-V3-Lite-FP8-DeepSeek-V3-Lite/fp8] # ------------- Disaggregated serving tests --------------- - - disaggregated/test_disaggregated.py::test_disaggregated_multi_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] - - disaggregated/test_disaggregated.py::test_disaggregated_multi_gpu_with_mpirun_trt_backend[TinyLlama-1.1B-Chat-v1.0] - - 
disaggregated/test_disaggregated.py::test_disaggregated_ctxpp2_genpp2[TinyLlama-1.1B-Chat-v1.0] - - disaggregated/test_disaggregated.py::test_disaggregated_ctxtp2_genpp2[TinyLlama-1.1B-Chat-v1.0] - - disaggregated/test_disaggregated.py::test_disaggregated_ctxpp2_gentp2[TinyLlama-1.1B-Chat-v1.0] - - disaggregated/test_disaggregated.py::test_disaggregated_ctxpp4_gentp4[TinyLlama-1.1B-Chat-v1.0] - - disaggregated/test_disaggregated.py::test_disaggregated_genbs1[TinyLlama-1.1B-Chat-v1.0] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[1ctx_tp2pp1_2gen_tp1pp1] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[1ctx_tp2pp1_2gen_tp1pp1_trt] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[1ctx_tp1pp2_1gen_tp1pp2] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[1ctx_tp2pp1_1gen_tp1pp2] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[1ctx_tp1pp2_1gen_tp2pp1] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[1ctx_tp1pp4_1gen_tp4pp1] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[gen_only_bs1] - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp1pp2] - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp1pp2] - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp2pp1] @@ -144,20 +144,19 @@ l0_dgx_h100: - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_cuda_graph_padding_4gpus[attention_dp=True-mtp_nextn=2] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus_static_eplb - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus[xgrammar-mtp_nextn=2] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_mpi[DeepSeek-V3-Lite-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp[DeepSeek-V3-Lite-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_overlap[DeepSeek-V3-Lite-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one[DeepSeek-V3-Lite-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one_mtp[DeepSeek-V3-Lite-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_tp1_attention_dp_overlap_one_mtp[DeepSeek-V3-Lite-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_overlap_cuda_graph[DeepSeek-V3-Lite-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_overlap_cuda_graph[DeepSeek-V3-Lite-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_bf16_cache_aware_balance[DeepSeek-V3-Lite-bf16] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_bf16_conditional[DeepSeek-V3-Lite-bf16] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_mpi] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_ucx] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_nixl] + - 
disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_adp] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_adp_overlap] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_mtp] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_adp_mtp] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_adp_overlap_cuda_graph] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_overlap_cuda_graph] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_tp1_mtp_adp_overlap] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_tp1_cache_aware_balance] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_tp1_conditional] - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=2] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ctxpp2_gentp2_one_mtp[DeepSeek-V3-Lite-fp8] - disaggregated/test_workers.py::test_workers_conditional_disaggregation_deepseek_v3_lite_bf16[DeepSeek-V3-Lite-bf16] - disaggregated/test_workers.py::test_workers_kv_cache_aware_router_deepseek_v3_lite_bf16[DeepSeek-V3-Lite-bf16] - condition: diff --git a/tests/integration/test_lists/test-db/l0_dgx_h200.yml b/tests/integration/test_lists/test-db/l0_dgx_h200.yml index 05935956a2b..d798b354761 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_h200.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_h200.yml @@ -31,8 +31,8 @@ l0_dgx_h200: - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=4] - accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[False] - accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True] - - disaggregated/test_disaggregated.py::test_disaggregated_ctxtp2pp2_gentp2pp2[TinyLlama-1.1B-Chat-v1.0] - - disaggregated/test_disaggregated.py::test_disaggregated_ctxpp4_genpp4[TinyLlama-1.1B-Chat-v1.0] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[1ctx_tp2pp2_1gen_tp2pp2] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[1ctx_tp1pp4_1gen_tp1pp4] - unittest/llmapi/test_llm_pytorch.py::test_nemotron_nas_lora - condition: ranges: @@ -119,10 +119,10 @@ l0_dgx_h200: - test_e2e.py::test_trtllm_bench_llmapi_launch[pytorch_backend-llama-v3-llama3-8b] - test_e2e.py::test_trtllm_bench_mgmn - unittest/_torch/multi_gpu -m "post_merge" TIMEOUT (90) - - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16] - - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] - - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-fp8] + - disaggregated/test_disaggregated_benchmark.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16] + - disaggregated/test_disaggregated_benchmark.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] + - disaggregated/test_disaggregated_benchmark.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8] + - disaggregated/test_disaggregated_benchmark.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-fp8] # ------------- AutoDeploy tests --------------- - 
accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-4] - condition: diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml index 3b7d94d38c3..d0ba7f1fbd9 100644 --- a/tests/integration/test_lists/test-db/l0_h100.yml +++ b/tests/integration/test_lists/test-db/l0_h100.yml @@ -72,11 +72,10 @@ l0_h100: - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_cuda_graph_padding[mtp_nextn=2] - test_e2e.py::test_trtllm_bench_pytorch_backend_sanity[meta-llama/Llama-3.1-8B-llama-3.1-8b-False-False] - test_e2e.py::test_trtllm_bench_pytorch_backend_sanity[meta-llama/Llama-3.1-8B-llama-3.1-8b-instruct-hf-fp8-True-True] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_tp1_single_gpu[DeepSeek-V3-Lite-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_tp1_single_gpu_mtp[DeepSeek-V3-Lite-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_tp1_two_mtp[DeepSeek-V3-Lite-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx_tp1_single_gpu[DeepSeek-V3-Lite-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_load_balance[TinyLlama-1.1B-Chat-v1.0] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_tp1] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_tp1_mtp] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_tp1_mtp2] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[load_balance] - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_deepseek[False-False-DeepSeek-V3-Lite-fp8/fp8] - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_deepseek[False-True-DeepSeek-V3-Lite-fp8/fp8] - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_deepseek[True-False-DeepSeek-V3-Lite-fp8/fp8]