diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance.yaml
deleted file mode 100644
index d64bac8763b..00000000000
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance.yaml
+++ /dev/null
@@ -1,46 +0,0 @@
-model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
-hostname: localhost
-port: 8000
-backend: "pytorch"
-cuda_graph_config: null
-free_gpu_memory_fraction: 0.1
-disable_overlap_scheduler: True
-enable_autotuner: False
-context_servers:
-  num_instances: 2
-  router:
-    type: kv_cache_aware
-  max_batch_size: 16
-  max_num_tokens: 3000
-  max_seq_len: 4096
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  kv_cache_config:
-    enable_block_reuse: True
-    enable_partial_reuse: False
-    event_buffer_max_size: 1024
-    free_gpu_memory_fraction: 0.1
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8001"
-    - "localhost:8002"
-generation_servers:
-  num_instances: 2
-  router:
-    type: kv_cache_aware
-  max_batch_size: 256
-  max_num_tokens: 4096
-  max_seq_len: 4096
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  cache_transceiver_config:
-    backend: DEFAULT
-  kv_cache_config:
-    enable_block_reuse: True
-    enable_partial_reuse: False
-    event_buffer_max_size: 1024
-    free_gpu_memory_fraction: 0.1
-  urls:
-    - "localhost:8003"
-    - "localhost:8004"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance_deepseek_v3.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance_deepseek_v3.yaml
deleted file mode 100644
index fe15f70085c..00000000000
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance_deepseek_v3.yaml
+++ /dev/null
@@ -1,39 +0,0 @@
-hostname: localhost
-port: 8000
-model: DeepSeek-V3-Lite/bf16
-backend: "pytorch"
-cuda_graph_config: null
-disable_overlap_scheduler: True
-enable_autotuner: False
-context_servers:
-  num_instances: 2
-  router:
-    type: kv_cache_aware
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  kv_cache_config:
-    enable_block_reuse: True
-    enable_partial_reuse: True
-    event_buffer_max_size: 1024
-    free_gpu_memory_fraction: 0.1
-  cache_transceiver_config:
-    backend: "DEFAULT"
-  urls:
-    - "localhost:8001"
-    - "localhost:8002"
-generation_servers:
-  num_instances: 2
-  router:
-    type: kv_cache_aware
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  kv_cache_config:
-    enable_block_reuse: True
-    enable_partial_reuse: True
-    event_buffer_max_size: 1024
-    free_gpu_memory_fraction: 0.1
-  cache_transceiver_config:
-    backend: "DEFAULT"
-  urls:
-    - "localhost:8003"
-    - "localhost:8004"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse.yaml
deleted file mode 100644
index 26444b1ab23..00000000000
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse.yaml
+++ /dev/null
@@ -1,35 +0,0 @@
-hostname: localhost
-port: 8000
-model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
-backend: "pytorch"
-cuda_graph_config: null
-disable_overlap_scheduler: True
-enable_autotuner: False
-context_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  kv_cache_config:
-    enable_block_reuse: True
-    enable_partial_reuse: True
-    event_buffer_max_size: 1024
-    free_gpu_memory_fraction: 0.15
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8001"
-generation_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  router:
-    type: kv_cache_aware
-  kv_cache_config:
-    enable_block_reuse: True
-    enable_partial_reuse: True
-    event_buffer_max_size: 1024
-    free_gpu_memory_fraction: 0.05
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse_deepseek_v3.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse_deepseek_v3.yaml
deleted file mode 100644
index 06a4c154b46..00000000000
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse_deepseek_v3.yaml
+++ /dev/null
@@ -1,35 +0,0 @@
-hostname: localhost
-port: 8000
-model: DeepSeek-V3-Lite/bf16
-free_gpu_memory_fraction: 0.15
-backend: "pytorch"
-cuda_graph_config: null
-disable_overlap_scheduler: True
-enable_autotuner: False
-context_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  kv_cache_config:
-    enable_block_reuse: True
-    enable_partial_reuse: True
-    event_buffer_max_size: 1024
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8001"
-generation_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  router:
-    type: kv_cache_aware
-  kv_cache_config:
-    enable_block_reuse: True
-    enable_partial_reuse: True
-    event_buffer_max_size: 1024
-    free_gpu_memory_fraction: 0.05
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_conditional.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_conditional.yaml
deleted file mode 100644
index 28816380fe4..00000000000
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_conditional.yaml
+++ /dev/null
@@ -1,38 +0,0 @@
-model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
-hostname: localhost
-port: 8000
-backend: "pytorch"
-cuda_graph_config: null
-free_gpu_memory_fraction: 0.15
-conditional_disagg_config:
-  max_local_prefill_length: 100
-disable_overlap_scheduler: True
-enable_autotuner: False
-context_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  kv_cache_config:
-    enable_block_reuse: True
-    enable_partial_reuse: True
-    event_buffer_max_size: 1024
-    free_gpu_memory_fraction: 0.15
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8001"
-generation_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  router:
-    type: kv_cache_aware
-  kv_cache_config:
-    enable_block_reuse: True
-    enable_partial_reuse: True
-    event_buffer_max_size: 1024
-    free_gpu_memory_fraction: 0.15
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_conditional_deepseek_v3.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_conditional_deepseek_v3.yaml
deleted file mode 100644
index b7f34202724..00000000000
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_conditional_deepseek_v3.yaml
+++ /dev/null
@@ -1,38 +0,0 @@
-hostname: localhost
-port: 8000
-model: DeepSeek-V3-Lite/bf16
-backend: "pytorch"
-cuda_graph_config: null
-free_gpu_memory_fraction: 0.15
-conditional_disagg_config:
-  max_local_prefill_length: 100
-disable_overlap_scheduler: True
-enable_autotuner: False
-context_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  kv_cache_config:
-    enable_block_reuse: True
-    enable_partial_reuse: True
-    event_buffer_max_size: 1024
-    free_gpu_memory_fraction: 0.15
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8001"
-generation_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  router:
-    type: kv_cache_aware
-  kv_cache_config:
-    enable_block_reuse: True
-    enable_partial_reuse: True
-    event_buffer_max_size: 1024
-    free_gpu_memory_fraction: 0.15
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxpp2_genpp2.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxpp2_genpp2.yaml
deleted file mode 100644
index b7f03c0f9f5..00000000000
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxpp2_genpp2.yaml
+++ /dev/null
@@ -1,38 +0,0 @@
-model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
-hostname: localhost
-port: 8000
-backend: "pytorch"
-cuda_graph_config: null
-free_gpu_memory_fraction: 0.2
-context_servers:
-  num_instances: 1
-  max_batch_size: 1
-  max_num_tokens: 3000
-  max_seq_len: 4096
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 2
-  kv_cache_config:
-    free_gpu_memory_fraction: 0.2
-    enable_partial_reuse: False
-    enable_block_reuse: False
-  disable_overlap_scheduler: True
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8001"
-generation_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 2
-  max_batch_size: 256
-  max_num_tokens: 4096
-  max_seq_len: 4096
-  kv_cache_config:
-    free_gpu_memory_fraction: 0.2
-    enable_partial_reuse: False
-    enable_block_reuse: False
-  disable_overlap_scheduler: True
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxpp2_gentp2.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxpp2_gentp2.yaml
deleted file mode 100644
index 892b4e8b31f..00000000000
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxpp2_gentp2.yaml
+++ /dev/null
@@ -1,37 +0,0 @@
-model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
-hostname: localhost
-port: 8000
-backend: "pytorch"
-cuda_graph_config: null
-free_gpu_memory_fraction: 0.2
-context_servers:
-  num_instances: 1
-  max_batch_size: 1
-  max_num_tokens: 3000
-  max_seq_len: 4096
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 2
-  kv_cache_config:
-    free_gpu_memory_fraction: 0.2
-    enable_partial_reuse: False
-    enable_block_reuse: False
-  disable_overlap_scheduler: True
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8001"
-generation_servers:
-  num_instances: 1
-  tensor_parallel_size: 2
-  pipeline_parallel_size: 1
-  max_batch_size: 256
-  max_num_tokens: 4096
-  max_seq_len: 4096
-  kv_cache_config:
-    free_gpu_memory_fraction: 0.2
-    enable_partial_reuse: False
-  disable_overlap_scheduler: True
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxpp4_genpp4.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxpp4_genpp4.yaml
deleted file mode 100644
index 2c7a67e1cbf..00000000000
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxpp4_genpp4.yaml
+++ /dev/null
@@ -1,38 +0,0 @@
-model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
-hostname: localhost
-port: 8000
-backend: "pytorch"
-cuda_graph_config: null
-free_gpu_memory_fraction: 0.2
-context_servers:
-  num_instances: 1
-  max_batch_size: 1
-  max_num_tokens: 3000
-  max_seq_len: 4096
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 4
-  kv_cache_config:
-    free_gpu_memory_fraction: 0.2
-    enable_partial_reuse: False
-    enable_block_reuse: False
-  disable_overlap_scheduler: True
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8001"
-generation_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 4
-  max_batch_size: 256
-  max_num_tokens: 4096
-  max_seq_len: 4096
-  kv_cache_config:
-    free_gpu_memory_fraction: 0.2
-    enable_partial_reuse: False
-    enable_block_reuse: False
-  disable_overlap_scheduler: True
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxpp4_gentp4.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxpp4_gentp4.yaml
deleted file mode 100644
index a1e4ad50a9c..00000000000
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxpp4_gentp4.yaml
+++ /dev/null
@@ -1,36 +0,0 @@
-model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
-hostname: localhost
-port: 8000
-backend: "pytorch"
-cuda_graph_config: null
-free_gpu_memory_fraction: 0.2
-context_servers:
-  num_instances: 1
-  max_batch_size: 1
-  max_num_tokens: 3000
-  max_seq_len: 4096
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 4
-  kv_cache_config:
-    free_gpu_memory_fraction: 0.2
-    enable_partial_reuse: False
-  disable_overlap_scheduler: True
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8001"
-generation_servers:
-  num_instances: 1
-  tensor_parallel_size: 4
-  pipeline_parallel_size: 1
-  max_batch_size: 256
-  max_num_tokens: 4096
-  max_seq_len: 4096
-  kv_cache_config:
-    free_gpu_memory_fraction: 0.2
-    enable_partial_reuse: False
-  disable_overlap_scheduler: True
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite.yaml
deleted file mode 100644
index 83f9b3a3e87..00000000000
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite.yaml
+++ /dev/null
@@ -1,23 +0,0 @@
-hostname: localhost
-port: 8000
-model: DeepSeek-V3-Lite/fp8
-free_gpu_memory_fraction: 0.1
-backend: "pytorch"
-cuda_graph_config: null
-disable_overlap_scheduler: True
-context_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8001"
-generation_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp.yaml
deleted file mode 100644
index 57eb4ea0041..00000000000
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp.yaml
+++ /dev/null
@@ -1,28 +0,0 @@
-hostname: localhost
-port: 8000
-model: DeepSeek-V3-Lite/fp8
-free_gpu_memory_fraction: 0.1
-backend: "pytorch"
-cuda_graph_config: null
-disable_overlap_scheduler: True
-speculative_config:
-  decoding_type: MTP
-  num_nextn_predict_layers: 1
-context_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  enable_attention_dp: true
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8001"
-generation_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  enable_attention_dp: false
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_attention_dp_overlap.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_attention_dp_overlap.yaml
deleted file mode 100644
index 4343850c77f..00000000000
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_attention_dp_overlap.yaml
+++ /dev/null
@@ -1,29 +0,0 @@
-hostname: localhost
-port: 8000
-model: DeepSeek-V3-Lite/fp8
-free_gpu_memory_fraction: 0.1
-backend: "pytorch"
-cuda_graph_config: null
-speculative_config:
-  decoding_type: MTP
-  num_nextn_predict_layers: 1
-context_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  enable_attention_dp: true
-  disable_overlap_scheduler: True
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8001"
-generation_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  enable_attention_dp: true
-  disable_overlap_scheduler: False
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_ctxpp2_gentp2.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_ctxpp2_gentp2.yaml
deleted file mode 100644
index 4a61497e94e..00000000000
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_ctxpp2_gentp2.yaml
+++ /dev/null
@@ -1,32 +0,0 @@
-hostname: localhost
-port: 8000
-model: DeepSeek-V3-Lite/fp8
-free_gpu_memory_fraction: 0.1
-backend: "pytorch"
-cuda_graph_config: null
-disable_overlap_scheduler: True
-
-context_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 2
-  enable_attention_dp: false
-  speculative_config:
-    decoding_type: MTP
-    num_nextn_predict_layers: 1
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8001"
-generation_servers:
-  num_instances: 1
-  tensor_parallel_size: 2
-  pipeline_parallel_size: 1
-  enable_attention_dp: false
-  speculative_config:
-    decoding_type: MTP
-    num_nextn_predict_layers: 1
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
    - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_two_mtp.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_two_mtp.yaml
deleted file mode 100644
index 837e5df8e33..00000000000
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_two_mtp.yaml
+++ /dev/null
@@ -1,28 +0,0 @@
-hostname: localhost
-port: 8000
-model: DeepSeek-V3-Lite/fp8
-free_gpu_memory_fraction: 0.1
-backend: "pytorch"
-cuda_graph_config: null
-disable_overlap_scheduler: True
-speculative_config:
-  decoding_type: MTP
-  num_nextn_predict_layers: 2
-context_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  enable_attention_dp: true
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8001"
-generation_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  enable_attention_dp: false
-  urls:
-    - "localhost:8002"
-  cache_transceiver_config:
-    backend: DEFAULT
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_genpp2.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_genpp2.yaml
deleted file mode 100644
index ce53fd4626b..00000000000
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_genpp2.yaml
+++ /dev/null
@@ -1,36 +0,0 @@
-model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
-hostname: localhost
-port: 8000
-backend: "pytorch"
-cuda_graph_config: null
-free_gpu_memory_fraction: 0.2
-context_servers:
-  num_instances: 1
-  max_batch_size: 1
-  max_num_tokens: 3000
-  max_seq_len: 4096
-  tensor_parallel_size: 2
-  pipeline_parallel_size: 1
-  kv_cache_config:
-    free_gpu_memory_fraction: 0.2
-    enable_partial_reuse: False
-  disable_overlap_scheduler: True
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8001"
-generation_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 2
-  max_batch_size: 256
-  max_num_tokens: 4096
-  max_seq_len: 4096
-  kv_cache_config:
-    free_gpu_memory_fraction: 0.2
-    enable_partial_reuse: False
-  disable_overlap_scheduler: True
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1.yaml
deleted file mode 100644
index 1335d63adfe..00000000000
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1.yaml
+++ /dev/null
@@ -1,24 +0,0 @@
-hostname: localhost
-port: 8000
-model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
-free_gpu_memory_fraction: 0.25
-backend: "pytorch"
-cuda_graph_config: null
-disable_overlap_scheduler: True
-context_servers:
-  num_instances: 1
-  tensor_parallel_size: 2
-  pipeline_parallel_size: 1
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8001"
-generation_servers:
-  num_instances: 2
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8002"
-    - "localhost:8003"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1_trt_backend.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1_trt_backend.yaml
deleted file mode 100644
index fa5dffa518b..00000000000
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1_trt_backend.yaml
+++ /dev/null
@@ -1,22 +0,0 @@
-hostname: localhost
-port: 8000
-model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
-free_gpu_memory_fraction: 0.25
-backend: "trt"
-context_servers:
-  num_instances: 1
-  tensor_parallel_size: 2
-  pipeline_parallel_size: 1
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8001"
-generation_servers:
-  num_instances: 2
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  cache_transceiver_config:
-    backend: DEFAULT
-  urls:
-    - "localhost:8002"
-    - "localhost:8003"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite.yaml
deleted file mode 100644
index 6b22665e9f1..00000000000
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite.yaml +++ /dev/null @@ -1,23 +0,0 @@ -hostname: localhost -port: 8000 -model: DeepSeek-V3-Lite/fp8 -free_gpu_memory_fraction: 0.25 -backend: "pytorch" -cuda_graph_config: null -disable_overlap_scheduler: True -context_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8001" -generation_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp.yaml deleted file mode 100644 index 80a1a3636a8..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp.yaml +++ /dev/null @@ -1,25 +0,0 @@ -hostname: localhost -port: 8000 -model: DeepSeek-V3-Lite/fp8 -free_gpu_memory_fraction: 0.25 -backend: "pytorch" -cuda_graph_config: null -disable_overlap_scheduler: True -context_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - enable_attention_dp: True - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8001" -generation_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - enable_attention_dp: True - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one.yaml deleted file mode 100644 index 9dfb092151a..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one.yaml +++ /dev/null @@ -1,25 +0,0 @@ -hostname: localhost -port: 8000 -model: DeepSeek-V3-Lite/fp8 -free_gpu_memory_fraction: 0.25 -backend: "pytorch" -cuda_graph_config: null -disable_overlap_scheduler: True -context_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - enable_attention_dp: true - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8001" -generation_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - enable_attention_dp: false - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one_mtp.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one_mtp.yaml deleted file mode 100644 index 4b6bc571dab..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one_mtp.yaml +++ /dev/null @@ -1,29 +0,0 @@ -hostname: localhost -port: 8000 -model: DeepSeek-V3-Lite/fp8 -free_gpu_memory_fraction: 0.25 -backend: "pytorch" -cuda_graph_config: null -disable_overlap_scheduler: True -speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 1 -context_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - enable_attention_dp: true - cache_transceiver_config: - 
backend: DEFAULT - urls: - - "localhost:8001" -generation_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - enable_attention_dp: false - cache_transceiver_config: - backend: DEFAULT - - urls: - - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap.yaml deleted file mode 100644 index 26218586f49..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap.yaml +++ /dev/null @@ -1,26 +0,0 @@ -hostname: localhost -port: 8000 -model: DeepSeek-V3-Lite/fp8 -backend: "pytorch" -cuda_graph_config: null -free_gpu_memory_fraction: 0.2 -context_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - enable_attention_dp: True - disable_overlap_scheduler: True - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8001" -generation_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - enable_attention_dp: True - disable_overlap_scheduler: False - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml deleted file mode 100644 index 99034f8a1a3..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml +++ /dev/null @@ -1,27 +0,0 @@ -hostname: localhost -port: 8000 -model: DeepSeek-V3-Lite/fp8 -free_gpu_memory_fraction: 0.25 -backend: "pytorch" -context_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - enable_attention_dp: true - disable_overlap_scheduler: True - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8001" -generation_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - enable_attention_dp: true - cuda_graph_config: - enable_padding: False - disable_overlap_scheduler: False - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_mpi.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_mpi.yaml deleted file mode 100644 index 4cfe18ebaf6..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_mpi.yaml +++ /dev/null @@ -1,22 +0,0 @@ -hostname: localhost -port: 8000 -model: DeepSeek-V3-Lite/fp8 -free_gpu_memory_fraction: 0.25 -backend: "pytorch" -disable_overlap_scheduler: True -context_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - cache_transceiver_config: - backend: "MPI" - urls: - - "localhost:8001" -generation_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - cache_transceiver_config: - backend: "MPI" - urls: - - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_nixl.yaml 
b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_nixl.yaml deleted file mode 100644 index 3b1aa8fc0e3..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_nixl.yaml +++ /dev/null @@ -1,22 +0,0 @@ -hostname: localhost -port: 8000 -model: DeepSeek-V3-Lite/fp8 -free_gpu_memory_fraction: 0.25 -backend: "pytorch" -disable_overlap_scheduler: True -context_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - cache_transceiver_config: - backend: "NIXL" - urls: - - "localhost:8001" -generation_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - cache_transceiver_config: - backend: "NIXL" - urls: - - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml deleted file mode 100644 index 4c601fbb868..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml +++ /dev/null @@ -1,25 +0,0 @@ -hostname: localhost -port: 8000 -model: DeepSeek-V3-Lite/fp8 -free_gpu_memory_fraction: 0.25 -backend: "pytorch" -context_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - disable_overlap_scheduler: True - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8001" -generation_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - cuda_graph_config: - enable_padding: False - disable_overlap_scheduler: False - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_ucx.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_ucx.yaml deleted file mode 100644 index d3395938cae..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_ucx.yaml +++ /dev/null @@ -1,22 +0,0 @@ -hostname: localhost -port: 8000 -model: DeepSeek-V3-Lite/fp8 -free_gpu_memory_fraction: 0.25 -backend: "pytorch" -disable_overlap_scheduler: True -context_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - cache_transceiver_config: - backend: "UCX" - urls: - - "localhost:8001" -generation_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - cache_transceiver_config: - backend: "UCX" - urls: - - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2pp2_gentp2pp2.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2pp2_gentp2pp2.yaml deleted file mode 100644 index ce47009aaad..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2pp2_gentp2pp2.yaml +++ /dev/null @@ -1,38 +0,0 @@ -model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 -hostname: localhost -port: 8000 -backend: "pytorch" -cuda_graph_config: null -free_gpu_memory_fraction: 0.2 -context_servers: - num_instances: 1 - max_batch_size: 1 - max_num_tokens: 3000 - max_seq_len: 4096 - tensor_parallel_size: 2 - pipeline_parallel_size: 2 - kv_cache_config: - free_gpu_memory_fraction: 0.2 - enable_partial_reuse: False - enable_block_reuse: False - disable_overlap_scheduler: True - 
cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8001" -generation_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 2 - max_batch_size: 256 - max_num_tokens: 4096 - max_seq_len: 4096 - kv_cache_config: - free_gpu_memory_fraction: 0.2 - enable_partial_reuse: False - enable_block_reuse: False - disable_overlap_scheduler: True - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml deleted file mode 100644 index 56db3df7697..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml +++ /dev/null @@ -1,39 +0,0 @@ -model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 -hostname: localhost -port: 8000 -backend: "pytorch" -context_servers: - num_instances: 1 - max_batch_size: 1 - max_num_tokens: 3000 - max_seq_len: 4096 - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - kv_cache_config: - free_gpu_memory_fraction: 0.2 - enable_partial_reuse: False - cuda_graph_config: - batch_sizes: [1,3000] - disable_overlap_scheduler: True - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8001" -generation_servers: - num_instances: 1 - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - max_batch_size: 256 - max_num_tokens: 4096 - max_seq_len: 4096 - kv_cache_config: - free_gpu_memory_fraction: 0.2 - enable_partial_reuse: False - cuda_graph_config: - enable_padding: True - batch_sizes: [1,4,8,16,24,32] - disable_overlap_scheduler: True - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_diff_max_tokens.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_diff_max_tokens.yaml deleted file mode 100644 index 26d1f6b6c15..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_diff_max_tokens.yaml +++ /dev/null @@ -1,23 +0,0 @@ -hostname: localhost -port: 8000 -model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 -free_gpu_memory_fraction: 0.25 -backend: "pytorch" -cuda_graph_config: null -disable_overlap_scheduler: True -context_servers: - num_instances: 1 - max_num_tokens: 512 - max_batch_size: 64 - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8001" -generation_servers: - num_instances: 1 - max_num_tokens: 256 - max_batch_size: 32 - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only.yaml deleted file mode 100644 index 92b13837644..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only.yaml +++ /dev/null @@ -1,21 +0,0 @@ -hostname: localhost -port: 8000 -model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 -backend: "pytorch" -cuda_graph_config: null -context_servers: - num_instances: 0 -generation_servers: - num_instances: 2 - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - kv_cache_config: - free_gpu_memory_fraction: 0.2 - enable_block_reuse: False - enable_partial_reuse: False - cache_transceiver_config: - backend: DEFAULT - print_iter_log: True - urls: - - "localhost:8002" - - "localhost:8003" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only_bs1.yaml 
b/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only_bs1.yaml deleted file mode 100644 index 19d1eca714f..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only_bs1.yaml +++ /dev/null @@ -1,37 +0,0 @@ -model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 -hostname: localhost -port: 8000 -backend: "pytorch" -cuda_graph_config: null -free_gpu_memory_fraction: 0.2 -context_servers: - num_instances: 1 - max_batch_size: 1 - max_num_tokens: 3000 - max_seq_len: 4096 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - enable_attention_dp: true - kv_cache_config: - free_gpu_memory_fraction: 0.2 - enable_partial_reuse: False - disable_overlap_scheduler: True - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8001" -generation_servers: - num_instances: 1 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - enable_attention_dp: true - max_batch_size: 1 - max_num_tokens: 4096 - max_seq_len: 4096 - kv_cache_config: - free_gpu_memory_fraction: 0.2 - enable_partial_reuse: False - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only_trt_backend.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only_trt_backend.yaml deleted file mode 100644 index ad706f8bf1f..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only_trt_backend.yaml +++ /dev/null @@ -1,19 +0,0 @@ -hostname: localhost -port: 8000 -model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 -backend: "trt" -context_servers: - num_instances: 0 -generation_servers: - num_instances: 2 - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - kv_cache_config: - free_gpu_memory_fraction: 0.2 - enable_block_reuse: False - enable_partial_reuse: False - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8002" - - "localhost:8003" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_load_balance.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_load_balance.yaml deleted file mode 100644 index f0593d9ef60..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_load_balance.yaml +++ /dev/null @@ -1,44 +0,0 @@ -model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 -hostname: localhost -port: 8000 -backend: "pytorch" -cuda_graph_config: null -free_gpu_memory_fraction: 0.15 -context_servers: - num_instances: 2 - router: - type: load_balancing - use_tokens: True - max_batch_size: 1 - max_num_tokens: 3000 - max_seq_len: 4096 - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - kv_cache_config: - free_gpu_memory_fraction: 0.15 - enable_partial_reuse: False - disable_overlap_scheduler: True - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8001" - - "localhost:8002" -generation_servers: - num_instances: 2 - router: - type: load_balancing - use_tokens: False - max_batch_size: 256 - max_num_tokens: 4096 - max_seq_len: 4096 - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - kv_cache_config: - free_gpu_memory_fraction: 0.15 - enable_partial_reuse: False - disable_overlap_scheduler: False - cache_transceiver_config: - backend: "DEFAULT" - urls: - - "localhost:8003" - - "localhost:8004" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_metrics.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_metrics.yaml deleted file mode 100644 index 6d566aa4f99..00000000000 --- 
a/tests/integration/defs/disaggregated/test_configs/disagg_config_metrics.yaml +++ /dev/null @@ -1,28 +0,0 @@ -hostname: localhost -port: 8000 -model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 -free_gpu_memory_fraction: 0.25 -backend: "pytorch" -cuda_graph_config: null -disable_overlap_scheduler: True -perf_metrics_max_requests: 1000 -context_servers: - num_instances: 1 - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - return_perf_metrics: True - perf_metrics_max_requests: 1000 - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8001" -generation_servers: - num_instances: 1 - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - return_perf_metrics: True - perf_metrics_max_requests: 1000 - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_mixed.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_mixed.yaml deleted file mode 100644 index 27d7ec4ee82..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_mixed.yaml +++ /dev/null @@ -1,24 +0,0 @@ -hostname: localhost -port: 8000 -model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 -free_gpu_memory_fraction: 0.25 -backend: "pytorch" -cuda_graph_config: null -disable_overlap_scheduler: True -context_servers: - num_instances: 1 - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8001" -generation_servers: - num_instances: 2 - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8001" - - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ngram.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ngram.yaml deleted file mode 100644 index 4e3417c732a..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ngram.yaml +++ /dev/null @@ -1,29 +0,0 @@ -hostname: localhost -port: 8000 -model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 -free_gpu_memory_fraction: 0.1 -backend: pytorch -disable_overlap_scheduler: True -context_servers: - num_instances: 1 - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - cache_transceiver_config: - backend: "DEFAULT" - urls: - - "localhost:8001" -generation_servers: - num_instances: 1 - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - cache_transceiver_config: - backend: "DEFAULT" - urls: - - "localhost:8002" - speculative_config: - decoding_type: NGram - max_draft_len: 4 - max_matching_ngram_size: 4 - is_keep_all: True - is_use_oldest: True - is_public_pool: True diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_overlap.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_overlap.yaml deleted file mode 100644 index 55990bbaa62..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_overlap.yaml +++ /dev/null @@ -1,36 +0,0 @@ -model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 -hostname: localhost -port: 8000 -backend: "pytorch" -cuda_graph_config: null -free_gpu_memory_fraction: 0.2 -context_servers: - num_instances: 1 - max_batch_size: 1 - max_num_tokens: 3000 - max_seq_len: 4096 - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - kv_cache_config: - free_gpu_memory_fraction: 0.2 - enable_partial_reuse: False - disable_overlap_scheduler: True - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8001" -generation_servers: - num_instances: 1 - 
tensor_parallel_size: 1 - pipeline_parallel_size: 1 - max_batch_size: 256 - max_num_tokens: 4096 - max_seq_len: 4096 - kv_cache_config: - free_gpu_memory_fraction: 0.2 - enable_partial_reuse: False - disable_overlap_scheduler: False - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_trt_backend.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_trt_backend.yaml deleted file mode 100644 index 3eb275c87e0..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_trt_backend.yaml +++ /dev/null @@ -1,23 +0,0 @@ -hostname: localhost -port: 8000 -model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 -free_gpu_memory_fraction: 0.25 -backend: "trt" -context_servers: - num_instances: 1 - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - kv_cache_config: - free_gpu_memory_fraction: 0.2 - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8001" -generation_servers: - num_instances: 1 - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - cache_transceiver_config: - backend: DEFAULT - urls: - - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_trtllm_sampler.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_trtllm_sampler.yaml deleted file mode 100644 index 287d1103a4f..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_trtllm_sampler.yaml +++ /dev/null @@ -1,38 +0,0 @@ -model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 -hostname: localhost -port: 8000 -backend: "pytorch" -cuda_graph_config: null -free_gpu_memory_fraction: 0.2 -context_servers: - num_instances: 1 - max_batch_size: 1 - max_num_tokens: 3000 - max_seq_len: 4096 - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - sampler_type: "TRTLLMSampler" - kv_cache_config: - free_gpu_memory_fraction: 0.2 - enable_partial_reuse: False - cache_transceiver_config: - backend: "DEFAULT" - disable_overlap_scheduler: True - urls: - - "localhost:8001" -generation_servers: - num_instances: 1 - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - max_batch_size: 256 - max_num_tokens: 4096 - max_seq_len: 4096 - sampler_type: "TRTLLMSampler" - kv_cache_config: - free_gpu_memory_fraction: 0.2 - enable_partial_reuse: False - cache_transceiver_config: - backend: "DEFAULT" - disable_overlap_scheduler: False - urls: - - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_disaggregated.py b/tests/integration/defs/disaggregated/test_disaggregated.py deleted file mode 100644 index 720da1acbdc..00000000000 --- a/tests/integration/defs/disaggregated/test_disaggregated.py +++ /dev/null @@ -1,1720 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import contextlib -import os -import re -import subprocess -import tempfile -from typing import Callable - -import pytest -import yaml -from defs.common import wait_for_server -from defs.conftest import (get_sm_version, llm_models_root, skip_arm, - skip_no_hopper) -from defs.trt_test_alternative import check_call, check_output, popen - -from tensorrt_llm._utils import mpi_disabled -from tensorrt_llm.logger import logger - - -def cleanup_output_files(): - """Clean up output files from previous runs.""" - for file in ['output.json', 'output_streaming.json']: - try: - os.remove(file) - except FileNotFoundError: - pass - - -def validate_timing_metrics(perf_metrics_item, request_context=""): - """ - Helper function to validate timing metrics relationships. - - Args: - perf_metrics_item: A single performance metrics item from the /perf_metrics endpoint - request_context: String context for error messages (e.g., "request 1", "streaming") - """ - # Validate basic structure - required_keys = [ - "ctx_server", "gen_server", "ctx_perf_metrics", "gen_perf_metrics", - "disagg_server_arrival_time", "disagg_server_first_token_time" - ] - for key in required_keys: - assert key in perf_metrics_item, f"Missing key: {key} in {request_context}" - - assert perf_metrics_item["ctx_perf_metrics"][ - "ctx_request_id"] == perf_metrics_item["gen_perf_metrics"][ - "ctx_request_id"] - - # Extract timing metrics - ctx_metrics = perf_metrics_item["ctx_perf_metrics"]["perf_metrics"][ - "timing_metrics"] - gen_metrics = perf_metrics_item["gen_perf_metrics"]["perf_metrics"][ - "timing_metrics"] - disagg_arrival = perf_metrics_item["disagg_server_arrival_time"] - disagg_first_token = perf_metrics_item["disagg_server_first_token_time"] - - # Validate disaggregated server timing metrics - assert disagg_arrival is not None, f"disagg_server_arrival_time is None in {request_context}" - assert disagg_first_token is not None, f"disagg_server_first_token_time is None in {request_context}" - assert isinstance( - disagg_arrival, - (int, float - )), f"disagg_server_arrival_time is not numeric in {request_context}" - assert isinstance( - disagg_first_token, (int, float) - ), f"disagg_server_first_token_time is not numeric in {request_context}" - assert disagg_arrival > 0, f"disagg_server_arrival_time is not positive in {request_context}" - assert disagg_first_token > 0, f"disagg_server_first_token_time is not positive in {request_context}" - assert disagg_arrival <= disagg_first_token, f"disagg_server_arrival_time > disagg_server_first_token_time in {request_context}" - - # Validate server-level timing metrics for context server - ctx_server_arrival = ctx_metrics.get("server_arrival_time") - ctx_server_first_token = ctx_metrics.get("server_first_token_time") - assert ctx_server_arrival is not None, f"ctx server_arrival_time is None in {request_context}" - assert ctx_server_first_token is not None, f"ctx server_first_token_time is None in {request_context}" - assert isinstance( - ctx_server_arrival, - (int, - float)), f"ctx server_arrival_time is not numeric in {request_context}" - assert isinstance( - ctx_server_first_token, - (int, float - )), f"ctx server_first_token_time is not numeric in {request_context}" - assert ctx_server_arrival <= ctx_server_first_token, f"ctx server_arrival_time > server_first_token_time in {request_context}" - assert ctx_metrics["last_token_time"] - ctx_server_first_token < 1e-3 - - # Validate server-level timing metrics for generation server - gen_server_arrival = gen_metrics.get("server_arrival_time") 
- gen_server_first_token = gen_metrics.get("server_first_token_time") - assert gen_server_arrival is not None, f"gen server_arrival_time is None in {request_context}" - assert gen_server_first_token is not None, f"gen server_first_token_time is None in {request_context}" - assert isinstance( - gen_server_arrival, - (int, - float)), f"gen server_arrival_time is not numeric in {request_context}" - assert isinstance( - gen_server_first_token, - (int, float - )), f"gen server_first_token_time is not numeric in {request_context}" - assert gen_server_arrival <= gen_server_first_token, f"gen server_arrival_time > server_first_token_time in {request_context}" - - # Validate timing relationships between different levels - # Disaggregated server should receive request before individual servers - assert disagg_arrival <= ctx_server_arrival, f"disagg_arrival > ctx_server_arrival in {request_context}" - assert disagg_arrival <= gen_server_arrival, f"disagg_arrival > gen_server_arrival in {request_context}" - - # Context should complete before generation starts - assert ctx_server_first_token <= gen_server_arrival, f"ctx_server_first_token > gen_server_arrival in {request_context}" - - # Validate internal timing consistency - ctx_arrival_time = ctx_metrics["arrival_time"] - ctx_first_token_time = ctx_metrics["first_token_time"] - gen_arrival_time = gen_metrics["arrival_time"] - gen_first_token_time = gen_metrics["first_token_time"] - - assert ctx_arrival_time <= ctx_first_token_time, f"ctx arrival_time > first_token_time in {request_context}" - assert gen_arrival_time <= gen_first_token_time, f"gen arrival_time > first_token_time in {request_context}" - - # Test KV cache transfer timing (if present) - if "kv_cache_transfer_start" in gen_metrics and "kv_cache_transfer_end" in gen_metrics: - kv_start = gen_metrics["kv_cache_transfer_start"] - kv_end = gen_metrics["kv_cache_transfer_end"] - assert gen_metrics["kv_cache_size"] > 0 - assert kv_start <= kv_end, f"kv_cache_transfer_start > kv_cache_transfer_end in {request_context}" - assert gen_arrival_time <= kv_start, f"gen_arrival_time > kv_cache_transfer_start in {request_context}" - assert kv_end <= gen_metrics[ - "first_scheduled_time"], f"kv_cache_transfer_end > first_scheduled_time in {request_context}" - - return True - - -def get_disagg_server_url_from_cfg(config_file: str) -> str: - with open(config_file, 'r') as file: - config = yaml.safe_load(file) - server_host = config.get('hostname', 'localhost') - server_port = config.get('port', 8000) - return f"http://{server_host}:{server_port}" - - -def get_test_config(test_desc, example_dir, test_root): - """Get test configuration based on test description.""" - test_configs_root = f"{test_root}/test_configs" - config_map = { - "2_ranks_diff_max_tokens": - (2, f"{test_configs_root}/disagg_config_diff_max_tokens.yaml"), - "2_ranks": (2, f"{example_dir}/disagg_config.yaml"), - "2_ranks_trt_backend": - (2, f"{test_configs_root}/disagg_config_trt_backend.yaml"), - "gen_only": (2, f"{test_configs_root}/disagg_config_gen_only.yaml"), - "gen_only_trt_backend": - (2, f"{test_configs_root}/disagg_config_gen_only_trt_backend.yaml"), - "gen_only_bs1": - (4, f"{test_configs_root}/disagg_config_gen_only_bs1.yaml"), - "4_ranks": (4, f"{test_configs_root}/disagg_config_ctxtp2_gentp1.yaml"), - "4_ranks_trt_backend": - (4, - f"{test_configs_root}/disagg_config_ctxtp2_gentp1_trt_backend.yaml"), - "cuda_graph": - (2, f"{test_configs_root}/disagg_config_cuda_graph_padding.yaml"), - "mixed": (2, 
f"{test_configs_root}/disagg_config_mixed.yaml"), - "overlap": (2, f"{test_configs_root}/disagg_config_overlap.yaml"), - "perf_metrics": (2, f"{test_configs_root}/disagg_config_metrics.yaml"), - "trtllm_sampler": - (2, f"{test_configs_root}/disagg_config_trtllm_sampler.yaml"), - "load_balance": - (4, f"{test_configs_root}/disagg_config_load_balance.yaml"), - "cache_aware_balance": - (4, f"{test_configs_root}/disagg_config_cache_aware_balance.yaml"), - "conditional": (2, - f"{test_configs_root}/disagg_config_conditional.yaml"), - "ngram": (2, f"{test_configs_root}/disagg_config_ngram.yaml"), - "ctxpp2_genpp2": - (4, f"{test_configs_root}/disagg_config_ctxpp2_genpp2.yaml"), - "ctxtp2_genpp2": - (4, f"{test_configs_root}/disagg_config_ctxtp2_genpp2.yaml"), - "ctxpp2_gentp2": - (4, f"{test_configs_root}/disagg_config_ctxpp2_gentp2.yaml"), - "ctxtp2pp2_gentp2pp2": - (8, f"{test_configs_root}/disagg_config_ctxtp2pp2_gentp2pp2.yaml"), - "ctxpp4_genpp4": - (8, f"{test_configs_root}/disagg_config_ctxpp4_genpp4.yaml"), - "ctxpp4_gentp4": - (8, f"{test_configs_root}/disagg_config_ctxpp4_gentp4.yaml"), - "deepseek_v3_lite_fp8_mpi": - (4, - f"{test_configs_root}/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_mpi.yaml" - ), - "deepseek_v3_lite_fp8_ucx": - (4, - f"{test_configs_root}/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_ucx.yaml" - ), - "deepseek_v3_lite_fp8_nixl": - (4, - f"{test_configs_root}/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_nixl.yaml" - ), - "deepseek_v3_lite_fp8_tp1": - (2, - f"{test_configs_root}/disagg_config_ctxtp1_gentp1_deepseek_v3_lite.yaml" - ), - "deepseek_v3_lite_fp8_tp1_mtp": - (2, - f"{test_configs_root}/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp.yaml" - ), - "deepseek_v3_lite_fp_8_overlap_dp": - (2, - f"{test_configs_root}/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_overlap_dp.yaml" - ), - "deepseek_v3_lite_fp8_attention_dp": - (4, - f"{test_configs_root}/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp.yaml" - ), - "deepseek_v3_lite_fp_8_attention_dp_overlap": - (4, - f"{test_configs_root}/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap.yaml" - ), - "deepseek_v3_lite_fp8_attention_dp_overlap_cuda_graph": - (4, - f"{test_configs_root}/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml" - ), - "deepseek_v3_lite_fp8_overlap_cuda_graph": - (4, - f"{test_configs_root}/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml" - ), - "deepseek_v3_lite_fp8_attention_dp_one": - (4, - f"{test_configs_root}/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one.yaml" - ), - "deepseek_v3_lite_fp8_attention_dp_one_mtp": - (4, - f"{test_configs_root}/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one_mtp.yaml" - ), - "deepseek_v3_lite_fp8_tp1_attention_dp_overlap_one_mtp": - (2, - f"{test_configs_root}/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_attention_dp_overlap.yaml" - ), - "deepseek_v3_lite_bf16_cache_aware_balance": - (4, - f"{test_configs_root}/disagg_config_cache_aware_balance_deepseek_v3.yaml" - ), - "deepseek_v3_lite_bf16_conditional": - (2, f"{test_configs_root}/disagg_config_conditional_deepseek_v3.yaml"), - "deepseek_v3_lite_fp8_tp1_two_mtp": - (2, - f"{test_configs_root}/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_two_mtp.yaml" - ), - "deepseek_v3_lite_fp8_ctxpp2_gentp2_one_mtp": - (4, - f"{test_configs_root}/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_ctxpp2_gentp2.yaml" - ), - } - - if test_desc not in config_map: - raise ValueError(f"Invalid test 
description: {test_desc}, " - f"valid descriptions are: {config_map.keys()}") - - return config_map[test_desc] - - -def get_extra_llm_config(config, suffix, cwd): - extra_llm_config = { - 'orchestrator_type': 'ray', - } - for key, value in config.items(): - if key not in ['num_instances', 'urls']: - extra_llm_config[key] = value - - temp_fd, extra_config_file = tempfile.mkstemp(suffix='_%s.yaml' % suffix, - dir=cwd) - with os.fdopen(temp_fd, 'w') as f: - yaml.dump(extra_llm_config, f) - - return extra_config_file - - -def generate_worker_commands(model_path, config, server_config, - extra_config_file, server_role): - worker_commands = [] - - assert model_path, "model path is required." - - for url in server_config['urls']: - host, port = url.split(':') - cmd = [ - 'trtllm-serve', model_path, '--host', host, '--port', port, - '--backend', config['backend'], '--extra_llm_api_options', - extra_config_file, '--server_role', server_role - ] - worker_commands.append(cmd) - return worker_commands - - -def run_client_tests(example_dir, - config_file, - test_desc, - num_iters, - env, - server_start_timeout, - prompt_file, - extra_endpoints_test, - server_url, - workers_proc, - server_proc, - use_ray=False): - """Run client tests against the disaggregated server.""" - client_dir = f"{example_dir}/clients" - for _ in range(num_iters): - client_cmd = [ - 'python3', f'{client_dir}/disagg_client.py', '-c', f'{config_file}', - '-p', f'{client_dir}/{prompt_file}', '--ignore-eos', - '--server-start-timeout', - str(server_start_timeout) - ] - if prompt_file == "long_prompts.json": - # Use max_tokens 4 for long prompts to reduce test time - client_cmd.extend(['--max-tokens', '4']) - - # Prepare poll processes - worker_processes = [] - if use_ray: - for proc_cm in workers_proc: - worker_processes.append(proc_cm.__enter__()) - else: - worker_processes = [workers_proc] - - poll_procs = worker_processes + [server_proc] - check_call(client_cmd, env=env, poll_procs=poll_procs) - - # Streaming client run - streaming_client_cmd = client_cmd + [ - '--streaming', '-o', 'output_streaming.json' - ] - check_call(streaming_client_cmd, env=env, poll_procs=poll_procs) - - # Run the chat completion endpoint test only for TinyLlama - if test_desc == "overlap" or test_desc == "trtllm_sampler": - chat_client_cmd = client_cmd + [ - '-e', 'chat', '-o', 'output_chat.json' - ] - check_call(chat_client_cmd, env=env, poll_procs=poll_procs) - - streaming_chat_client_cmd = chat_client_cmd + [ - '--streaming', '-o', 'output_streaming_chat.json' - ] - check_call(streaming_chat_client_cmd, - env=env, - poll_procs=poll_procs) - - # Skip output verification for long prompts test - if prompt_file == "long_prompts.json": - continue - - if extra_endpoints_test is not None: - extra_endpoints_test(server_url) - - # Verify outputs - not_expected_strings = ["Berlin Berlin"] - - output_files = ['output.json', 'output_streaming.json'] - if test_desc == "overlap" or test_desc == "trtllm_sampler": - # Disable streaming chat completion for overlap test - # due to bug - output_files.extend(['output_chat.json']) - - if test_desc.startswith("gen_only"): - continue - - for output_file in output_files: - with open(output_file, 'r') as f: - content = f.read() - if "deepseek_v3_lite" in test_desc or output_file == "output_chat.json": - expected_strings = [ - "Berlin", ["Asyncio is a", "Asyncio module in"] - ] - else: - expected_strings = [ - "The capital of Germany is Berlin", - "Asyncio is a Python library" - ] - for expected_string in expected_strings: - if 
isinstance(expected_string, list): - # At least one of the strings in the list should be found in the content - assert any( - string in content for string in expected_string - ), f"None of the strings in {expected_string} found in {output_file}" - else: - assert expected_string in content, f"Expected string '{expected_string}' not found in {output_file}" - for not_expected_string in not_expected_strings: - assert not_expected_string not in content, f"Unexpected string '{not_expected_string}' found in {output_file}" - - -def run_disaggregated_test(example_dir, - test_desc, - num_iters=5, - env=None, - cwd=None, - prompt_file="prompts.json", - extra_endpoints_test: Callable[[str], None] = None, - model_path=None): - """Run disaggregated test with given configuration.""" - cleanup_output_files() - run_env = env.copy() - run_env["UCX_TLS"] = "^ib" - - num_ranks, config_file = get_test_config(test_desc, example_dir, - os.path.dirname(__file__)) - - use_ray = mpi_disabled() - if not use_ray: - workers_cmd = [ - 'mpirun', '--allow-run-as-root', '--oversubscribe', '-n', - str(num_ranks), 'trtllm-serve', 'disaggregated_mpi_worker', '-c', - config_file - ] - else: - pytest.skip( - "https://nvbugs/5584607 Ray orchestrator is not supported with NIXL(DEFAULT) cache transceiver backend." - ) - with open(config_file, 'r') as f: - config = yaml.safe_load(f) - - if config['backend'] != "pytorch": - pytest.skip( - "Ray orchestrator is only supported with pytorch backend.") - - extra_config_files = [] - workers_cmds = [] - subprocess.run(['ray', 'start', '--head', '--disable-usage-stats'], - check=True) - - # Generate ctx and gen server worker commands - ctx_extra_config_file = get_extra_llm_config(config['context_servers'], - "ctx", cwd) - extra_config_files.append(ctx_extra_config_file) - workers_cmds.extend( - generate_worker_commands(model_path, config, - config['context_servers'], - ctx_extra_config_file, 'context')) - - gen_extra_config_file = get_extra_llm_config( - config['generation_servers'], "gen", cwd) - extra_config_files.append(gen_extra_config_file) - workers_cmds.extend( - generate_worker_commands(model_path, config, - config['generation_servers'], - gen_extra_config_file, 'generation')) - - server_start_timeout = 1200 - server_cmd = [ - 'trtllm-serve', 'disaggregated', '--server_start_timeout', - str(server_start_timeout), '-c', config_file - ] - server_url = get_disagg_server_url_from_cfg(config_file) - - try: - if not use_ray: - with ( # Start workers - open('output_workers.log', 'w') as output_workers, - popen(workers_cmd, - stdout=output_workers, - stderr=subprocess.STDOUT, - env=run_env, - cwd=cwd) as workers_proc, - # Start server - open('output_disagg.log', 'w') as output_disagg, - popen(server_cmd, - stdout=output_disagg, - stderr=subprocess.STDOUT, - env=run_env, - cwd=cwd) as server_proc): - run_client_tests(example_dir, - config_file, - test_desc, - num_iters, - env, - server_start_timeout, - prompt_file, - extra_endpoints_test, - server_url, - workers_proc, - server_proc, - use_ray=False) - - else: - workers_proc = [] - with contextlib.ExitStack() as stack: - workers_log = stack.enter_context( - open('output_workers.log', 'w')) - - for cmd in workers_cmds: - proc = stack.enter_context( - popen( - cmd, - stdout=workers_log, - stderr=subprocess.STDOUT, - env=run_env, - cwd=cwd, - )) - workers_proc.append(proc) - - output_disagg = stack.enter_context( - open('output_disagg.log', 'w')) - server_proc = stack.enter_context( - popen(server_cmd, - stdout=output_disagg, - 
stderr=subprocess.STDOUT, - env=run_env, - cwd=cwd)) - - if not wait_for_server("localhost", - 8000, - timeout_seconds=server_start_timeout): - raise RuntimeError( - f"Disaggregated server failed to start within {server_start_timeout} seconds" - ) - - run_client_tests(example_dir, - config_file, - test_desc, - num_iters, - env, - server_start_timeout, - prompt_file, - extra_endpoints_test, - server_url, - workers_proc, - server_proc, - use_ray=True) - except Exception: - # Print outputs on error - logger.error("-------- Workers output --------") - with open('output_workers.log', 'r') as f: - logger.error(f.read()) - - logger.error("-------- Disagg server output --------") - with open('output_disagg.log', 'r') as f: - logger.error(f.read()) - raise - finally: - if use_ray: - subprocess.run(['ray', 'stop', '--force'], check=False) - for extra_file in extra_config_files: - if os.path.exists(extra_file): - os.remove(extra_file) - elif 'server_proc' in locals() and 'workers_proc' in locals(): - server_proc.terminate() - workers_proc.terminate() - server_proc.wait() - workers_proc.wait() - - -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_diff_max_tokens(disaggregated_test_root, - disaggregated_example_root, llm_venv, - llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "2_ranks_diff_max_tokens", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory(), - prompt_file="long_prompts.json") - - -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_single_gpu_with_mpirun(disaggregated_test_root, - disaggregated_example_root, - llm_venv, llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "2_ranks", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_single_gpu_with_mpirun_trt_backend( - disaggregated_test_root, disaggregated_example_root, llm_venv, - llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "2_ranks_trt_backend", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_benchmark_gen_only(disaggregated_test_root, - disaggregated_example_root, llm_venv, - llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - 
os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - env = llm_venv._new_env.copy() - env['TRTLLM_DISAGG_BENCHMARK_GEN_ONLY'] = '1' - run_disaggregated_test(disaggregated_example_root, - "gen_only", - env=env, - cwd=llm_venv.get_working_directory()) - - -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_benchmark_gen_only_trt_backend( - disaggregated_test_root, disaggregated_example_root, llm_venv, - llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - env = llm_venv._new_env.copy() - env['TRTLLM_DISAGG_BENCHMARK_GEN_ONLY'] = '1' - run_disaggregated_test(disaggregated_example_root, - "gen_only_trt_backend", - env=env, - cwd=llm_venv.get_working_directory()) - - -@pytest.mark.skip_less_device(4) -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_genbs1(disaggregated_test_root, - disaggregated_example_root, llm_venv, - llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - env = llm_venv._new_env.copy() - env['TRTLLM_DISAGG_BENCHMARK_GEN_ONLY'] = '1' - run_disaggregated_test(disaggregated_example_root, - "gen_only_bs1", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@pytest.mark.skip_less_device(2) -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_multi_gpu_with_mpirun(disaggregated_test_root, - disaggregated_example_root, - llm_venv, llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "4_ranks", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@pytest.mark.skip_less_device(2) -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_multi_gpu_with_mpirun_trt_backend( - disaggregated_test_root, disaggregated_example_root, llm_venv, - llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "4_ranks_trt_backend", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_cuda_graph(disaggregated_test_root, llm_venv, - disaggregated_example_root, llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in 
src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "cuda_graph", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_mixed(disaggregated_test_root, llm_venv, - disaggregated_example_root, llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "mixed", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_overlap(disaggregated_test_root, llm_venv, - disaggregated_example_root, llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "overlap", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_perf_metrics(disaggregated_test_root, llm_venv, - disaggregated_example_root, - llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - def extra_endpoints_test(server_url: str): - import json - import urllib.request - - with urllib.request.urlopen(f"{server_url}/perf_metrics", - timeout=10) as resp: - assert resp.status == 200 - perf_metrics = json.load(resp) - assert len(perf_metrics) > 0 - item = perf_metrics[0] - - # Use helper function to validate all timing metrics comprehensively - validate_timing_metrics(item, "perf_metrics test") - - run_disaggregated_test(disaggregated_example_root, - "perf_metrics", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory(), - extra_endpoints_test=extra_endpoints_test) - - -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_kv_cache_time_output(disaggregated_test_root, llm_venv, - disaggregated_example_root, - llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - output_path = os.path.join(llm_venv.get_working_directory(), "cache_time") - run_disaggregated_test(disaggregated_example_root, - "perf_metrics", - env=llm_venv._new_env - | {"TRTLLM_KVCACHE_TIME_OUTPUT_PATH": output_path}, - cwd=llm_venv.get_working_directory()) - assert os.path.isdir(output_path) - send_file = os.path.join(output_path, "rank_0_send.csv") - recv_file = 
os.path.join(output_path, "rank_1_recv.csv") - assert os.path.exists(send_file) - assert os.path.exists(recv_file) - with open(send_file, "r") as f: - lines = f.readlines() - assert len(lines) > 1 - assert lines[0].startswith( - "RequestID,RequestInfo,Preparation,Preprocess,Transmissions,Postprocess" - ) - assert ",Delay,Duration,Bandwidth(Gbps)" in lines[0] - # get a send sample and match the recv - sample = lines[1].split(',') - assert len(sample) >= 9 - with open(recv_file, "r") as f: - lines = f.readlines() - assert len(lines) > 1 - matched = False - for line in lines: - sample_recv = line.split(',') - if sample_recv[0] == sample[0]: - matched = True - assert float(sample_recv[1]) <= float(sample[1]) - break - assert matched - - -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_trtllm_sampler(disaggregated_test_root, llm_venv, - disaggregated_example_root, - llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "trtllm_sampler", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_load_balance(disaggregated_test_root, llm_venv, - disaggregated_example_root, - llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "load_balance", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_cache_aware_balance(disaggregated_test_root, llm_venv, - disaggregated_example_root, - llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "cache_aware_balance", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_conditional(disaggregated_test_root, llm_venv, - disaggregated_example_root, - llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "conditional", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_ngram(disaggregated_test_root, llm_venv, - disaggregated_example_root, llama_model_root): 
- src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - run_disaggregated_test(disaggregated_example_root, - "ngram", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@pytest.mark.skip_less_device(4) -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_ctxpp2_genpp2(disaggregated_test_root, llm_venv, - disaggregated_example_root, - llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - run_disaggregated_test(disaggregated_example_root, - "ctxpp2_genpp2", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory(), - model_path=llama_model_root) - - -@pytest.mark.skip_less_device(4) -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_ctxtp2_genpp2(disaggregated_test_root, llm_venv, - disaggregated_example_root, - llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - run_disaggregated_test(disaggregated_example_root, - "ctxtp2_genpp2", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory(), - model_path=llama_model_root) - - -@pytest.mark.skip_less_device(4) -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_ctxpp2_gentp2(disaggregated_test_root, llm_venv, - disaggregated_example_root, - llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - run_disaggregated_test(disaggregated_example_root, - "ctxpp2_gentp2", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory(), - model_path=llama_model_root) - - -@pytest.mark.skip_less_device(8) -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_ctxtp2pp2_gentp2pp2(disaggregated_test_root, llm_venv, - disaggregated_example_root, - llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - run_disaggregated_test(disaggregated_example_root, - "ctxtp2pp2_gentp2pp2", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@pytest.mark.skip_less_device(8) -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_ctxpp4_genpp4(disaggregated_test_root, llm_venv, - disaggregated_example_root, - llama_model_root): - src_dst_dict = { - llama_model_root: - 
f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - run_disaggregated_test(disaggregated_example_root, - "ctxpp4_genpp4", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -#tiny llama pp4 will have uneven layer per pp. pp4 -@pytest.mark.skip_less_device(8) -@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], - indirect=True) -def test_disaggregated_ctxpp4_gentp4(disaggregated_test_root, llm_venv, - disaggregated_example_root, - llama_model_root): - src_dst_dict = { - llama_model_root: - f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - run_disaggregated_test(disaggregated_example_root, - "ctxpp4_gentp4", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory(), - model_path=llama_model_root) - - -@skip_no_hopper -@pytest.mark.skip_less_device(4) -@pytest.mark.parametrize("deepseek_v3_model_root", ['DeepSeek-V3-Lite-fp8'], - indirect=True) -def test_disaggregated_deepseek_v3_lite_fp8_mpi(disaggregated_test_root, - disaggregated_example_root, - llm_venv, - deepseek_v3_model_root): - src_dst_dict = { - deepseek_v3_model_root: - f"{llm_venv.get_working_directory()}/DeepSeek-V3-Lite/fp8", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - env = llm_venv._new_env.copy() - env["TRTLLM_USE_MPI_KVCACHE"] = "1" - run_disaggregated_test(disaggregated_example_root, - "deepseek_v3_lite_fp8_mpi", - env=env, - cwd=llm_venv.get_working_directory()) - - -@skip_no_hopper -@pytest.mark.parametrize("deepseek_v3_model_root", ['DeepSeek-V3-Lite-fp8'], - indirect=True) -def test_disaggregated_deepseek_v3_lite_fp8_tp1_single_gpu( - disaggregated_test_root, disaggregated_example_root, llm_venv, - deepseek_v3_model_root): - src_dst_dict = { - deepseek_v3_model_root: - f"{llm_venv.get_working_directory()}/DeepSeek-V3-Lite/fp8", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "deepseek_v3_lite_fp8_tp1", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@skip_no_hopper -@pytest.mark.parametrize("deepseek_v3_model_root", ['DeepSeek-V3-Lite-fp8'], - indirect=True) -def test_disaggregated_deepseek_v3_lite_fp8_tp1_single_gpu_mtp( - disaggregated_test_root, disaggregated_example_root, llm_venv, - deepseek_v3_model_root): - src_dst_dict = { - deepseek_v3_model_root: - f"{llm_venv.get_working_directory()}/DeepSeek-V3-Lite/fp8", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "deepseek_v3_lite_fp8_tp1_mtp", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@pytest.mark.skip_less_device(4) -@skip_no_hopper -@pytest.mark.parametrize("deepseek_v3_model_root", ['DeepSeek-V3-Lite-fp8'], - indirect=True) -def test_disaggregated_deepseek_v3_lite_fp8_ctxpp2_gentp2_one_mtp( 
- disaggregated_test_root, disaggregated_example_root, llm_venv, - deepseek_v3_model_root): - #add one mtp layer, pp rank0 will have 15 layer, pp rank 1 will have 16 layers. - src_dst_dict = { - deepseek_v3_model_root: - f"{llm_venv.get_working_directory()}/DeepSeek-V3-Lite/fp8", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "deepseek_v3_lite_fp8_ctxpp2_gentp2_one_mtp", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory(), - model_path=deepseek_v3_model_root) - - -@skip_no_hopper -@skip_arm -@pytest.mark.skip_less_device(4) -@pytest.mark.parametrize("deepseek_v3_model_root", ['DeepSeek-V3-Lite-fp8'], - indirect=True) -def test_disaggregated_deepseek_v3_lite_fp8_ucx(disaggregated_test_root, - disaggregated_example_root, - llm_venv, - deepseek_v3_model_root): - - src_dst_dict = { - deepseek_v3_model_root: - f"{llm_venv.get_working_directory()}/DeepSeek-V3-Lite/fp8", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - env = llm_venv._new_env.copy() - env["TRTLLM_USE_UCX_KVCACHE"] = "1" - env["UCX_TLS"] = "^ib" - run_disaggregated_test(disaggregated_example_root, - "deepseek_v3_lite_fp8_ucx", - env=env, - cwd=llm_venv.get_working_directory(), - model_path=deepseek_v3_model_root) - - -@skip_no_hopper -@skip_arm -@pytest.mark.parametrize("deepseek_v3_model_root", ['DeepSeek-V3-Lite-fp8'], - indirect=True) -def test_disaggregated_deepseek_v3_lite_fp8_nixl(disaggregated_test_root, - disaggregated_example_root, - llm_venv, - deepseek_v3_model_root): - - src_dst_dict = { - deepseek_v3_model_root: - f"{llm_venv.get_working_directory()}/DeepSeek-V3-Lite/fp8", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - env = llm_venv._new_env.copy() - env["TRTLLM_USE_NIXL_KVCACHE"] = "1" - env["UCX_TLS"] = "^ib" - run_disaggregated_test(disaggregated_example_root, - "deepseek_v3_lite_fp8_nixl", - env=env, - cwd=llm_venv.get_working_directory(), - model_path=deepseek_v3_model_root) - - -@skip_no_hopper -@skip_arm -@pytest.mark.parametrize("deepseek_v3_model_root", ['DeepSeek-V3-Lite-fp8'], - indirect=True) -def test_disaggregated_deepseek_v3_lite_fp8_ucx_tp1_single_gpu( - disaggregated_test_root, disaggregated_example_root, llm_venv, - deepseek_v3_model_root): - src_dst_dict = { - deepseek_v3_model_root: - f"{llm_venv.get_working_directory()}/DeepSeek-V3-Lite/fp8", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - env = llm_venv._new_env.copy() - env["TRTLLM_USE_UCX_KVCACHE"] = "1" - env["UCX_TLS"] = "^ib" - - run_disaggregated_test(disaggregated_example_root, - "deepseek_v3_lite_fp8_tp1", - env=env, - cwd=llm_venv.get_working_directory()) - - -@skip_no_hopper -@pytest.mark.skip_less_device(4) -@pytest.mark.parametrize("deepseek_v3_model_root", ['DeepSeek-V3-Lite-fp8'], - indirect=True) -def test_disaggregated_deepseek_v3_lite_fp8_attention_dp( - disaggregated_test_root, disaggregated_example_root, llm_venv, - deepseek_v3_model_root): - src_dst_dict = { - deepseek_v3_model_root: - f"{llm_venv.get_working_directory()}/DeepSeek-V3-Lite/fp8", - } - - for 
src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "deepseek_v3_lite_fp8_attention_dp", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@skip_no_hopper -@pytest.mark.skip_less_device(4) -@pytest.mark.parametrize("deepseek_v3_model_root", ['DeepSeek-V3-Lite-fp8'], - indirect=True) -def test_disaggregated_deepseek_v3_lite_fp8_attention_dp_overlap( - disaggregated_test_root, llm_venv, disaggregated_example_root, - deepseek_v3_model_root): - src_dst_dict = { - deepseek_v3_model_root: - f"{llm_venv.get_working_directory()}/DeepSeek-V3-Lite/fp8", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "deepseek_v3_lite_fp_8_attention_dp_overlap", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@skip_no_hopper -@pytest.mark.skip_less_device(4) -@pytest.mark.parametrize("deepseek_v3_model_root", ['DeepSeek-V3-Lite-fp8'], - indirect=True) -def test_disaggregated_deepseek_v3_lite_fp8_attention_dp_overlap_cuda_graph( - disaggregated_test_root, disaggregated_example_root, llm_venv, - deepseek_v3_model_root): - src_dst_dict = { - deepseek_v3_model_root: - f"{llm_venv.get_working_directory()}/DeepSeek-V3-Lite/fp8", - } - - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test( - disaggregated_example_root, - "deepseek_v3_lite_fp8_attention_dp_overlap_cuda_graph", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@skip_no_hopper -@pytest.mark.skip_less_device(4) -@pytest.mark.parametrize("deepseek_v3_model_root", ['DeepSeek-V3-Lite-fp8'], - indirect=True) -def test_disaggregated_deepseek_v3_lite_fp8_overlap_cuda_graph( - disaggregated_test_root, disaggregated_example_root, llm_venv, - deepseek_v3_model_root): - src_dst_dict = { - deepseek_v3_model_root: - f"{llm_venv.get_working_directory()}/DeepSeek-V3-Lite/fp8", - } - - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "deepseek_v3_lite_fp8_overlap_cuda_graph", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@skip_no_hopper -@pytest.mark.skip_less_device(4) -@pytest.mark.parametrize("deepseek_v3_model_root", ['DeepSeek-V3-Lite-fp8'], - indirect=True) -def test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one( - disaggregated_test_root, disaggregated_example_root, llm_venv, - deepseek_v3_model_root): - src_dst_dict = { - deepseek_v3_model_root: - f"{llm_venv.get_working_directory()}/DeepSeek-V3-Lite/fp8", - } - - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "deepseek_v3_lite_fp8_attention_dp_one", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@skip_no_hopper -@pytest.mark.skip_less_device(4) -@pytest.mark.parametrize("deepseek_v3_model_root", ['DeepSeek-V3-Lite-fp8'], - indirect=True) -def 
test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one_mtp( - disaggregated_test_root, disaggregated_example_root, llm_venv, - deepseek_v3_model_root): - src_dst_dict = { - deepseek_v3_model_root: - f"{llm_venv.get_working_directory()}/DeepSeek-V3-Lite/fp8", - } - - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "deepseek_v3_lite_fp8_attention_dp_one_mtp", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@skip_no_hopper -@pytest.mark.skip_less_device(4) -@pytest.mark.parametrize("deepseek_v3_model_root", ['DeepSeek-V3-Lite-fp8'], - indirect=True) -def test_disaggregated_deepseek_v3_lite_fp8_tp1_attention_dp_overlap_one_mtp( - disaggregated_test_root, disaggregated_example_root, llm_venv, - deepseek_v3_model_root): - - src_dst_dict = { - deepseek_v3_model_root: - f"{llm_venv.get_working_directory()}/DeepSeek-V3-Lite/fp8", - } - - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test( - disaggregated_example_root, - "deepseek_v3_lite_fp8_tp1_attention_dp_overlap_one_mtp", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory(), - model_path=deepseek_v3_model_root) - - -@skip_no_hopper -@pytest.mark.parametrize("deepseek_v3_model_root", ['DeepSeek-V3-Lite-bf16'], - indirect=True) -def test_disaggregated_deepseek_v3_lite_bf16_cache_aware_balance( - disaggregated_test_root, disaggregated_example_root, llm_venv, - deepseek_v3_model_root): - src_dst_dict = { - deepseek_v3_model_root: - f"{llm_venv.get_working_directory()}/DeepSeek-V3-Lite/bf16", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "deepseek_v3_lite_bf16_cache_aware_balance", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@skip_no_hopper -@pytest.mark.parametrize("deepseek_v3_model_root", ['DeepSeek-V3-Lite-bf16'], - indirect=True) -def test_disaggregated_deepseek_v3_lite_bf16_conditional( - disaggregated_test_root, disaggregated_example_root, llm_venv, - deepseek_v3_model_root): - src_dst_dict = { - deepseek_v3_model_root: - f"{llm_venv.get_working_directory()}/DeepSeek-V3-Lite/bf16", - } - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "deepseek_v3_lite_bf16_conditional", - env=llm_venv._new_env, - cwd=llm_venv.get_working_directory()) - - -@skip_no_hopper -@pytest.mark.parametrize("deepseek_v3_model_root", ['DeepSeek-V3-Lite-fp8'], - indirect=True) -def test_disaggregated_deepseek_v3_lite_fp8_tp1_two_mtp( - disaggregated_test_root, disaggregated_example_root, llm_venv, - deepseek_v3_model_root): - src_dst_dict = { - deepseek_v3_model_root: - f"{llm_venv.get_working_directory()}/DeepSeek-V3-Lite/fp8", - } - - for src, dst in src_dst_dict.items(): - if not os.path.islink(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.symlink(src, dst, target_is_directory=True) - - run_disaggregated_test(disaggregated_example_root, - "deepseek_v3_lite_fp8_tp1_two_mtp", - env=llm_venv._new_env, - 
cwd=llm_venv.get_working_directory()) - - -@pytest.fixture(scope="module") -def benchmark_root(): - llm_root = os.getenv("LLM_ROOT") - return os.path.join(llm_root, "tensorrt_llm", "serve", "scripts") - - -@pytest.fixture(scope="module") -def shared_gpt_path(): - DEFAULT_LLM_MODEL_ROOT = os.path.join("/scratch.trt_llm_data", "llm-models") - LLM_MODELS_ROOT = os.environ.get("LLM_MODELS_ROOT", DEFAULT_LLM_MODEL_ROOT) - return os.path.join(LLM_MODELS_ROOT, "datasets", - "ShareGPT_V3_unfiltered_cleaned_split.json") - - -@pytest.fixture(scope="function") -def benchmark_model_root(request): - models_root = llm_models_root() - if (request.param == "DeepSeek-V3-Lite-fp8"): - model_path = os.path.join(models_root, "DeepSeek-V3-Lite", "fp8") - elif (request.param == "DeepSeek-V3-Lite-bf16"): - model_path = os.path.join(models_root, "DeepSeek-V3-Lite", "bf16") - elif request.param == "llama-v3-8b-hf": - model_path = os.path.join(models_root, "llama-models-v3", "8B") - elif request.param == "llama-3.1-8b-instruct-hf-fp8": - model_path = os.path.join(models_root, "llama-3.1-model", - "Llama-3.1-8B-Instruct-FP8") - else: - raise ValueError(f"Failed to find the model: {request.param}") - return model_path - - -def run_disaggregated_benchmark(example_dir, - config_file, - benchmark_root, - benchmark_model_root, - shared_gpt_path, - env=None, - cwd=None): - """Run disaggregated test with given configuration.""" - run_env = env.copy() - run_env["UCX_TLS"] = "^ib" - num_rank = 2 - workers_cmd = [ - 'mpirun', '--allow-run-as-root', '--oversubscribe', '-n', - str(num_rank), 'trtllm-serve', 'disaggregated_mpi_worker', '-c', - config_file - ] - - server_start_timeout = 900 - server_cmd = [ - 'trtllm-serve', 'disaggregated', '--server_start_timeout', - str(server_start_timeout), '-c', config_file - ] - try: - with ( # Start workers - open('output_workers.log', 'w') as output_workers, - popen(workers_cmd, - stdout=output_workers, - stderr=subprocess.STDOUT, - env=run_env, - cwd=cwd) as workers_proc, - # Start server - open('output_disagg.log', 'w') as output_disagg, - popen(server_cmd, - stdout=output_disagg, - stderr=subprocess.STDOUT, - env=run_env, - cwd=cwd) as server_proc): - # Ensure the sever has started - client_dir = f"{example_dir}/clients" - client_cmd = [ - 'python3', f'{client_dir}/disagg_client.py', '-c', - f'{example_dir}/disagg_config.yaml', '-p', - f'{client_dir}/prompts.json', '--ignore-eos', - '--server-start-timeout', - str(server_start_timeout) - ] - # Warm up - check_call(client_cmd, - env=env, - poll_procs=[workers_proc, server_proc]) - # Start Benchmark - benchmark_script = os.path.join(benchmark_root, - "benchmark_serving.py") - benchmark_cmd = [ - 'python3', - benchmark_script, - '--model', - benchmark_model_root, - '--tokenizer', - benchmark_model_root, - '--dataset-name', - 'random', - '--dataset-path', - shared_gpt_path, - '--random-input-len', - '256', - '--random-output-len', - '64', - '--random-prefix-len', - '0', - '--num-prompts', - '320', - '--max-concurrency', - '32', - '--host', - 'localhost', - '--port', - '8000', - '--ignore-eos', - '--no-test-input', - '--percentile-metrics', - 'e2el,ttft', - ] - # warm up - check_call(benchmark_cmd, env=env) - output = check_output(benchmark_cmd, env=env) - e2el_pattern = r"Median E2EL \(ms\):\s*(\d+\.?\d*)" - ttft_pattern = r"Median TTFT \(ms\):\s*(\d+\.?\d*)" - e2el_match = re.search(e2el_pattern, output) - ttft_match = re.search(ttft_pattern, output) - if e2el_match and ttft_match: - median_e2el = float(e2el_match.group(1)) - median_ttft 
= float(ttft_match.group(1)) - return median_e2el, median_ttft - else: - raise ValueError("No benchmark result found") - - except Exception: - # Print outputs on error - logger.error("-------- Workers output --------") - with open('output_workers.log', 'r') as f: - logger.error(f.read()) - - logger.error("-------- Disagg server output --------") - with open('output_disagg.log', 'r') as f: - logger.error(f.read()) - raise - finally: - server_proc.terminate() - workers_proc.terminate() - server_proc.wait() - workers_proc.wait() - - -def get_config_for_benchmark(model_root, backend): - serve_config = { - "model": model_root, - "hostname": "localhost", - "port": 8000, - "backend": "pytorch", - "context_servers": { - "num_instances": 1, - "max_batch_size": 2, - "max_num_tokens": 384, - "max_seq_len": 384, - "tensor_parallel_size": 1, - "pipeline_parallel_size": 1, - "disable_overlap_scheduler": True, - "cache_transceiver_config": { - "backend": backend, - "max_tokens_in_buffer": 512, - }, - "urls": ["localhost:8001"] - }, - "generation_servers": { - "num_instances": 1, - "tensor_parallel_size": 1, - "pipeline_parallel_size": 1, - "max_batch_size": 2, - "max_num_tokens": 384, - "max_seq_len": 384, - "cache_transceiver_config": { - "backend": backend, - "max_tokens_in_buffer": 512, - }, - "urls": ["localhost:8002"] - } - } - return serve_config - - -@pytest.mark.parametrize("benchmark_model_root", [ - 'DeepSeek-V3-Lite-fp8', 'DeepSeek-V3-Lite-bf16', 'llama-v3-8b-hf', - 'llama-3.1-8b-instruct-hf-fp8' -], - indirect=True) -def test_disaggregated_benchmark_on_diff_backends( - disaggregated_test_root, disaggregated_example_root, llm_venv, - benchmark_model_root, benchmark_root, shared_gpt_path): - if "DeepSeek-V3-Lite" in benchmark_model_root and "fp8" in benchmark_model_root and get_sm_version( - ) != 90: - pytest.skip("The test should only run on Hopper") - nixl_config = get_config_for_benchmark(benchmark_model_root, "NIXL") - ucx_config = get_config_for_benchmark(benchmark_model_root, "UCX") - temp_dir = tempfile.TemporaryDirectory() - nixl_config_path = os.path.join(temp_dir.name, "nixl_config.yaml") - ucx_config_path = os.path.join(temp_dir.name, "ucx_config.yaml") - with open(nixl_config_path, 'w', encoding='utf-8') as f: - yaml.dump(nixl_config, f) - with open(ucx_config_path, 'w', encoding='utf-8') as f: - yaml.dump(ucx_config, f) - - env = llm_venv._new_env.copy() - nixl_e2el, nixl_ttft = run_disaggregated_benchmark( - disaggregated_example_root, - nixl_config_path, - benchmark_root, - benchmark_model_root, - shared_gpt_path, - env=env, - cwd=llm_venv.get_working_directory()) - ucx_e2el, ucx_ttft = run_disaggregated_benchmark( - disaggregated_example_root, - ucx_config_path, - benchmark_root, - benchmark_model_root, - shared_gpt_path, - env=env, - cwd=llm_venv.get_working_directory()) - print(f"Nixl E2EL: {nixl_e2el} ms, UCX E2EL: {ucx_e2el} ms") - print(f"Nixl TTFT: {nixl_ttft} ms, UCX TTFT: {ucx_ttft} ms") - - assert ucx_e2el > 0 and nixl_e2el > 0 and nixl_e2el < 1.05 * ucx_e2el - assert ucx_ttft > 0 and nixl_ttft > 0 and nixl_ttft < 1.05 * ucx_ttft diff --git a/tests/integration/defs/disaggregated/test_disaggregated_benchmark.py b/tests/integration/defs/disaggregated/test_disaggregated_benchmark.py new file mode 100644 index 00000000000..3f36b638003 --- /dev/null +++ b/tests/integration/defs/disaggregated/test_disaggregated_benchmark.py @@ -0,0 +1,255 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import re +import subprocess +import tempfile + +import pytest +import yaml +from defs.conftest import get_sm_version, llm_models_root +from defs.trt_test_alternative import check_call, check_output, popen + +from tensorrt_llm.logger import logger + + +@pytest.fixture(scope="module") +def benchmark_root(): + llm_root = os.getenv("LLM_ROOT") + return os.path.join(llm_root, "tensorrt_llm", "serve", "scripts") + + +@pytest.fixture(scope="module") +def shared_gpt_path(): + DEFAULT_LLM_MODEL_ROOT = os.path.join("/scratch.trt_llm_data", "llm-models") + LLM_MODELS_ROOT = os.environ.get("LLM_MODELS_ROOT", DEFAULT_LLM_MODEL_ROOT) + return os.path.join(LLM_MODELS_ROOT, "datasets", + "ShareGPT_V3_unfiltered_cleaned_split.json") + + +@pytest.fixture(scope="function") +def benchmark_model_root(request): + models_root = llm_models_root() + if (request.param == "DeepSeek-V3-Lite-fp8"): + model_path = os.path.join(models_root, "DeepSeek-V3-Lite", "fp8") + elif (request.param == "DeepSeek-V3-Lite-bf16"): + model_path = os.path.join(models_root, "DeepSeek-V3-Lite", "bf16") + elif request.param == "llama-v3-8b-hf": + model_path = os.path.join(models_root, "llama-models-v3", "8B") + elif request.param == "llama-3.1-8b-instruct-hf-fp8": + model_path = os.path.join(models_root, "llama-3.1-model", + "Llama-3.1-8B-Instruct-FP8") + else: + raise ValueError(f"Failed to find the model: {request.param}") + return model_path + + +def run_disaggregated_benchmark(example_dir, + config_file, + benchmark_root, + benchmark_model_root, + shared_gpt_path, + env=None, + cwd=None): + """Run disaggregated benchmark with given configuration.""" + run_env = env.copy() + run_env["UCX_TLS"] = "^ib" + num_rank = 2 + workers_cmd = [ + 'mpirun', '--allow-run-as-root', '--oversubscribe', '-n', + str(num_rank), 'trtllm-serve', 'disaggregated_mpi_worker', '-c', + config_file + ] + + server_start_timeout = 900 + server_cmd = [ + 'trtllm-serve', 'disaggregated', '--server_start_timeout', + str(server_start_timeout), '-c', config_file + ] + try: + with ( # Start workers + open('output_workers.log', 'w') as output_workers, + popen(workers_cmd, + stdout=output_workers, + stderr=subprocess.STDOUT, + env=run_env, + cwd=cwd) as workers_proc, + # Start server + open('output_disagg.log', 'w') as output_disagg, + popen(server_cmd, + stdout=output_disagg, + stderr=subprocess.STDOUT, + env=run_env, + cwd=cwd) as server_proc): + # Ensure the server has started + client_dir = f"{example_dir}/clients" + client_cmd = [ + 'python3', f'{client_dir}/disagg_client.py', '-c', + f'{example_dir}/disagg_config.yaml', '-p', + f'{client_dir}/prompts.json', '--ignore-eos', + '--server-start-timeout', + str(server_start_timeout) + ] + # Warm up + check_call(client_cmd, + env=env, + poll_procs=[workers_proc, server_proc]) + # Start Benchmark + benchmark_script = os.path.join(benchmark_root, + "benchmark_serving.py") + benchmark_cmd = [ + 'python3', + benchmark_script, + 
'--model', + benchmark_model_root, + '--tokenizer', + benchmark_model_root, + '--dataset-name', + 'random', + '--dataset-path', + shared_gpt_path, + '--random-input-len', + '256', + '--random-output-len', + '64', + '--random-prefix-len', + '0', + '--num-prompts', + '320', + '--max-concurrency', + '32', + '--host', + 'localhost', + '--port', + '8000', + '--ignore-eos', + '--no-test-input', + '--percentile-metrics', + 'e2el,ttft', + ] + # warm up + check_call(benchmark_cmd, env=env) + output = check_output(benchmark_cmd, env=env) + e2el_pattern = r"Median E2EL \(ms\):\s*(\d+\.?\d*)" + ttft_pattern = r"Median TTFT \(ms\):\s*(\d+\.?\d*)" + e2el_match = re.search(e2el_pattern, output) + ttft_match = re.search(ttft_pattern, output) + if e2el_match and ttft_match: + median_e2el = float(e2el_match.group(1)) + median_ttft = float(ttft_match.group(1)) + return median_e2el, median_ttft + else: + raise ValueError("No benchmark result found") + + except Exception: + # Print outputs on error + logger.error("-------- Workers output --------") + with open('output_workers.log', 'r') as f: + logger.error(f.read()) + + logger.error("-------- Disagg server output --------") + with open('output_disagg.log', 'r') as f: + logger.error(f.read()) + raise + finally: + server_proc.terminate() + workers_proc.terminate() + server_proc.wait() + workers_proc.wait() + + +def get_config_for_benchmark(model_root, backend): + """Generate config for benchmark test.""" + serve_config = { + "model": model_root, + "hostname": "localhost", + "port": 8000, + "backend": "pytorch", + "context_servers": { + "num_instances": 1, + "max_batch_size": 2, + "max_num_tokens": 384, + "max_seq_len": 384, + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "disable_overlap_scheduler": True, + "cache_transceiver_config": { + "backend": backend, + "max_tokens_in_buffer": 512, + }, + "urls": ["localhost:8001"] + }, + "generation_servers": { + "num_instances": 1, + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "max_batch_size": 2, + "max_num_tokens": 384, + "max_seq_len": 384, + "cache_transceiver_config": { + "backend": backend, + "max_tokens_in_buffer": 512, + }, + "urls": ["localhost:8002"] + } + } + return serve_config + + +@pytest.mark.parametrize("benchmark_model_root", [ + 'DeepSeek-V3-Lite-fp8', 'DeepSeek-V3-Lite-bf16', 'llama-v3-8b-hf', + 'llama-3.1-8b-instruct-hf-fp8' +], + indirect=True) +def test_disaggregated_benchmark_on_diff_backends( + disaggregated_test_root, disaggregated_example_root, llm_venv, + benchmark_model_root, benchmark_root, shared_gpt_path): + """Benchmark test comparing NIXL vs UCX cache transceiver backends.""" + if "DeepSeek-V3-Lite" in benchmark_model_root and "fp8" in benchmark_model_root and get_sm_version( + ) != 90: + pytest.skip("The test should only run on Hopper") + nixl_config = get_config_for_benchmark(benchmark_model_root, "NIXL") + ucx_config = get_config_for_benchmark(benchmark_model_root, "UCX") + temp_dir = tempfile.TemporaryDirectory() + nixl_config_path = os.path.join(temp_dir.name, "nixl_config.yaml") + ucx_config_path = os.path.join(temp_dir.name, "ucx_config.yaml") + with open(nixl_config_path, 'w', encoding='utf-8') as f: + yaml.dump(nixl_config, f) + with open(ucx_config_path, 'w', encoding='utf-8') as f: + yaml.dump(ucx_config, f) + + env = llm_venv._new_env.copy() + nixl_e2el, nixl_ttft = run_disaggregated_benchmark( + disaggregated_example_root, + nixl_config_path, + benchmark_root, + benchmark_model_root, + shared_gpt_path, + env=env, + 
cwd=llm_venv.get_working_directory())
+    ucx_e2el, ucx_ttft = run_disaggregated_benchmark(
+        disaggregated_example_root,
+        ucx_config_path,
+        benchmark_root,
+        benchmark_model_root,
+        shared_gpt_path,
+        env=env,
+        cwd=llm_venv.get_working_directory())
+    print(f"Nixl E2EL: {nixl_e2el} ms, UCX E2EL: {ucx_e2el} ms")
+    print(f"Nixl TTFT: {nixl_ttft} ms, UCX TTFT: {ucx_ttft} ms")
+
+    assert ucx_e2el > 0 and nixl_e2el > 0 and nixl_e2el < 1.05 * ucx_e2el
+    assert ucx_ttft > 0 and nixl_ttft > 0 and nixl_ttft < 1.05 * ucx_ttft
diff --git a/tests/integration/defs/disaggregated/test_disaggregated_parametrized.py b/tests/integration/defs/disaggregated/test_disaggregated_parametrized.py
new file mode 100644
index 00000000000..217f7a37620
--- /dev/null
+++ b/tests/integration/defs/disaggregated/test_disaggregated_parametrized.py
@@ -0,0 +1,1464 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from copy import deepcopy
+from dataclasses import dataclass, field
+from typing import Dict, Optional
+
+import pytest
+import yaml
+from defs.conftest import skip_arm, skip_no_hopper
+from defs.trt_test_alternative import check_call
+
+
+# Utility functions for disaggregated tests
+def cleanup_output_files():
+    """Clean up output files from previous runs."""
+    for file in ['output.json', 'output_streaming.json']:
+        try:
+            os.remove(file)
+        except FileNotFoundError:
+            pass
+
+
+def get_disagg_server_url_from_cfg(config_file: str) -> str:
+    """Extract server URL from configuration file.
+
+    Args:
+        config_file: Path to the YAML configuration file.
+
+    Returns:
+        Server URL in format "http://hostname:port"
+    """
+    with open(config_file, 'r') as file:
+        config = yaml.safe_load(file)
+    server_host = config.get('hostname', 'localhost')
+    server_port = config.get('port', 8000)
+    return f"http://{server_host}:{server_port}"
+
+
+def validate_timing_metrics(perf_metrics_item, request_context=""):
+    """
+    Helper function to validate timing metrics relationships.
+ + Args: + perf_metrics_item: A single performance metrics item from the /perf_metrics endpoint + request_context: String context for error messages (e.g., "request 1", "streaming") + """ + # Validate basic structure + required_keys = [ + "ctx_server", "gen_server", "ctx_perf_metrics", "gen_perf_metrics", + "disagg_server_arrival_time", "disagg_server_first_token_time" + ] + for key in required_keys: + assert key in perf_metrics_item, f"Missing key: {key} in {request_context}" + + assert perf_metrics_item["ctx_perf_metrics"][ + "ctx_request_id"] == perf_metrics_item["gen_perf_metrics"][ + "ctx_request_id"] + + # Extract timing metrics + ctx_metrics = perf_metrics_item["ctx_perf_metrics"]["perf_metrics"][ + "timing_metrics"] + gen_metrics = perf_metrics_item["gen_perf_metrics"]["perf_metrics"][ + "timing_metrics"] + disagg_arrival = perf_metrics_item["disagg_server_arrival_time"] + disagg_first_token = perf_metrics_item["disagg_server_first_token_time"] + + # Validate disaggregated server timing metrics + assert disagg_arrival is not None, f"disagg_server_arrival_time is None in {request_context}" + assert disagg_first_token is not None, f"disagg_server_first_token_time is None in {request_context}" + assert isinstance( + disagg_arrival, + (int, float + )), f"disagg_server_arrival_time is not numeric in {request_context}" + assert isinstance( + disagg_first_token, (int, float) + ), f"disagg_server_first_token_time is not numeric in {request_context}" + assert disagg_arrival > 0, f"disagg_server_arrival_time is not positive in {request_context}" + assert disagg_first_token > 0, f"disagg_server_first_token_time is not positive in {request_context}" + assert disagg_arrival <= disagg_first_token, f"disagg_server_arrival_time > disagg_server_first_token_time in {request_context}" + + # Validate server-level timing metrics for context server + ctx_server_arrival = ctx_metrics.get("server_arrival_time") + ctx_server_first_token = ctx_metrics.get("server_first_token_time") + assert ctx_server_arrival is not None, f"ctx server_arrival_time is None in {request_context}" + assert ctx_server_first_token is not None, f"ctx server_first_token_time is None in {request_context}" + assert isinstance( + ctx_server_arrival, + (int, + float)), f"ctx server_arrival_time is not numeric in {request_context}" + assert isinstance( + ctx_server_first_token, + (int, float + )), f"ctx server_first_token_time is not numeric in {request_context}" + assert ctx_server_arrival <= ctx_server_first_token, f"ctx server_arrival_time > server_first_token_time in {request_context}" + assert ctx_metrics["last_token_time"] - ctx_server_first_token < 1e-3 + + # Validate server-level timing metrics for generation server + gen_server_arrival = gen_metrics.get("server_arrival_time") + gen_server_first_token = gen_metrics.get("server_first_token_time") + assert gen_server_arrival is not None, f"gen server_arrival_time is None in {request_context}" + assert gen_server_first_token is not None, f"gen server_first_token_time is None in {request_context}" + assert isinstance( + gen_server_arrival, + (int, + float)), f"gen server_arrival_time is not numeric in {request_context}" + assert isinstance( + gen_server_first_token, + (int, float + )), f"gen server_first_token_time is not numeric in {request_context}" + assert gen_server_arrival <= gen_server_first_token, f"gen server_arrival_time > server_first_token_time in {request_context}" + + # Validate timing relationships between different levels + # Disaggregated server should receive 
request before individual servers + assert disagg_arrival <= ctx_server_arrival, f"disagg_arrival > ctx_server_arrival in {request_context}" + assert disagg_arrival <= gen_server_arrival, f"disagg_arrival > gen_server_arrival in {request_context}" + + # Context should complete before generation starts + assert ctx_server_first_token <= gen_server_arrival, f"ctx_server_first_token > gen_server_arrival in {request_context}" + + # Validate internal timing consistency + ctx_arrival_time = ctx_metrics["arrival_time"] + ctx_first_token_time = ctx_metrics["first_token_time"] + gen_arrival_time = gen_metrics["arrival_time"] + gen_first_token_time = gen_metrics["first_token_time"] + + assert ctx_arrival_time <= ctx_first_token_time, f"ctx arrival_time > first_token_time in {request_context}" + assert gen_arrival_time <= gen_first_token_time, f"gen arrival_time > first_token_time in {request_context}" + + # Test KV cache transfer timing (if present) + if "kv_cache_transfer_start" in gen_metrics and "kv_cache_transfer_end" in gen_metrics: + kv_start = gen_metrics["kv_cache_transfer_start"] + kv_end = gen_metrics["kv_cache_transfer_end"] + assert gen_metrics["kv_cache_size"] > 0 + assert kv_start <= kv_end, f"kv_cache_transfer_start > kv_cache_transfer_end in {request_context}" + assert gen_arrival_time <= kv_start, f"gen_arrival_time > kv_cache_transfer_start in {request_context}" + assert kv_end <= gen_metrics[ + "first_scheduled_time"], f"kv_cache_transfer_end > first_scheduled_time in {request_context}" + + return True + + +def run_client_tests(example_dir, + config_file, + test_desc, + num_iters, + env, + server_start_timeout, + prompt_file, + extra_endpoints_test, + server_url, + workers_proc, + server_proc, + use_ray=False): + """Run client tests against the disaggregated server. 
+ + Args: + example_dir: Path to the examples directory + config_file: Path to the configuration file + test_desc: Test description/name + num_iters: Number of iterations to run + env: Environment variables + server_start_timeout: Timeout for server startup + prompt_file: Name of the prompt file to use + extra_endpoints_test: Optional callback for extra endpoint tests + server_url: URL of the disaggregated server + workers_proc: Worker process(es) + server_proc: Server process + use_ray: Whether Ray orchestrator is being used + """ + client_dir = f"{example_dir}/clients" + for _ in range(num_iters): + client_cmd = [ + 'python3', f'{client_dir}/disagg_client.py', '-c', f'{config_file}', + '-p', f'{client_dir}/{prompt_file}', '--ignore-eos', + '--server-start-timeout', + str(server_start_timeout) + ] + if prompt_file == "long_prompts.json": + # Use max_tokens 4 for long prompts to reduce test time + client_cmd.extend(['--max-tokens', '4']) + + # Prepare poll processes + worker_processes = [] + if use_ray: + for proc_cm in workers_proc: + worker_processes.append(proc_cm.__enter__()) + else: + worker_processes = [workers_proc] + + poll_procs = worker_processes + [server_proc] + check_call(client_cmd, env=env, poll_procs=poll_procs) + + # Streaming client run + streaming_client_cmd = client_cmd + [ + '--streaming', '-o', 'output_streaming.json' + ] + check_call(streaming_client_cmd, env=env, poll_procs=poll_procs) + + # Run the chat completion endpoint test only for TinyLlama + if test_desc == "overlap" or test_desc == "trtllm_sampler": + chat_client_cmd = client_cmd + [ + '-e', 'chat', '-o', 'output_chat.json' + ] + check_call(chat_client_cmd, env=env, poll_procs=poll_procs) + + streaming_chat_client_cmd = chat_client_cmd + [ + '--streaming', '-o', 'output_streaming_chat.json' + ] + check_call(streaming_chat_client_cmd, + env=env, + poll_procs=poll_procs) + + # Skip output verification for long prompts test + if prompt_file == "long_prompts.json": + continue + + if extra_endpoints_test is not None: + extra_endpoints_test(server_url) + + # Verify outputs + not_expected_strings = ["Berlin Berlin"] + + output_files = ['output.json', 'output_streaming.json'] + if test_desc == "overlap" or test_desc == "trtllm_sampler": + # Disable streaming chat completion for overlap test + # due to bug + output_files.extend(['output_chat.json']) + + if test_desc.startswith("gen_only"): + continue + + for output_file in output_files: + with open(output_file, 'r') as f: + content = f.read() + if "ds_v3_lite" in test_desc or output_file == "output_chat.json": + expected_strings = [ + "Berlin", ["Asyncio is a", "Asyncio module in"] + ] + else: + expected_strings = [ + "The capital of Germany is Berlin", + "Asyncio is a Python library" + ] + for expected_string in expected_strings: + if isinstance(expected_string, list): + # At least one of the strings in the list should be found in the content + assert any( + string in content for string in expected_string + ), f"None of the strings in {expected_string} found in {output_file}" + else: + assert expected_string in content, f"Expected string '{expected_string}' not found in {output_file}" + for not_expected_string in not_expected_strings: + assert not_expected_string not in content, f"Unexpected string '{not_expected_string}' found in {output_file}" + + +@dataclass +class DisaggregatedTestConfig: + """Complete configuration for a disaggregated test.""" + test_name: str + model_root: str + + # Global config + global_config: dict = field(default_factory=dict) + # Ctx 
config + ctx_config: dict = field(default_factory=dict) + # Gen config + gen_config: dict = field(default_factory=dict) + + # Test specific settings + skip_device_count: Optional[int] = None + skip_hopper: bool = False + skip_arm_arch: bool = False + env_vars: Optional[Dict[str, str]] = None + prompt_file: str = "prompts.json" + num_iters: int = 5 + extra_validation: Optional[ + str] = None # Special validation type: 'perf_metrics', 'kv_cache_time' + + @staticmethod + def _deep_merge_dicts(base: dict, override: dict) -> dict: + """Deep merge two dictionaries. + + Args: + base: Base dictionary to start from + override: Dictionary with values to override/add + + Returns: + New dictionary with merged values + """ + result = deepcopy(base) + for key, value in override.items(): + if key in result and isinstance(result[key], dict) and isinstance( + value, dict): + result[key] = DisaggregatedTestConfig._deep_merge_dicts( + result[key], value) + else: + result[key] = deepcopy(value) + return result + + @classmethod + def from_base( + cls, + base: 'DisaggregatedTestConfig', + test_name: str, + model_root: Optional[str] = None, + global_config: Optional[dict] = None, + ctx_config: Optional[dict] = None, + gen_config: Optional[dict] = None, + skip_device_count: Optional[int] = None, + skip_hopper: Optional[bool] = None, + skip_arm_arch: Optional[bool] = None, + env_vars: Optional[Dict[str, str]] = None, + prompt_file: Optional[str] = None, + num_iters: Optional[int] = None, + extra_validation: Optional[str] = None + ) -> 'DisaggregatedTestConfig': + """Create a new config based on an existing one with selective overrides. + + Args: + base: Base configuration to inherit from + test_name: Name for the new test (required) + model_root: Override model root (if None, uses base.model_root) + global_config: Dictionary to merge into base global_config + ctx_config: Dictionary to merge into base ctx_config + gen_config: Dictionary to merge into base gen_config + skip_device_count: Override skip_device_count (if None, uses base value) + skip_hopper: Override skip_hopper (if None, uses base value) + skip_arm_arch: Override skip_arm_arch (if None, uses base value) + env_vars: Override or merge with base env_vars + prompt_file: Override prompt_file (if None, uses base value) + num_iters: Override num_iters (if None, uses base value) + extra_validation: Override extra_validation (if None, uses base value) + + Returns: + New DisaggregatedTestConfig instance + """ + # Deep copy base configs + new_global_config = deepcopy(base.global_config) + new_ctx_config = deepcopy(base.ctx_config) + new_gen_config = deepcopy(base.gen_config) + new_env_vars = deepcopy(base.env_vars) if base.env_vars else None + + # Merge provided overrides + + # Remove any parameters from global_config that are already specified in ctx_config or gen_config + for key in list(new_global_config.keys()): + if (ctx_config is not None + and key in ctx_config) or (gen_config is not None + and key in gen_config): + new_global_config.pop(key, None) + + if global_config: + new_global_config = cls._deep_merge_dicts(new_global_config, + global_config) + if ctx_config: + new_ctx_config = cls._deep_merge_dicts(new_ctx_config, ctx_config) + if gen_config: + new_gen_config = cls._deep_merge_dicts(new_gen_config, gen_config) + if env_vars: + if new_env_vars: + new_env_vars = {**new_env_vars, **env_vars} + else: + new_env_vars = env_vars.copy() + + return cls( + test_name=test_name, + model_root=model_root + if model_root is not None else base.model_root, + 
global_config=new_global_config, + ctx_config=new_ctx_config, + gen_config=new_gen_config, + skip_device_count=skip_device_count + if skip_device_count is not None else base.skip_device_count, + skip_hopper=skip_hopper + if skip_hopper is not None else base.skip_hopper, + skip_arm_arch=skip_arm_arch + if skip_arm_arch is not None else base.skip_arm_arch, + env_vars=new_env_vars, + prompt_file=prompt_file + if prompt_file is not None else base.prompt_file, + num_iters=num_iters if num_iters is not None else base.num_iters, + extra_validation=extra_validation + if extra_validation is not None else base.extra_validation, + ) + + def get_num_ranks(self) -> int: + """Calculate total number of ranks needed.""" + ctx_tp = self.ctx_config.get('tensor_parallel_size', 1) + ctx_pp = self.ctx_config.get('pipeline_parallel_size', 1) + ctx_num_instances = self.ctx_config.get('num_instances', 1) + + gen_tp = self.gen_config.get('tensor_parallel_size', 1) + gen_pp = self.gen_config.get('pipeline_parallel_size', 1) + gen_num_instances = self.gen_config.get('num_instances', 1) + + ctx_ranks = ctx_tp * ctx_pp * ctx_num_instances + gen_ranks = gen_tp * gen_pp * gen_num_instances + return ctx_ranks + gen_ranks + + def generate_yaml_config(self, temp_dir: str) -> str: + """Generate a yaml config file from the parameters.""" + config = self.global_config.copy() + config["model"] = self.model_root + + # Add default cache_transceiver_config if not present + if "cache_transceiver_config" not in config: + config["cache_transceiver_config"] = {"backend": "DEFAULT"} + + if "backend" in config and config["backend"] == "trt": + # Use pop() so TRT configs that never set these PyTorch-only keys don't raise KeyError + config.pop("disable_overlap_scheduler", None) + config.pop("cuda_graph_config", None) + + # Build context servers config + context_servers = self.ctx_config.copy() + + ctx_num_instances = self.ctx_config.get('num_instances', 1) + context_servers["num_instances"] = ctx_num_instances + + ctx_urls = [] + base_port = 8001 + for i in range(ctx_num_instances): + ctx_urls.append(f"localhost:{base_port + i}") + context_servers["urls"] = ctx_urls + config["context_servers"] = context_servers + + # Build generation servers config + gen_servers = self.gen_config.copy() + + gen_num_instances = self.gen_config.get('num_instances', 1) + gen_servers["num_instances"] = gen_num_instances + + gen_urls = [] + base_port = 8001 + ctx_num_instances + for i in range(gen_num_instances): + gen_urls.append(f"localhost:{base_port + i}") + gen_servers["urls"] = gen_urls + + # Special handling for gen-only mode + if ctx_num_instances == 0 and "backend" in config and config[ + "backend"] == "pytorch": + gen_servers["print_iter_log"] = True + + config["generation_servers"] = gen_servers + + # Write to temporary file + config_path = os.path.join(temp_dir, f"{self.test_name}.yaml") + with open(config_path, 'w') as f: + yaml.dump(config, f, default_flow_style=False) + + return config_path + + +# Define all test configurations +# +# Usage: You can create test configs from scratch or use from_base() to inherit from existing configs: +# +# Example 1 - Create base config: +# base_config = DisaggregatedTestConfig( +# test_name="base", +# model_root="TinyLlama/TinyLlama-1.1B-Chat-v1.0", +# global_config={"backend": "pytorch"} +# ) +# +# Example 2 - Create variation with different backend: +# trt_config = DisaggregatedTestConfig.from_base( +# base_config, +# test_name="trt_variant", +# global_config={"backend": "trt"} # This merges/overrides into base +# ) +# +# Example 3 - Create variation with additional nested config: +# perf_config = 
DisaggregatedTestConfig.from_base( +# base_config, +# test_name="perf_variant", +# ctx_config={"return_perf_metrics": True}, # Merges with base ctx_config +# extra_validation="perf_metrics" +# ) + +# Store some base configs for reuse +_tiny_llama_cfg = DisaggregatedTestConfig( + test_name="2_ranks", + model_root="TinyLlama/TinyLlama-1.1B-Chat-v1.0", + global_config={ + "backend": "pytorch", + "kv_cache_config": { + "free_gpu_memory_fraction": 0.2, + "enable_partial_reuse": False + }, + "disable_overlap_scheduler": True, + "cuda_graph_config": None, + }) + +_tiny_llama_multi_gpus_cfg = DisaggregatedTestConfig.from_base( + _tiny_llama_cfg, + test_name="multi_gpus", + global_config={ + "kv_cache_config": { + "free_gpu_memory_fraction": 0.2, + "enable_partial_reuse": False, + "enable_block_reuse": False, + }, + }, + ctx_config={ + "max_batch_size": 1, + "max_num_tokens": 3000, + "max_seq_len": 4096, + }, + gen_config={ + "max_batch_size": 256, + "max_num_tokens": 4096, + "max_seq_len": 4096, + }, + skip_device_count=4, +) + +_ds_v3_lite_tp1_cfg = DisaggregatedTestConfig( + test_name="ds_v3_lite_tp1", + model_root="DeepSeek-V3-Lite/fp8", + global_config={ + "backend": "pytorch", + "free_gpu_memory_fraction": 0.1, + }, + ctx_config={ + "disable_overlap_scheduler": True, + }, + gen_config={ + "disable_overlap_scheduler": False, + }, + skip_hopper=True, +) + +_ds_v3_lite_4_gpus_cfg = DisaggregatedTestConfig( + test_name="ds_v3_lite", + model_root="DeepSeek-V3-Lite/fp8", + global_config={ + "backend": "pytorch", + "free_gpu_memory_fraction": 0.7, + }, + ctx_config={ + "tensor_parallel_size": 2, + "disable_overlap_scheduler": True, + }, + gen_config={ + "tensor_parallel_size": 2, + "disable_overlap_scheduler": False, + }, + skip_device_count=4, + skip_hopper=True, +) + +TEST_CONFIGS = [ + # TinyLlama tests - basic + _tiny_llama_cfg, + # Performance metrics variant - extends base with metrics config + DisaggregatedTestConfig.from_base( + _tiny_llama_cfg, + test_name="perf_metrics", + global_config={"perf_metrics_max_requests": 1000}, + ctx_config={ + "return_perf_metrics": True, + "perf_metrics_max_requests": 1000 + }, + gen_config={ + "return_perf_metrics": True, + "perf_metrics_max_requests": 1000 + }, + extra_validation="perf_metrics", + ), + # KV cache time variant - same as perf_metrics but different validation + DisaggregatedTestConfig.from_base( + _tiny_llama_cfg, + test_name="kv_cache_time_output", + global_config={"perf_metrics_max_requests": 1000}, + ctx_config={ + "return_perf_metrics": True, + "perf_metrics_max_requests": 1000 + }, + gen_config={ + "return_perf_metrics": True, + "perf_metrics_max_requests": 1000 + }, + extra_validation="kv_cache_time", + ), + # Create TRT variant from base - only need to override backend + DisaggregatedTestConfig(test_name="trt_backend", + model_root="TinyLlama/TinyLlama-1.1B-Chat-v1.0", + global_config={ + "backend": "trt", + "kv_cache_config": { + "free_gpu_memory_fraction": 0.2, + "enable_partial_reuse": False + }, + }), + DisaggregatedTestConfig.from_base( + _tiny_llama_cfg, + test_name="diff_max_tokens", + prompt_file="long_prompts.json", + ctx_config={ + "max_num_tokens": 512, + "max_batch_size": 64 + }, + gen_config={ + "max_num_tokens": 256, + "max_batch_size": 32 + }, + ), + + # TinyLlama - CUDA graph + DisaggregatedTestConfig.from_base( + _tiny_llama_cfg, + test_name="cuda_graph", + ctx_config={"cuda_graph_config": { + "batch_sizes": [1, 3000] + }}, + gen_config={ + "cuda_graph_config": { + "enable_padding": True, + "batch_sizes": [1, 4, 8, 16, 
24, 32] + }, + "max_batch_size": 256, + "max_num_tokens": 4096, + "max_seq_len": 4096, + }, + ), + + # TinyLlama - overlap + DisaggregatedTestConfig.from_base( + _tiny_llama_cfg, + test_name="overlap", + ctx_config={ + "max_num_tokens": 3000, + "max_seq_len": 4096, + "disable_overlap_scheduler": True, + }, + gen_config={ + "max_batch_size": 256, + "max_num_tokens": 4096, + "max_seq_len": 4096, + "disable_overlap_scheduler": False, + }, + ), + + # TinyLlama - mixed + DisaggregatedTestConfig.from_base( + _tiny_llama_cfg, + test_name="mixed", + gen_config={"num_instances": 2}, + ), + + # TinyLlama - trtllm sampler + DisaggregatedTestConfig.from_base( + _tiny_llama_cfg, + test_name="trtllm_sampler", + ctx_config={ + "max_batch_size": 1, + "max_num_tokens": 3000, + "max_seq_len": 4096, + "sampler_type": "TRTLLMSampler", + "disable_overlap_scheduler": True, + }, + gen_config={ + "max_batch_size": 256, + "max_num_tokens": 4096, + "max_seq_len": 4096, + "sampler_type": "TRTLLMSampler", + "disable_overlap_scheduler": False, + }, + ), + + # TinyLlama - load balance + DisaggregatedTestConfig.from_base( + _tiny_llama_cfg, + test_name="load_balance", + global_config={ + "kv_cache_config": { + "free_gpu_memory_fraction": 0.15, + "enable_partial_reuse": False + }, + }, + ctx_config={ + "num_instances": 2, + "router": { + "type": "load_balancing", + "use_tokens": True + }, + "max_num_tokens": 3000, + "max_seq_len": 4096, + "disable_overlap_scheduler": True, + }, + gen_config={ + "num_instances": 2, + "router": { + "type": "load_balancing", + "use_tokens": False + }, + "max_batch_size": 256, + "max_num_tokens": 4096, + "max_seq_len": 4096, + "disable_overlap_scheduler": False, + }, + ), + + # TinyLlama - cache aware balance + DisaggregatedTestConfig.from_base( + _tiny_llama_cfg, + test_name="cache_aware_balance", + global_config={ + "free_gpu_memory_fraction": 0.1, + "enable_autotuner": False, + "kv_cache_config": { + "enable_block_reuse": True, + "enable_partial_reuse": False, + "event_buffer_max_size": 1024, + "free_gpu_memory_fraction": 0.1 + }, + }, + ctx_config={ + "num_instances": 2, + "router": { + "type": "kv_cache_aware" + }, + "max_batch_size": 16, + "max_num_tokens": 3000, + "max_seq_len": 4096, + }, + gen_config={ + "num_instances": 2, + "router": { + "type": "kv_cache_aware" + }, + "max_batch_size": 256, + "max_num_tokens": 4096, + "max_seq_len": 4096, + }, + ), + + # TinyLlama - conditional + DisaggregatedTestConfig.from_base( + _tiny_llama_cfg, + test_name="conditional", + model_root="TinyLlama/TinyLlama-1.1B-Chat-v1.0", + global_config={ + "free_gpu_memory_fraction": 0.15, + "conditional_disagg_config": { + "max_local_prefill_length": 100 + }, + "enable_autotuner": False, + "kv_cache_config": { + "enable_block_reuse": True, + "enable_partial_reuse": True, + "event_buffer_max_size": 1024, + "free_gpu_memory_fraction": 0.15 + }, + }, + gen_config={ + "router": { + "type": "kv_cache_aware" + }, + }, + ), + + # TinyLlama - ngram + DisaggregatedTestConfig.from_base( + _tiny_llama_cfg, + test_name="ngram", + global_config={ + "free_gpu_memory_fraction": 0.1, + }, + gen_config={ + "speculative_config": { + "decoding_type": "NGram", + "max_draft_len": 4, + "max_matching_ngram_size": 4, + "is_keep_all": True, + "is_use_oldest": True, + "is_public_pool": True + }, + }, + ), + DisaggregatedTestConfig( + test_name="gen_only_bs1", + model_root="TinyLlama/TinyLlama-1.1B-Chat-v1.0", + env_vars={"TRTLLM_DISAGG_BENCHMARK_GEN_ONLY": "1"}, + global_config={ + "backend": "pytorch", + "cuda_graph_config": None, 
+ "kv_cache_config": { + "free_gpu_memory_fraction": 0.2, + "enable_partial_reuse": False, + }, + "enable_attention_dp": True, + }, + ctx_config={ + "tensor_parallel_size": 2, + "max_batch_size": 1, + "max_num_tokens": 3000, + "max_seq_len": 4096, + "disable_overlap_scheduler": True, + }, + gen_config={ + "tensor_parallel_size": 2, + "max_batch_size": 1, + "max_num_tokens": 4096, + "max_seq_len": 4096, + "disable_overlap_scheduler": False, + }, + skip_device_count=4, + ), + + # TinyLlama - TP variations + DisaggregatedTestConfig.from_base( + _tiny_llama_multi_gpus_cfg, + test_name="1ctx_tp2pp1_2gen_tp1pp1", + ctx_config={ + "tensor_parallel_size": 2, + }, + gen_config={ + "num_instances": 2, + }, + skip_device_count=4, + ), + DisaggregatedTestConfig.from_base( + _tiny_llama_multi_gpus_cfg, + test_name="1ctx_tp2pp1_2gen_tp1pp1_trt", + global_config={ + "backend": "trt", + }, + ctx_config={ + "tensor_parallel_size": 2, + }, + gen_config={ + "num_instances": 2, + }, + skip_device_count=4, + ), + DisaggregatedTestConfig.from_base( + _tiny_llama_multi_gpus_cfg, + test_name="1ctx_tp1pp2_1gen_tp1pp2", + ctx_config={ + "pipeline_parallel_size": 2, + }, + gen_config={ + "pipeline_parallel_size": 2, + }, + skip_device_count=4, + ), + DisaggregatedTestConfig.from_base( + _tiny_llama_multi_gpus_cfg, + test_name="1ctx_tp2pp1_1gen_tp1pp2", + ctx_config={ + "tensor_parallel_size": 2, + }, + gen_config={ + "pipeline_parallel_size": 2, + }, + skip_device_count=4, + ), + DisaggregatedTestConfig.from_base( + _tiny_llama_multi_gpus_cfg, + test_name="1ctx_tp1pp2_1gen_tp2pp1", + ctx_config={ + "pipeline_parallel_size": 2, + }, + gen_config={ + "tensor_parallel_size": 2, + }, + skip_device_count=4, + ), + DisaggregatedTestConfig.from_base( + _tiny_llama_multi_gpus_cfg, + test_name="1ctx_tp2pp2_1gen_tp2pp2", + ctx_config={ + "tensor_parallel_size": 2, + "pipeline_parallel_size": 2, + }, + gen_config={ + "tensor_parallel_size": 2, + "pipeline_parallel_size": 2, + }, + skip_device_count=8, + ), + DisaggregatedTestConfig.from_base( + _tiny_llama_multi_gpus_cfg, + test_name="1ctx_tp1pp4_1gen_tp1pp4", + ctx_config={ + "pipeline_parallel_size": 4, + }, + gen_config={ + "pipeline_parallel_size": 4, + }, + skip_device_count=8, + ), + DisaggregatedTestConfig.from_base( + _tiny_llama_multi_gpus_cfg, + test_name="1ctx_tp1pp4_1gen_tp4pp1", + ctx_config={ + "pipeline_parallel_size": 4, + }, + gen_config={ + "tensor_parallel_size": 4, + }, + skip_device_count=8, + ), + # DeepSeek V3 Lite tests + + # TP1 tests + _ds_v3_lite_tp1_cfg, + DisaggregatedTestConfig.from_base( + _ds_v3_lite_tp1_cfg, + test_name="ds_v3_lite_tp1_mtp", + global_config={ + "speculative_config": { + "decoding_type": "MTP", + "num_nextn_predict_layers": 1 + }, + }, + ctx_config={ + "enable_attention_dp": True, + }, + ), + DisaggregatedTestConfig.from_base( + _ds_v3_lite_tp1_cfg, + test_name="ds_v3_lite_tp1_mtp_adp_overlap", + global_config={ + "speculative_config": { + "decoding_type": "MTP", + "num_nextn_predict_layers": 1 + }, + "enable_attention_dp": True, + }, + ctx_config={ + "disable_overlap_scheduler": True, + }, + gen_config={ + "disable_overlap_scheduler": False, + }, + ), + DisaggregatedTestConfig.from_base( + _ds_v3_lite_tp1_cfg, + test_name="ds_v3_lite_tp1_mtp2", + global_config={ + "speculative_config": { + "decoding_type": "MTP", + "num_nextn_predict_layers": 2 + }, + }, + ctx_config={ + "enable_attention_dp": True, + }, + ), + DisaggregatedTestConfig.from_base( + _ds_v3_lite_tp1_cfg, + test_name="ds_v3_lite_tp1_cache_aware_balance", + 
global_config={ + "enable_autotuner": False, + "kv_cache_config": { + "enable_block_reuse": True + } + }, + ctx_config={ + "num_instances": 2, + "router": { + "type": "kv_cache_aware" + }, + }, + gen_config={ + "num_instances": 2, + "router": { + "type": "kv_cache_aware" + }, + }, + skip_hopper=True, + ), + DisaggregatedTestConfig.from_base( + _ds_v3_lite_tp1_cfg, + test_name="ds_v3_lite_tp1_conditional", + global_config={ + "enable_autotuner": False, + "conditional_disagg_config": { + "enable_conditional_generation": True + }, + "kv_cache_config": { + "event_buffer_max_size": 1024, + "free_gpu_memory_fraction": 0.15, + }, + }, + ctx_config={ + "router": { + "type": "kv_cache_aware" + }, + }, + gen_config={ + "router": { + "type": "kv_cache_aware" + }, + }, + ), + + # 4 ranks different backends + DisaggregatedTestConfig.from_base( + _ds_v3_lite_4_gpus_cfg, + test_name="ds_v3_lite_mpi", + global_config={ + "cache_transceiver_config": { + "backend": "MPI", + }, + }, + skip_arm_arch=True, + env_vars={ + "TRTLLM_USE_MPI_KVCACHE": "1", + }, + ), + DisaggregatedTestConfig.from_base( + _ds_v3_lite_4_gpus_cfg, + test_name="ds_v3_lite_ucx", + global_config={ + "cache_transceiver_config": { + "backend": "UCX", + }, + }, + skip_arm_arch=True, + env_vars={ + "TRTLLM_USE_UCX_KVCACHE": "1", + "UCX_TLS": "^ib" + }, + ), + DisaggregatedTestConfig.from_base( + _ds_v3_lite_4_gpus_cfg, + test_name="ds_v3_lite_nixl", + global_config={ + "cache_transceiver_config": { + "backend": "NIXL", + }, + }, + skip_arm_arch=True, + env_vars={ + "TRTLLM_USE_NIXL_KVCACHE": "1", + "UCX_TLS": "^ib" + }, + ), + # 4 ranks + _ds_v3_lite_4_gpus_cfg, + DisaggregatedTestConfig.from_base( + _ds_v3_lite_4_gpus_cfg, + test_name="ds_v3_lite_adp", + global_config={ + "enable_attention_dp": True, + }, + ), + DisaggregatedTestConfig.from_base( + _ds_v3_lite_4_gpus_cfg, + test_name="ds_v3_lite_adp_overlap", + global_config={ + "enable_attention_dp": True, + }, + ctx_config={ + "disable_overlap_scheduler": True, + }, + gen_config={ + "disable_overlap_scheduler": False, + }, + ), + DisaggregatedTestConfig.from_base( + _ds_v3_lite_4_gpus_cfg, + test_name="ds_v3_lite_adp_overlap_cuda_graph", + global_config={ + "enable_attention_dp": True, + }, + ctx_config={ + "disable_overlap_scheduler": True, + }, + gen_config={ + "cuda_graph_config": { + "enable_padding": False + }, + "disable_overlap_scheduler": False, + }, + ), + DisaggregatedTestConfig.from_base( + _ds_v3_lite_4_gpus_cfg, + test_name="ds_v3_lite_overlap_cuda_graph", + gen_config={ + "cuda_graph_config": { + "enable_padding": False + }, + "disable_overlap_scheduler": False, + }, + ), + DisaggregatedTestConfig.from_base( + _ds_v3_lite_4_gpus_cfg, + test_name="ds_v3_lite_adp_mtp", + global_config={ + "speculative_config": { + "decoding_type": "MTP", + "num_nextn_predict_layers": 1 + }, + "enable_attention_dp": True, + }, + ), + DisaggregatedTestConfig.from_base( + _ds_v3_lite_4_gpus_cfg, + test_name="ds_v3_lite_mtp", + global_config={ + "speculative_config": { + "decoding_type": "MTP", + "num_nextn_predict_layers": 1 + }, + }, + ), +] + + +def get_test_id(config: DisaggregatedTestConfig) -> str: + """Generate test ID from config.""" + return config.test_name + + +def apply_skip_marks(config: DisaggregatedTestConfig): + """Apply skip markers based on configuration.""" + markers = [] + + if config.skip_device_count is not None: + markers.append(pytest.mark.skip_less_device(config.skip_device_count)) + + return markers + + +def pytest_generate_tests(metafunc): + """Generate test cases 
dynamically based on TEST_CONFIGS.""" + if "config" in metafunc.fixturenames: + configs = TEST_CONFIGS + + # Apply marks + marked_configs = [] + for config in configs: + marks = apply_skip_marks(config) + if marks: + marked_configs.append( + pytest.param(config, marks=marks, id=get_test_id(config))) + else: + marked_configs.append( + pytest.param(config, id=get_test_id(config))) + + metafunc.parametrize("config", marked_configs) + + +def run_disaggregated_test_parametrized(example_dir, + config: DisaggregatedTestConfig, + env=None, + cwd=None, + extra_endpoints_test=None): + """Run disaggregated test with parametrized configuration. + + Args: + example_dir: Path to the examples directory + config: DisaggregatedTestConfig with all test parameters + env: Environment variables + cwd: Working directory for test execution + extra_endpoints_test: Optional callback for additional endpoint validation + """ + import subprocess + + from defs.trt_test_alternative import popen + + from tensorrt_llm._utils import mpi_disabled + from tensorrt_llm.logger import logger + + cleanup_output_files() + run_env = env.copy() + run_env["UCX_TLS"] = "^ib" + + # Generate config file + config_path = config.generate_yaml_config(cwd) + + # Print generated config for debugging + print(f"\n{'='*80}") + print(f"Generated YAML config for test: {config.test_name}") + print(f"Config file: {config_path}") + print(f"{'='*80}") + with open(config_path, 'r') as f: + print(f.read()) + print(f"{'='*80}\n") + + try: + num_ranks = config.get_num_ranks() + use_ray = mpi_disabled() + + if not use_ray: + workers_cmd = [ + 'mpirun', '--allow-run-as-root', '--oversubscribe', '-n', + str(num_ranks), 'trtllm-serve', 'disaggregated_mpi_worker', + '-c', config_path + ] + else: + pytest.skip( + "https://nvbugs/5584607 Ray orchestrator is not supported with NIXL(DEFAULT) cache transceiver backend." 
+ ) + # Check backend compatibility + backend = config.global_config.get('backend', 'pytorch') + if backend != "pytorch": + pytest.skip( + "Ray orchestrator is only supported with pytorch backend.") + + # Generate extra config files for Ray workers + def get_extra_llm_config(server_config, suffix): + extra_config = { + 'orchestrator_type': 'ray', + } + for key, value in server_config.items(): + if key not in ['num_instances', 'urls']: + extra_config[key] = value + return extra_config + + extra_config_files = [] + workers_cmds = [] + + # Create config for context servers + ctx_num_instances = config.ctx_config.get('num_instances', 1) + for i in range(ctx_num_instances): + extra_llm_config = get_extra_llm_config(config.ctx_config, + f'ctx_{i}') + extra_file = os.path.join(cwd, + f'{config.test_name}_ctx_{i}.yaml') + with open(extra_file, 'w') as f: + yaml.dump(extra_llm_config, f, default_flow_style=False) + extra_config_files.append(extra_file) + workers_cmds.append([ + 'trtllm-serve', 'disaggregated_ray_worker', '-c', + extra_file, '--model', config.model_root + ]) + + # Create config for generation servers + gen_num_instances = config.gen_config.get('num_instances', 1) + for i in range(gen_num_instances): + extra_llm_config = get_extra_llm_config(config.gen_config, + f'gen_{i}') + extra_file = os.path.join(cwd, + f'{config.test_name}_gen_{i}.yaml') + with open(extra_file, 'w') as f: + yaml.dump(extra_llm_config, f, default_flow_style=False) + extra_config_files.append(extra_file) + workers_cmds.append([ + 'trtllm-serve', 'disaggregated_ray_worker', '-c', + extra_file, '--model', config.model_root + ]) + + server_start_timeout = 1200 + server_cmd = [ + 'trtllm-serve', 'disaggregated', '--server_start_timeout', + str(server_start_timeout), '-c', config_path + ] + server_url = get_disagg_server_url_from_cfg(config_path) + + try: + if not use_ray: + with (open('output_workers.log', 'w') as output_workers, + popen(workers_cmd, + stdout=output_workers, + stderr=subprocess.STDOUT, + env=run_env, + cwd=cwd) as + workers_proc, open('output_disagg.log', + 'w') as output_disagg, + popen(server_cmd, + stdout=output_disagg, + stderr=subprocess.STDOUT, + env=run_env, + cwd=cwd) as server_proc): + run_client_tests(example_dir, + config_path, + config.test_name, + config.num_iters, + env, + server_start_timeout, + config.prompt_file, + extra_endpoints_test, + server_url, + workers_proc, + server_proc, + use_ray=False) + else: + # Ray orchestrator path + workers_proc = [] + for worker_cmd in workers_cmds: + workers_proc.append( + popen(worker_cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + env=run_env, + cwd=cwd)) + + # Enter all worker contexts + for proc_cm in workers_proc: + proc_cm.__enter__() + + with (open('output_disagg.log', 'w') as output_disagg, + popen(server_cmd, + stdout=output_disagg, + stderr=subprocess.STDOUT, + env=run_env, + cwd=cwd) as server_proc): + run_client_tests(example_dir, + config_path, + config.test_name, + config.num_iters, + env, + server_start_timeout, + config.prompt_file, + extra_endpoints_test, + server_url, + workers_proc, + server_proc, + use_ray=True) + except Exception: + logger.error("-------- Workers output --------") + if not use_ray and os.path.exists('output_workers.log'): + with open('output_workers.log', 'r') as f: + logger.error(f.read()) + + logger.error("-------- Disagg server output --------") + if os.path.exists('output_disagg.log'): + with open('output_disagg.log', 'r') as f: + logger.error(f.read()) + raise + finally: + if use_ray: + 
subprocess.run(['ray', 'stop', '--force'], check=False) + for extra_file in extra_config_files: + if os.path.exists(extra_file): + os.remove(extra_file) + else: + if 'server_proc' in locals() and 'workers_proc' in locals(): + server_proc.terminate() + workers_proc.terminate() + server_proc.wait() + workers_proc.wait() + finally: + # Cleanup generated config + if os.path.exists(config_path): + os.remove(config_path) + + +@pytest.fixture(scope="function") +def model_root_fixture(config, llm_venv, request): + """Fixture that provides the correct model root based on config.""" + from defs.conftest import llm_models_root + + models_root = llm_models_root() + + print("Running model root fixture for config: ", config.test_name) + if config.model_root == "TinyLlama/TinyLlama-1.1B-Chat-v1.0": + src_root = os.path.join(models_root, "llama-models-v2", + "TinyLlama-1.1B-Chat-v1.0") + else: + src_root = os.path.join(models_root, config.model_root) + + dst_root = f"{llm_venv.get_working_directory()}/{config.model_root}" + + # Create symlink + if not os.path.exists(dst_root) and not os.path.islink(dst_root): + os.makedirs(os.path.dirname(dst_root), exist_ok=True) + os.symlink(src_root, dst_root, target_is_directory=True) + + return src_root + + +def test_disagg( + config: DisaggregatedTestConfig, + disaggregated_test_root, + disaggregated_example_root, + llm_venv, + model_root_fixture, +): + """Parametrized test for all disaggregated configurations.""" + # Apply skip conditions that can't be marks + if config.skip_hopper: + skip_no_hopper() + + if config.skip_arm_arch: + skip_arm() + + # Setup environment + env = llm_venv._new_env.copy() + + # Handle special validation cases + extra_endpoints_test = None + kv_cache_output_path = None + + if config.extra_validation == "perf_metrics": + # Test /perf_metrics endpoint + def extra_endpoints_test(server_url: str): + import json + import urllib.request + + with urllib.request.urlopen(f"{server_url}/perf_metrics", + timeout=10) as resp: + assert resp.status == 200 + perf_metrics = json.load(resp) + assert len(perf_metrics) > 0 + item = perf_metrics[0] + + # Use helper function to validate all timing metrics comprehensively + validate_timing_metrics(item, "perf_metrics test") + + elif config.extra_validation == "kv_cache_time": + # Test KV cache time output files + kv_cache_output_path = os.path.join(llm_venv.get_working_directory(), + "cache_time") + env["TRTLLM_KVCACHE_TIME_OUTPUT_PATH"] = kv_cache_output_path + + # Apply test-specific environment variables + if config.env_vars: + env.update(config.env_vars) + + # Run the test + run_disaggregated_test_parametrized( + disaggregated_example_root, + config, + env=env, + cwd=llm_venv.get_working_directory(), + extra_endpoints_test=extra_endpoints_test) + + # Post-test validation for kv_cache_time + if config.extra_validation == "kv_cache_time": + assert os.path.isdir(kv_cache_output_path) + send_file = os.path.join(kv_cache_output_path, "rank_0_send.csv") + recv_file = os.path.join(kv_cache_output_path, "rank_1_recv.csv") + assert os.path.exists(send_file) + assert os.path.exists(recv_file) + with open(send_file, "r") as f: + lines = f.readlines() + assert len(lines) > 1 + assert lines[0].startswith( + "RequestID,RequestInfo,Preparation,Preprocess,Transmissions,Postprocess" + ) + assert ",Delay,Duration,Bandwidth(Gbps)" in lines[0] + # get a send sample and match the recv + sample = lines[1].split(',') + assert len(sample) >= 9 + with open(recv_file, "r") as f: + lines = f.readlines() + assert len(lines) > 1 + 
matched = False + for line in lines: + sample_recv = line.split(',') + if sample_recv[0] == sample[0]: + matched = True + break + assert matched diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt index 22bffa4c426..dce78c4e65c 100644 --- a/tests/integration/test_lists/qa/llm_function_core.txt +++ b/tests/integration/test_lists/qa/llm_function_core.txt @@ -763,26 +763,25 @@ examples/serve/test_serve_negative.py::test_extremely_large_batch # PyTorch flow disaggregated tests -disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_multi_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun_trt_backend[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_mpi[DeepSeek-V3-Lite-fp8] -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8] -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8] -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp[DeepSeek-V3-Lite-fp8] -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one[DeepSeek-V3-Lite-fp8] -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one_mtp[DeepSeek-V3-Lite-fp8] -disaggregated/test_disaggregated.py::test_disaggregated_load_balance[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_trtllm_sampler[TinyLlama-1.1B-Chat-v1.0] +disaggregated/test_disaggregated_parametrized.py::test_disagg[2_ranks] +disaggregated/test_disaggregated_parametrized.py::test_disagg[perf_metrics] +disaggregated/test_disaggregated_parametrized.py::test_disagg[kv_cache_time_output] +disaggregated/test_disaggregated_parametrized.py::test_disagg[trt_backend] +disaggregated/test_disaggregated_parametrized.py::test_disagg[cuda_graph] +disaggregated/test_disaggregated_parametrized.py::test_disagg[load_balance] +disaggregated/test_disaggregated_parametrized.py::test_disagg[cache_aware_balance] +disaggregated/test_disaggregated_parametrized.py::test_disagg[trtllm_sampler] +disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_mpi] +disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_ucx] +disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_nixl] +disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_adp] +disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_mtp] +disaggregated/test_disaggregated_parametrized.py::test_disagg[1ctx_tp2pp2_1gen_tp2pp2] +disaggregated/test_disaggregated_parametrized.py::test_disagg[1ctx_tp1pp4_1gen_tp1pp4] disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[False-False-Qwen3-8B-FP8] disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[False-True-Qwen3-8B-FP8] disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[True-False-Qwen3-8B-FP8] disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[True-True-Qwen3-8B-FP8] 
-disaggregated/test_disaggregated.py::test_disaggregated_ctxtp2pp2_gentp2pp2[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_ctxpp4_genpp4[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_kv_cache_time_output[TinyLlama-1.1B-Chat-v1.0] disaggregated/test_workers.py::test_workers_conditional_disaggregation[TinyLlama-1.1B-Chat-v1.0] disaggregated/test_workers.py::test_workers_kv_cache_events[TinyLlama-1.1B-Chat-v1.0] disaggregated/test_workers.py::test_workers_kv_cache_aware_router[TinyLlama-1.1B-Chat-v1.0] diff --git a/tests/integration/test_lists/qa/llm_function_core_sanity.txt b/tests/integration/test_lists/qa/llm_function_core_sanity.txt index e6eb445bf88..38c1740f180 100644 --- a/tests/integration/test_lists/qa/llm_function_core_sanity.txt +++ b/tests/integration/test_lists/qa/llm_function_core_sanity.txt @@ -194,20 +194,17 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency] accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[fp8-latency] accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[mxfp8-latency] accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype -disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one_mtp[DeepSeek-V3-Lite-fp8] -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one[DeepSeek-V3-Lite-fp8] -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp[DeepSeek-V3-Lite-fp8] -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_mpi[DeepSeek-V3-Lite-fp8] -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8] -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8] -disaggregated/test_disaggregated.py::test_disaggregated_load_balance[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_multi_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun_trt_backend[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_trtllm_sampler[TinyLlama-1.1B-Chat-v1.0] +disaggregated/test_disaggregated_parametrized.py::test_disagg[2_ranks] +disaggregated/test_disaggregated_parametrized.py::test_disagg[cache_aware_balance] +disaggregated/test_disaggregated_parametrized.py::test_disagg[load_balance] +disaggregated/test_disaggregated_parametrized.py::test_disagg[cuda_graph] +disaggregated/test_disaggregated_parametrized.py::test_disagg[trt_backend] +disaggregated/test_disaggregated_parametrized.py::test_disagg[trtllm_sampler] +disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_adp_mtp] +disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_adp] +disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_mpi] +disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_nixl] 
+disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_ucx] disaggregated/test_workers.py::test_workers_conditional_disaggregation[TinyLlama-1.1B-Chat-v1.0] disaggregated/test_workers.py::test_workers_kv_cache_aware_router_eviction[TinyLlama-1.1B-Chat-v1.0] disaggregated/test_workers.py::test_workers_kv_cache_aware_router[TinyLlama-1.1B-Chat-v1.0] diff --git a/tests/integration/test_lists/test-db/l0_a10.yml b/tests/integration/test_lists/test-db/l0_a10.yml index 5fc56bd938f..3aa78d5e9d2 100644 --- a/tests/integration/test_lists/test-db/l0_a10.yml +++ b/tests/integration/test_lists/test-db/l0_a10.yml @@ -28,17 +28,18 @@ l0_a10: - unittest/disaggregated/test_remoteDictionary.py - unittest/disaggregated/test_disagg_cluster_manager_worker.py - unittest/disaggregated/test_cluster_storage.py - - disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] - - disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun_trt_backend[TinyLlama-1.1B-Chat-v1.0] - - disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0] - - disaggregated/test_disaggregated.py::test_disaggregated_mixed[TinyLlama-1.1B-Chat-v1.0] - - disaggregated/test_disaggregated.py::test_disaggregated_overlap[TinyLlama-1.1B-Chat-v1.0] - - disaggregated/test_disaggregated.py::test_disaggregated_diff_max_tokens[TinyLlama-1.1B-Chat-v1.0] - - disaggregated/test_disaggregated.py::test_disaggregated_kv_cache_time_output[TinyLlama-1.1B-Chat-v1.0] - - disaggregated/test_disaggregated.py::test_disaggregated_perf_metrics[TinyLlama-1.1B-Chat-v1.0] - - disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0] - - disaggregated/test_disaggregated.py::test_disaggregated_conditional[TinyLlama-1.1B-Chat-v1.0] - - disaggregated/test_disaggregated.py::test_disaggregated_ngram[TinyLlama-1.1B-Chat-v1.0] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[2_ranks] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[trt_backend] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[cuda_graph] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[mixed] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[overlap] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[diff_max_tokens] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[kv_cache_time_output] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[perf_metrics] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[cache_aware_balance] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[conditional] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[ngram] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[load_balance] - disaggregated/test_workers.py::test_workers_conditional_disaggregation[TinyLlama-1.1B-Chat-v1.0] - disaggregated/test_workers.py::test_workers_kv_cache_events[TinyLlama-1.1B-Chat-v1.0] - disaggregated/test_workers.py::test_workers_kv_cache_aware_router[TinyLlama-1.1B-Chat-v1.0] diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml index 0e216d4acce..2b35fe5e75e 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml @@ -41,7 +41,8 @@ l0_dgx_b200: - 
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=True] ISOLATION - accuracy/test_llm_api_pytorch.py::TestQwen3NextThinking::test_auto_dtype[tp4ep4] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_ucx] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_nixl] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-trtllm-auto] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass-auto] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-triton-auto] @@ -49,9 +50,8 @@ l0_dgx_b200: - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm-fp8] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-auto] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16] - - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8] + - disaggregated/test_disaggregated_benchmark.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16] + - disaggregated/test_disaggregated_benchmark.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8] - accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4 - accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp2pp2 - accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4 @@ -73,10 +73,10 @@ l0_dgx_b200: - unittest/llmapi/test_llm_multi_gpu_pytorch.py -m "gpu4" - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=False-attn_backend=TRTLLM-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp2pp2-fp8kv=False-attn_backend=TRTLLM-torch_compile=False] - - disaggregated/test_disaggregated.py::test_disaggregated_ctxpp2_genpp2[TinyLlama-1.1B-Chat-v1.0] - - disaggregated/test_disaggregated.py::test_disaggregated_ctxtp2_genpp2[TinyLlama-1.1B-Chat-v1.0] - - disaggregated/test_disaggregated.py::test_disaggregated_ctxpp2_gentp2[TinyLlama-1.1B-Chat-v1.0] - - disaggregated/test_disaggregated.py::test_disaggregated_ctxpp4_gentp4[TinyLlama-1.1B-Chat-v1.0] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[1ctx_tp1pp2_1gen_tp1pp2] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[1ctx_tp2pp1_1gen_tp1pp2] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[1ctx_tp1pp2_1gen_tp2pp1] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[1ctx_tp1pp4_1gen_tp4pp1] - examples/test_ray.py::test_llm_inference_distributed_ray[tp2pp2] - examples/test_ray.py::test_ray_disaggregated_serving[tp2] - condition: @@ -186,4 +186,4 @@ l0_dgx_b200: - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm-fp8] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton-auto] - - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] + - 
disaggregated/test_disaggregated_benchmark.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] diff --git a/tests/integration/test_lists/test-db/l0_dgx_b300.yml b/tests/integration/test_lists/test-db/l0_dgx_b300.yml index 82f78afbf98..b405261ce3a 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_b300.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_b300.yml @@ -59,18 +59,18 @@ l0_dgx_b300: - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_trtllm-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=True] - accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_ucx] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_nixl] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-fp8] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-auto] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm-auto] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton-auto] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-auto] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16] - - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] - - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-fp8] + - disaggregated/test_disaggregated_benchmark.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16] + - disaggregated/test_disaggregated_benchmark.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] + - disaggregated/test_disaggregated_benchmark.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8] + - disaggregated/test_disaggregated_benchmark.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-fp8] - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp4] TIMEOUT (180) diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml index 1518613c1d0..730427e8890 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml @@ -74,13 +74,13 @@ l0_dgx_h100: - test_e2e.py::test_ptp_quickstart_advanced_bs1 - test_e2e.py::test_ptp_quickstart_advanced_deepseek_v3_lite_4gpus_adp_balance[DeepSeek-V3-Lite-FP8-DeepSeek-V3-Lite/fp8] # ------------- Disaggregated serving tests --------------- - - disaggregated/test_disaggregated.py::test_disaggregated_multi_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] - - disaggregated/test_disaggregated.py::test_disaggregated_multi_gpu_with_mpirun_trt_backend[TinyLlama-1.1B-Chat-v1.0] - - 
disaggregated/test_disaggregated.py::test_disaggregated_ctxpp2_genpp2[TinyLlama-1.1B-Chat-v1.0] - - disaggregated/test_disaggregated.py::test_disaggregated_ctxtp2_genpp2[TinyLlama-1.1B-Chat-v1.0] - - disaggregated/test_disaggregated.py::test_disaggregated_ctxpp2_gentp2[TinyLlama-1.1B-Chat-v1.0] - - disaggregated/test_disaggregated.py::test_disaggregated_ctxpp4_gentp4[TinyLlama-1.1B-Chat-v1.0] - - disaggregated/test_disaggregated.py::test_disaggregated_genbs1[TinyLlama-1.1B-Chat-v1.0] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[1ctx_tp2pp1_2gen_tp1pp1] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[1ctx_tp2pp1_2gen_tp1pp1_trt] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[1ctx_tp1pp2_1gen_tp1pp2] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[1ctx_tp2pp1_1gen_tp1pp2] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[1ctx_tp1pp2_1gen_tp2pp1] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[1ctx_tp1pp4_1gen_tp4pp1] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[gen_only_bs1] - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp1pp2] - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp1pp2] - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp2pp1] @@ -144,20 +144,19 @@ l0_dgx_h100: - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_cuda_graph_padding_4gpus[attention_dp=True-mtp_nextn=2] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus_static_eplb - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus[xgrammar-mtp_nextn=2] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_mpi[DeepSeek-V3-Lite-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp[DeepSeek-V3-Lite-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_overlap[DeepSeek-V3-Lite-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one[DeepSeek-V3-Lite-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one_mtp[DeepSeek-V3-Lite-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_tp1_attention_dp_overlap_one_mtp[DeepSeek-V3-Lite-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_overlap_cuda_graph[DeepSeek-V3-Lite-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_overlap_cuda_graph[DeepSeek-V3-Lite-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_bf16_cache_aware_balance[DeepSeek-V3-Lite-bf16] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_bf16_conditional[DeepSeek-V3-Lite-bf16] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_mpi] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_ucx] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_nixl] + - 
disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_adp] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_adp_overlap] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_mtp] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_adp_mtp] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_adp_overlap_cuda_graph] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_overlap_cuda_graph] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_tp1_mtp_adp_overlap] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_tp1_cache_aware_balance] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_tp1_conditional] - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=2] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ctxpp2_gentp2_one_mtp[DeepSeek-V3-Lite-fp8] - disaggregated/test_workers.py::test_workers_conditional_disaggregation_deepseek_v3_lite_bf16[DeepSeek-V3-Lite-bf16] - disaggregated/test_workers.py::test_workers_kv_cache_aware_router_deepseek_v3_lite_bf16[DeepSeek-V3-Lite-bf16] - condition: diff --git a/tests/integration/test_lists/test-db/l0_dgx_h200.yml b/tests/integration/test_lists/test-db/l0_dgx_h200.yml index 05935956a2b..d798b354761 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_h200.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_h200.yml @@ -31,8 +31,8 @@ l0_dgx_h200: - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=4] - accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[False] - accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True] - - disaggregated/test_disaggregated.py::test_disaggregated_ctxtp2pp2_gentp2pp2[TinyLlama-1.1B-Chat-v1.0] - - disaggregated/test_disaggregated.py::test_disaggregated_ctxpp4_genpp4[TinyLlama-1.1B-Chat-v1.0] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[1ctx_tp2pp2_1gen_tp2pp2] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[1ctx_tp1pp4_1gen_tp1pp4] - unittest/llmapi/test_llm_pytorch.py::test_nemotron_nas_lora - condition: ranges: @@ -119,10 +119,10 @@ l0_dgx_h200: - test_e2e.py::test_trtllm_bench_llmapi_launch[pytorch_backend-llama-v3-llama3-8b] - test_e2e.py::test_trtllm_bench_mgmn - unittest/_torch/multi_gpu -m "post_merge" TIMEOUT (90) - - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16] - - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] - - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-fp8] + - disaggregated/test_disaggregated_benchmark.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16] + - disaggregated/test_disaggregated_benchmark.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] + - disaggregated/test_disaggregated_benchmark.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8] + - disaggregated/test_disaggregated_benchmark.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-fp8] # ------------- AutoDeploy tests --------------- - 
accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-4] - condition: diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml index 3b7d94d38c3..d0ba7f1fbd9 100644 --- a/tests/integration/test_lists/test-db/l0_h100.yml +++ b/tests/integration/test_lists/test-db/l0_h100.yml @@ -72,11 +72,10 @@ l0_h100: - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_cuda_graph_padding[mtp_nextn=2] - test_e2e.py::test_trtllm_bench_pytorch_backend_sanity[meta-llama/Llama-3.1-8B-llama-3.1-8b-False-False] - test_e2e.py::test_trtllm_bench_pytorch_backend_sanity[meta-llama/Llama-3.1-8B-llama-3.1-8b-instruct-hf-fp8-True-True] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_tp1_single_gpu[DeepSeek-V3-Lite-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_tp1_single_gpu_mtp[DeepSeek-V3-Lite-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_tp1_two_mtp[DeepSeek-V3-Lite-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx_tp1_single_gpu[DeepSeek-V3-Lite-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_load_balance[TinyLlama-1.1B-Chat-v1.0] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_tp1] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_tp1_mtp] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[ds_v3_lite_tp1_mtp2] + - disaggregated/test_disaggregated_parametrized.py::test_disagg[load_balance] - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_deepseek[False-False-DeepSeek-V3-Lite-fp8/fp8] - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_deepseek[False-True-DeepSeek-V3-Lite-fp8/fp8] - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_deepseek[True-False-DeepSeek-V3-Lite-fp8/fp8]