diff --git a/tests/v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh index 040632249d34..d0a56304f2a6 100755 --- a/tests/v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh +++ b/tests/v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh @@ -21,19 +21,20 @@ dp_ep_configs=( "DP_EP=1 GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" # MLA+P-TP1, D-DPEP=2 (TP=1) "DP_EP=1 GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=2 DECODER_TP_SIZE=2 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" # MLA+P-TP2, D-DPEP=2 (TP=1) ) +# We assume HMA enabled by default. hybrid_ssm_configs=( - "VLLM_SSM_CONV_STATE_LAYOUT=DS ENABLE_HMA_FLAG=1 GPU_MEMORY_UTILIZATION=0.8 MODEL_NAMES=ibm-granite/granite-4.0-h-tiny VLLM_SERVE_EXTRA_ARGS=--max-model-len,8192,--trust-remote-code" + "VLLM_SSM_CONV_STATE_LAYOUT=DS GPU_MEMORY_UTILIZATION=0.8 MODEL_NAMES=ibm-granite/granite-4.0-h-tiny VLLM_SERVE_EXTRA_ARGS=--max-model-len,8192,--trust-remote-code" # TODO: (NickLucche) Address async scheduling issue with TP>1 separately as this may impact other models. - "VLLM_SSM_CONV_STATE_LAYOUT=DS ENABLE_HMA_FLAG=1 PREFILLER_TP_SIZE=2 DECODER_TP_SIZE=2 GPU_MEMORY_UTILIZATION=0.8 MODEL_NAMES=ibm-granite/granite-4.0-h-tiny VLLM_SERVE_EXTRA_ARGS=--max-model-len,8192,--trust-remote-code,--no-async-scheduling" + "VLLM_SSM_CONV_STATE_LAYOUT=DS PREFILLER_TP_SIZE=2 DECODER_TP_SIZE=2 GPU_MEMORY_UTILIZATION=0.8 MODEL_NAMES=ibm-granite/granite-4.0-h-tiny VLLM_SERVE_EXTRA_ARGS=--max-model-len,8192,--trust-remote-code,--no-async-scheduling" # GDN (Qwen3.5) - "VLLM_SSM_CONV_STATE_LAYOUT=DS ENABLE_HMA_FLAG=1 GPU_MEMORY_UTILIZATION=0.8 MODEL_NAMES=Qwen/Qwen3.5-0.8B" - "VLLM_SSM_CONV_STATE_LAYOUT=DS ENABLE_HMA_FLAG=1 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2 GPU_MEMORY_UTILIZATION=0.8 MODEL_NAMES=Qwen/Qwen3.5-0.8B VLLM_SERVE_EXTRA_ARGS=--no-async-scheduling" + "VLLM_SSM_CONV_STATE_LAYOUT=DS GPU_MEMORY_UTILIZATION=0.8 MODEL_NAMES=Qwen/Qwen3.5-0.8B" + "VLLM_SSM_CONV_STATE_LAYOUT=DS PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2 GPU_MEMORY_UTILIZATION=0.8 MODEL_NAMES=Qwen/Qwen3.5-0.8B VLLM_SERVE_EXTRA_ARGS=--no-async-scheduling" ) sw_attn_configs=( # NOTE: gemma3 does not work with FlashInfer "GPU_MEMORY_UTILIZATION=0.8 MODEL_NAMES=google/gemma-3-4b-it VLLM_SERVE_EXTRA_ARGS=--max-model-len,8192" # SW model - "ENABLE_HMA_FLAG=1 GPU_MEMORY_UTILIZATION=0.8 MODEL_NAMES=google/gemma-3-4b-it PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2 VLLM_SERVE_EXTRA_ARGS=--max-model-len,8192" - "ENABLE_HMA_FLAG=1 GPU_MEMORY_UTILIZATION=0.8 MODEL_NAMES=google/gemma-3-4b-it PREFILLER_TP_SIZE=2 DECODER_TP_SIZE=1 VLLM_SERVE_EXTRA_ARGS=--max-model-len,8192" + "GPU_MEMORY_UTILIZATION=0.8 MODEL_NAMES=google/gemma-3-4b-it PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2 VLLM_SERVE_EXTRA_ARGS=--max-model-len,8192" + "GPU_MEMORY_UTILIZATION=0.8 MODEL_NAMES=google/gemma-3-4b-it PREFILLER_TP_SIZE=2 DECODER_TP_SIZE=1 VLLM_SERVE_EXTRA_ARGS=--max-model-len,8192" ) # Select config array based on DP_EP env var @@ -50,14 +51,6 @@ else configs=("${tp_configs[@]}") fi -if [[ -n "${ENABLE_HMA_FLAG:-}" ]]; then - # Append ENABLE_HMA_FLAG=1 to each config in the selected array - echo "ENABLE_HMA_FLAG is set, appending ENABLE_HMA_FLAG=1 to each config" - for i in "${!configs[@]}"; do - configs[$i]="ENABLE_HMA_FLAG=1 ${configs[$i]}" - done -fi - run_tests() { local label=$1 local extra_args=$2 diff --git a/tests/v1/kv_connector/nixl_integration/config_sweep_spec_decode_test.sh b/tests/v1/kv_connector/nixl_integration/config_sweep_spec_decode_test.sh index 313efc3968dd..f55bd308a0a6 100755 --- a/tests/v1/kv_connector/nixl_integration/config_sweep_spec_decode_test.sh +++ b/tests/v1/kv_connector/nixl_integration/config_sweep_spec_decode_test.sh @@ -11,7 +11,7 @@ SCRIPT="v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh" eagle3_config="SD_METHOD=eagle3 MODEL_NAME=meta-llama/Llama-3.1-8B-Instruct SD_MODEL=RedHatAI/Llama-3.1-8B-Instruct-speculator.eagle3 NUM_SPEC_TOKENS=3" # MTP: Qwen3.5-0.8B-Base with hybrid SSM flags. -mtp_config="SD_METHOD=mtp MODEL_NAME=Qwen/Qwen3.5-0.8B-Base SD_MODEL=Qwen/Qwen3.5-0.8B-Base NUM_SPEC_TOKENS=1 BLOCK_SIZE=32 MAX_MODEL_LEN=4096 VLLM_SSM_CONV_STATE_LAYOUT=DS ENABLE_HMA_FLAG=1 KV_BUFFER_DEVICES=cuda" +mtp_config="SD_METHOD=mtp MODEL_NAME=Qwen/Qwen3.5-0.8B-Base SD_MODEL=Qwen/Qwen3.5-0.8B-Base NUM_SPEC_TOKENS=1 BLOCK_SIZE=32 MAX_MODEL_LEN=4096 VLLM_SSM_CONV_STATE_LAYOUT=DS KV_BUFFER_DEVICES=cuda" configs=( "$eagle3_config" diff --git a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh index fc446a0e7658..bde246c9b661 100755 --- a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh +++ b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh @@ -5,11 +5,6 @@ set -xe KV_BUFFER_DEVICE="cuda" # Default to cuda ATTENTION_BACKEND="" # Default to empty (use vllm default) CROSS_LAYERS_BLOCKS="False" -ENABLE_HMA_VAR="" # Default to empty (HMA disabled by default for kv connector) -# Check for ENABLE_HMA_FLAG environment variable -if [[ -n "${ENABLE_HMA_FLAG:-}" ]]; then - ENABLE_HMA_VAR="--no-disable-hybrid-kv-cache-manager" -fi while [[ $# -gt 0 ]]; do case $1 in @@ -37,9 +32,6 @@ echo "Running accuracy tests with kv_buffer_device=$KV_BUFFER_DEVICE" if [[ -n "$ATTENTION_BACKEND" ]]; then echo "Using attention backend: $ATTENTION_BACKEND" fi -if [[ -n "$ENABLE_HMA_VAR" ]]; then - echo "HMA (Hybrid KV Cache Manager) enabled" -fi if [[ -n "$VLLM_SERVE_EXTRA_ARGS" ]]; then echo "vLLM serve extra args: $VLLM_SERVE_EXTRA_ARGS" fi @@ -180,10 +172,6 @@ run_tests_for_model() { BASE_CMD="${BASE_CMD} --attention-backend=$ATTENTION_BACKEND" fi - # Add HMA flag if specified - if [[ -n "$ENABLE_HMA_VAR" ]]; then - BASE_CMD="${BASE_CMD} $ENABLE_HMA_VAR" - fi FULL_CMD="$BASE_CMD" eval "$FULL_CMD &" @@ -232,10 +220,6 @@ run_tests_for_model() { BASE_CMD="${BASE_CMD} --attention-backend=$ATTENTION_BACKEND" fi - # Add HMA flag if specified - if [[ -n "$ENABLE_HMA_VAR" ]]; then - BASE_CMD="${BASE_CMD} $ENABLE_HMA_VAR" - fi # DP-EP attention mode if [[ -z "$DP_EP" ]]; then diff --git a/tests/v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh b/tests/v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh index 2c5622a2f0e1..bc90680a5334 100755 --- a/tests/v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh +++ b/tests/v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh @@ -27,7 +27,6 @@ # ROCM_AITER_UNIFIED_ATTN # NVIDIA options: FLASH_ATTN, FLASHINFER # VLLM_SSM_CONV_STATE_LAYOUT - SSM conv state layout (e.g. "DS" required for Mamba models) -# ENABLE_HMA_FLAG - set to 1 to enable hybrid KV cache manager # VLLM_SERVE_EXTRA_ARGS - comma-separated extra args for vllm serve set -ex @@ -85,13 +84,7 @@ if [[ -z "${ATTENTION_BACKEND:-}" ]]; then fi echo "Using attention backend: ${ATTENTION_BACKEND}" -# ── HMA & extra serve args ──────────────────────────────────────────── - -ENABLE_HMA_VAR="" -if [[ -n "${ENABLE_HMA_FLAG:-}" ]]; then - ENABLE_HMA_VAR="--no-disable-hybrid-kv-cache-manager" - echo "HMA (Hybrid KV Cache Manager) enabled" -fi +# ── Extra serve args ───────────────────────────────────────────────── EXTRA_SERVE_ARGS=() if [[ -n "${VLLM_SERVE_EXTRA_ARGS:-}" ]]; then @@ -258,7 +251,6 @@ run_test_for_device() { --kv-transfer-config "$kv_config" \ --speculative-config "$PREFILL_SPEC_CONFIG" \ --attention-backend $ATTENTION_BACKEND \ - ${ENABLE_HMA_VAR} \ ${EXTRA_SERVE_ARGS[@]+"${EXTRA_SERVE_ARGS[@]}"} & local SERVER_PID=$! @@ -298,7 +290,6 @@ run_test_for_device() { --kv-transfer-config "$kv_config" \ --speculative-config "$DECODE_SPEC_CONFIG" \ --attention-backend $ATTENTION_BACKEND \ - ${ENABLE_HMA_VAR} \ ${EXTRA_SERVE_ARGS[@]+"${EXTRA_SERVE_ARGS[@]}"} & local SERVER_PID=$! diff --git a/tests/v1/kv_connector/unit/test_nixl_connector_hma.py b/tests/v1/kv_connector/unit/test_nixl_connector_hma.py index 6d4e6565e373..8d54353f82a8 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector_hma.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector_hma.py @@ -386,8 +386,6 @@ def test_fewer_blocks_with_hma(monkeypatch, model_name, sw_size): "kv_transfer_config": kv_transfer_config, "max_model_len": 2048, "max_num_seqs": 1, - # NOTE: Make sure HMA is enabled - "disable_hybrid_kv_cache_manager": False, "max_num_batched_tokens": 2048, "enable_prefix_caching": False, "block_size": block_size,