Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
a54fc74
is_null instead of 0 check
NickLucche Jan 7, 2026
c664dbf
get_sw_clippped_blocks to fix over-allocation for swa on D
NickLucche Jan 7, 2026
f284578
fix issue with null blocks on P being one extra (17) by clipping
NickLucche Jan 7, 2026
2e9e384
remove llama4 opt
NickLucche Jan 7, 2026
c1234f0
supportshma + scheduler change
NickLucche Jan 7, 2026
8cfd981
partial prefix cache hit + block_size_ratio + signatures
NickLucche Jan 12, 2026
06d2669
block failure handling + block_ratio handling + remove old request_fi…
NickLucche Jan 12, 2026
7198bec
update tests
NickLucche Jan 12, 2026
08c55dc
fix issue for heterogenuous block_size and layout
xuechendi Jan 13, 2026
6ec65ba
cpu-buffer case+precommit
NickLucche Jan 13, 2026
cf9c2e5
failure logging for hma
NickLucche Jan 13, 2026
d9cec70
hma e2e lm-eval test
NickLucche Jan 13, 2026
b22efd7
enable hma on all configs opt
NickLucche Feb 5, 2026
41122ab
request-level failure for hma
NickLucche Feb 6, 2026
3602394
add request-level failure tests
NickLucche Feb 6, 2026
0b48167
micro-opt for sw clip
NickLucche Feb 6, 2026
33bb65e
account for window across blocks
NickLucche Feb 20, 2026
b6870bc
revert all sched changes
NickLucche Feb 27, 2026
036af11
disable failure recovery
NickLucche Mar 2, 2026
a1ddbf6
fix
NickLucche Mar 2, 2026
9d08e75
missing sched changes
NickLucche Mar 2, 2026
380d543
rebase cruft
NickLucche Mar 2, 2026
b29597b
revert invalid block changes
NickLucche Mar 2, 2026
de7a452
update tests
NickLucche Mar 2, 2026
72a709d
revert sched changes
NickLucche Mar 5, 2026
dde50e0
precommit
NickLucche Mar 5, 2026
5ee9c4c
Merge branch 'main' into nixl-hma-rebase-no-recovery
NickLucche Mar 5, 2026
70f929e
cruft
NickLucche Mar 5, 2026
f9c31f3
max model len gemma
NickLucche Mar 5, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ tp_configs=(
"GPU_MEMORY_UTILIZATION=0.8 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" # MLA case
"GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny"
"GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=2 DECODER_TP_SIZE=1 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny"
"GPU_MEMORY_UTILIZATION=0.8 MODEL_NAMES=google/gemma-3-4b-it VLLM_SERVE_EXTRA_ARGS=--max-model-len,8192" # SW model
)
dp_ep_configs=(
"DP_EP=1 GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" # MLA+P-TP1, D-DPEP=2 (TP=1)
Expand All @@ -26,6 +27,14 @@ else
configs=("${tp_configs[@]}")
fi

# When ENABLE_HMA_FLAG is exported (any non-empty value), propagate it into
# every config entry of the selected array so each test run inherits it.
if [[ -n "${ENABLE_HMA_FLAG:-}" ]]; then
    echo "ENABLE_HMA_FLAG is set, appending ENABLE_HMA_FLAG=1 to each config"
    for idx in "${!configs[@]}"; do
        configs[idx]="ENABLE_HMA_FLAG=1 ${configs[idx]}"
    done
fi

run_tests() {
local label=$1
local extra_args=$2
Expand Down
37 changes: 36 additions & 1 deletion tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@ set -xe
KV_BUFFER_DEVICE="cuda" # Default to cuda
ATTENTION_BACKEND="" # Default to empty (use vllm default)
CROSS_LAYERS_BLOCKS="False"
# HMA (Hybrid KV Cache Manager) is disabled by default for the kv connector.
# Exporting ENABLE_HMA_FLAG to any non-empty value opts in by selecting the
# corresponding vllm serve flag; otherwise the variable stays empty.
ENABLE_HMA_VAR="${ENABLE_HMA_FLAG:+--no-disable-hybrid-kv-cache-manager}"

while [[ $# -gt 0 ]]; do
case $1 in
--kv_buffer_device)
Expand All @@ -31,6 +37,12 @@ echo "Running accuracy tests with kv_buffer_device=$KV_BUFFER_DEVICE"
if [[ -n "$ATTENTION_BACKEND" ]]; then
echo "Using attention backend: $ATTENTION_BACKEND"
fi
# Record the effective opt-in settings in the CI log (guard-clause form:
# skip the message when the corresponding variable is empty).
[[ -z "$ENABLE_HMA_VAR" ]] || echo "HMA (Hybrid KV Cache Manager) enabled"
[[ -z "$VLLM_SERVE_EXTRA_ARGS" ]] || echo "vLLM serve extra args: $VLLM_SERVE_EXTRA_ARGS"

DECODER_KV_LAYOUT=${DECODER_KV_LAYOUT:-"HND"} # Default to HND, optional NHD
if [[ "$DECODER_KV_LAYOUT" == "NHD" ]]; then
Expand Down Expand Up @@ -70,6 +82,8 @@ DECODER_TP_SIZE=${DECODER_TP_SIZE:-1}
GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.2}
PREFILL_BLOCK_SIZE=${PREFILL_BLOCK_SIZE:-128}
DECODE_BLOCK_SIZE=${DECODE_BLOCK_SIZE:-128}
# Comma-separated extra args for vllm serve (e.g. --max-model-len,2048)
VLLM_SERVE_EXTRA_ARGS=${VLLM_SERVE_EXTRA_ARGS:-}

# Find the git repository root directory
GIT_ROOT=$(git rev-parse --show-toplevel)
Expand Down Expand Up @@ -151,14 +165,24 @@ run_tests_for_model() {
--gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
--tensor-parallel-size $PREFILLER_TP_SIZE \
--kv-transfer-config '$KV_CONFIG'"
# Splice user-supplied extra CLI args into the prefiller serve command.
# VLLM_SERVE_EXTRA_ARGS is comma-separated (e.g. "--max-model-len,8192");
# IFS=',' applies only to this `read`, so the global IFS is untouched.
# NOTE(review): because the args are re-joined into the eval'd BASE_CMD
# string, individual args cannot contain commas or spaces — confirm that
# is acceptable for all callers.
if [[ -n "$VLLM_SERVE_EXTRA_ARGS" ]]; then
IFS=',' read -r -a extra_args <<< "$VLLM_SERVE_EXTRA_ARGS"
for arg in "${extra_args[@]}"; do
BASE_CMD="${BASE_CMD} $arg"
done
fi

# Add attention backend config if specified
if [[ -n "$ATTENTION_BACKEND" ]]; then
BASE_CMD="${BASE_CMD} --attention-backend=$ATTENTION_BACKEND"
fi

# Add HMA flag if specified (ENABLE_HMA_VAR holds the opt-in serve flag,
# or is empty when HMA is disabled).
if [[ -n "$ENABLE_HMA_VAR" ]]; then
BASE_CMD="${BASE_CMD} $ENABLE_HMA_VAR"
fi

FULL_CMD="$BASE_CMD"

eval "$FULL_CMD &"

# Store host and port for proxy configuration
Expand Down Expand Up @@ -193,12 +217,23 @@ run_tests_for_model() {
--block-size ${DECODE_BLOCK_SIZE} \
--gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
--kv-transfer-config '$KV_CONFIG'"
# Splice user-supplied extra CLI args into the decoder serve command, same
# comma-separated convention as the prefiller path. IFS=',' is scoped to
# this `read` only.
# NOTE(review): args with embedded commas/spaces are unsupported — confirm
# acceptable; consider factoring this duplicated snippet into a helper.
if [[ -n "$VLLM_SERVE_EXTRA_ARGS" ]]; then
IFS=',' read -r -a extra_args <<< "$VLLM_SERVE_EXTRA_ARGS"
for arg in "${extra_args[@]}"; do
BASE_CMD="${BASE_CMD} $arg"
done
fi

# Add attention backend config if specified
if [[ -n "$ATTENTION_BACKEND" ]]; then
BASE_CMD="${BASE_CMD} --attention-backend=$ATTENTION_BACKEND"
fi

# Add HMA flag if specified (mirrors the prefiller path; ENABLE_HMA_VAR is
# empty when HMA is disabled, so this is a no-op in the default case).
if [[ -n "$ENABLE_HMA_VAR" ]]; then
BASE_CMD="${BASE_CMD} $ENABLE_HMA_VAR"
fi

# DP-EP attention mode
if [[ -z "$DP_EP" ]]; then
BASE_CMD="${BASE_CMD} --tensor-parallel-size $DECODER_TP_SIZE"
Expand Down
1 change: 1 addition & 0 deletions tests/v1/kv_connector/nixl_integration/test_accuracy.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
"deepseek-ai/deepseek-vl2-small": 0.59,
"deepseek-ai/deepseek-vl2-tiny": 0.19,
"deepseek-ai/DeepSeek-V2-Lite-Chat": 0.65,
"google/gemma-3-4b-it": 0.74,
}

SIMPLE_PROMPT = (
Expand Down
Loading