diff --git a/.buildkite/test_areas/model_runner_v2.yaml b/.buildkite/test_areas/model_runner_v2.yaml
new file mode 100644
index 000000000000..fa05e2247d1e
--- /dev/null
+++ b/.buildkite/test_areas/model_runner_v2.yaml
@@ -0,0 +1,111 @@
+group: Model Runner V2
+depends_on:
+  - image-build
+steps:
+- label: Model Runner V2 Core Tests
+  timeout_in_minutes: 45
+  source_file_dependencies:
+    - vllm/v1/worker/gpu/
+    - vllm/v1/worker/gpu_worker.py
+    - vllm/v1/core/sched/
+    - vllm/v1/attention/
+    - tests/v1/engine/test_llm_engine.py
+    - tests/v1/e2e/
+    - tests/v1/entrypoints/llm/test_struct_output_generate.py
+  commands:
+    - set -x
+    - export VLLM_USE_V2_MODEL_RUNNER=1
+    - pytest -v -s v1/engine/test_llm_engine.py -k "not test_engine_metrics"
+    # The next test is run with ENFORCE_EAGER=1 until the CG (presumably CUDA graph) correctness issues are sorted out.
+    # TODO: remove ENFORCE_EAGER here after https://github.com/vllm-project/vllm/pull/32936 is merged.
+    - ENFORCE_EAGER=1 pytest -v -s v1/e2e/test_async_scheduling.py -k "not ngram"
+    - pytest -v -s v1/e2e/test_context_length.py
+    - pytest -v -s v1/e2e/test_min_tokens.py
+    # Temporary hack: the -k filter below skips speculative_config0/6/7/8 to exclude the ngram spec decoding based tests.
+    - pytest -v -s v1/entrypoints/llm/test_struct_output_generate.py -k "xgrammar and not speculative_config6 and not speculative_config7 and not speculative_config8 and not speculative_config0"
+
+- label: Model Runner V2 Examples
+  timeout_in_minutes: 45
+  working_dir: "/vllm-workspace/examples"
+  source_file_dependencies:
+    - vllm/v1/worker/gpu/
+    - vllm/v1/core/sched/
+    - vllm/v1/worker/gpu_worker.py
+    - examples/offline_inference/
+    - examples/basic/offline_inference/
+    - examples/pooling/embed/vision_embedding_offline.py
+    - examples/others/tensorize_vllm_model.py
+  commands:
+    - set -x
+    - export VLLM_USE_V2_MODEL_RUNNER=1
+    - pip install tensorizer # for tensorizer test
+    - python3 basic/offline_inference/chat.py # for basic
+    - python3 basic/offline_inference/generate.py --model facebook/opt-125m
+    #- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 # TODO
+    #- python3 basic/offline_inference/embed.py # TODO
+    # for multi-modal models
+    - python3 offline_inference/audio_language.py --seed 0
+    - python3 offline_inference/vision_language.py --seed 0
+    - python3 offline_inference/vision_language_multi_image.py --seed 0
+    # TODO: uncomment the next command once https://github.com/vllm-project/vllm/pull/35790 is merged.
+    #- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 # TODO
+    # for pooling models
+    - python3 pooling/embed/vision_embedding_offline.py --seed 0
+    # for features demo
+    - python3 offline_inference/prefix_caching.py
+    - python3 offline_inference/llm_engine_example.py
+    - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+    - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+    # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+, causing this test to OOM on a 1xL4 GPU (presumably why --max-model-len is 1536 here).
+    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
+
+- label: Model Runner V2 Distributed (2 GPUs)
+  timeout_in_minutes: 45
+  working_dir: "/vllm-workspace/tests"
+  num_devices: 2
+  source_file_dependencies:
+    - vllm/v1/worker/gpu/
+    - vllm/v1/worker/gpu_worker.py
+    - tests/basic_correctness/test_basic_correctness.py
+    - tests/v1/distributed/test_async_llm_dp.py
+    - tests/v1/distributed/test_eagle_dp.py
+  commands:
+    - set -x
+    - export VLLM_USE_V2_MODEL_RUNNER=1
+    # The "and not True" term in the -k filter is a hacky way to exclude the prompt_embeds cases, which aren't yet supported.
+    - TARGET_TEST_SUITE=L4 pytest -v -s basic_correctness/test_basic_correctness.py -m 'distributed(num_gpus=2)' -k "not ray and not True"
+    # See https://github.com/NVIDIA/nccl/issues/1838
+    - export NCCL_CUMEM_HOST_ENABLE=0
+    - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py -k "not ray"
+    - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
+
+# These tests require the fix in https://github.com/vllm-project/vllm/pull/36280
+- label: Model Runner V2 Pipeline Parallelism (4 GPUs)
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/tests"
+  num_devices: 4
+  source_file_dependencies:
+    - vllm/v1/worker/gpu/
+    - vllm/v1/worker/gpu_worker.py
+    - tests/distributed/test_pipeline_parallel.py
+    #- tests/distributed/test_pp_cudagraph.py
+  commands:
+    - set -x
+    - export VLLM_USE_V2_MODEL_RUNNER=1
+    - pytest -v -s distributed/test_pipeline_parallel.py -k "not ray and not Jamba"
+    # TODO: uncomment the next command once https://github.com/vllm-project/vllm/pull/35162 is merged.
+    #- pytest -v -s distributed/test_pp_cudagraph.py -k "not ray"
+
+- label: Model Runner V2 Spec Decode
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+    - vllm/v1/worker/gpu/
+    - vllm/v1/worker/gpu_worker.py
+    - tests/v1/spec_decode/test_max_len.py
+    - tests/v1/e2e/test_spec_decode.py
+  commands:
+    - set -x
+    - export VLLM_USE_V2_MODEL_RUNNER=1
+    - pytest -v -s v1/spec_decode/test_max_len.py -k "eagle or mtp"
+    - pytest -v -s v1/e2e/test_spec_decode.py -k "eagle or mtp"