diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index bad2796266a7..76207e5b368e 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -460,7 +460,7 @@ steps: - tests/lora - vllm/platforms/rocm.py commands: - - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py --ignore=lora/test_qwen35_densemodel_lora.py + - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_qwen3_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py --ignore=lora/test_qwen35_densemodel_lora.py #------------------------------------------------------ mi250 · model_executor -------------------------------------------------------# @@ -880,7 +880,7 @@ steps: - vllm/platforms/rocm.py commands: - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt - - ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh + - ATTENTION_BACKEND=ROCM_ATTN bash v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh - label: V1 e2e (2 GPUs) # TBD timeout_in_minutes: 180 @@ -929,6 +929,7 @@ steps: - tests/tokenizers_ - tests/reasoning - tests/tool_parsers + - tests/parser - tests/transformers_utils - tests/config commands: @@ -942,6 +943,7 @@ steps: - pytest -v -s tokenizers_ - pytest -v -s reasoning --ignore=reasoning/test_seedoss_reasoning_parser.py --ignore=reasoning/test_glm4_moe_reasoning_parser.py - pytest -v -s tool_parsers + - pytest -v -s parser - pytest -v -s transformers_utils - pytest -v -s config @@ -1100,13 +1102,13 @@ steps: - vllm/compilation/ - vllm/model_executor/layers - tests/compile/passes/distributed/ + - tests/compile/fusions_e2e/ - vllm/_aiter_ops.py - vllm/platforms/rocm.py commands: - export VLLM_TEST_CLEAN_GPU_MEMORY=1 - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py - - pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py - - pytest -v -s tests/compile/passes/distributed/test_tp2_ar_rms.py::test_tp2_ar_rms_fusions + - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py::test_tp2_ar_rms_fusions #----------------------------------------------------------- mi300 · cuda ------------------------------------------------------------# @@ -1320,7 +1322,6 @@ steps: commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s entrypoints/openai/completion --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py - - pytest -v -s entrypoints/openai/speech_to_text/ - pytest -v -s entrypoints/test_chat_utils.py - label: Entrypoints Integration (API Server openai - Part 3) # TBD @@ -1336,7 +1337,21 @@ steps: - tests/entrypoints/test_chat_utils commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion --ignore=entrypoints/openai/completion --ignore=entrypoints/openai/speech_to_text/ --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses --ignore=entrypoints/openai/test_multi_api_servers.py + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion --ignore=entrypoints/openai/completion --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses --ignore=entrypoints/openai/test_multi_api_servers.py + +- label: Entrypoints Integration (Speech to Text) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] + agent_pool: mi300_1 + fast_check: true + torch_nightly: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/entrypoints/speech_to_text + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/speech_to_text - label: Entrypoints Integration (LLM) # TBD timeout_in_minutes: 180 @@ -1760,7 +1775,7 @@ steps: - export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True - pytest -v -s -x lora/test_chatglm3_tp.py - pytest -v -s -x lora/test_llama_tp.py - - pytest -v -s -x lora/test_llm_with_multi_loras.py + - pytest -v -s -x lora/test_qwen3_with_multi_loras.py - pytest -v -s -x lora/test_olmoe_tp.py - pytest -v -s -x lora/test_gptoss_tp.py - pytest -v -s -x lora/test_qwen35_densemodel_lora.py @@ -1803,9 +1818,10 @@ steps: - tests/models/multimodal/generation - tests/models/multimodal/test_mapping.py commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation -m 'not core_model' --ignore models/multimodal/generation/test_common.py - - pytest -v -s models/multimodal/test_mapping.py + - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@rocm-7.0-v2.3.0' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.6.0' + - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB + - label: Multi-Modal Models (Extended Generation 2) # TBD timeout_in_minutes: 180 @@ -1817,8 +1833,10 @@ steps: - vllm/ - tests/models/multimodal/generation commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' + - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@rocm-7.0-v2.3.0' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.6.0' + - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' + - label: Multi-Modal Models (Extended Generation 3) # TBD timeout_in_minutes: 180 @@ -2763,7 +2781,6 @@ steps: commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s entrypoints/openai/completion --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py - - pytest -v -s entrypoints/openai/speech_to_text/ - pytest -v -s entrypoints/test_chat_utils.py - label: Entrypoints Integration (API Server openai - Part 3) # TBD @@ -2779,7 +2796,21 @@ steps: - tests/entrypoints/test_chat_utils commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion --ignore=entrypoints/openai/completion --ignore=entrypoints/openai/speech_to_text/ --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses --ignore=entrypoints/openai/test_multi_api_servers.py + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion --ignore=entrypoints/openai/completion --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses --ignore=entrypoints/openai/test_multi_api_servers.py + +- label: Entrypoints Integration (Speech to Text) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi355] + agent_pool: mi355_1 + fast_check: true + torch_nightly: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/entrypoints/speech_to_text + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/speech_to_text - label: Entrypoints Integration (Pooling) # TBD timeout_in_minutes: 180 @@ -3043,7 +3074,7 @@ steps: - vllm/ - tests/models/language/generation commands: - - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' + - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@rocm-7.0-v2.3.0' - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.6.0' - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' @@ -3318,7 +3349,7 @@ steps: - vllm/platforms/rocm.py commands: - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt - - ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh + - ATTENTION_BACKEND=ROCM_ATTN bash v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh - label: Distributed NixlConnector PD accuracy (4 GPUs) # TBD timeout_in_minutes: 180 diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml index ba92d3a3aec0..f9ddf2603085 100644 --- a/.buildkite/test_areas/entrypoints.yaml +++ b/.buildkite/test_areas/entrypoints.yaml @@ -11,7 +11,7 @@ steps: - tests/entrypoints/ commands: - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/serve/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/serve/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling --ignore=entrypoints/speech_to_text - label: Entrypoints Integration (LLM) key: entrypoints-integration-llm @@ -44,7 +44,6 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s entrypoints/openai/chat_completion --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py - - label: Entrypoints Integration (API Server openai - Part 2) key: entrypoints-integration-api-server-openai-part-2 timeout_in_minutes: 50 @@ -55,7 +54,6 @@ steps: - tests/entrypoints/test_chat_utils commands: - pytest -v -s entrypoints/openai/completion --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py - - pytest -v -s entrypoints/openai/speech_to_text/ - pytest -v -s entrypoints/test_chat_utils.py - label: Entrypoints Integration (API Server openai - Part 3) @@ -69,7 +67,7 @@ steps: - tests/entrypoints/test_chat_utils commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion --ignore=entrypoints/openai/completion --ignore=entrypoints/openai/speech_to_text/ --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses --ignore=entrypoints/openai/test_multi_api_servers.py + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion --ignore=entrypoints/openai/completion --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses --ignore=entrypoints/openai/test_multi_api_servers.py - label: Entrypoints Integration (API Server 2) key: entrypoints-integration-api-server-2 @@ -86,6 +84,17 @@ steps: - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc - pytest -v -s tool_use +- label: Entrypoints Integration (Speech to Text) + key: entrypoints-integration-speech_to_text + timeout_in_minutes: 50 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/entrypoints/speech_to_text + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/speech_to_text + - label: Entrypoints Integration (Pooling) key: entrypoints-integration-pooling timeout_in_minutes: 50 @@ -115,5 +124,5 @@ steps: - csrc/ - vllm/entrypoints/openai/ - vllm/model_executor/models/whisper.py - commands: # LMEval+Transcription WER check + commands: # LMEval - pytest -s entrypoints/openai/correctness/ diff --git a/.buildkite/test_areas/lora.yaml b/.buildkite/test_areas/lora.yaml index f540eb2fcc2a..8107f9b37ff0 100644 --- a/.buildkite/test_areas/lora.yaml +++ b/.buildkite/test_areas/lora.yaml @@ -9,7 +9,7 @@ steps: - vllm/lora - tests/lora commands: - - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py --ignore=lora/test_qwen35_densemodel_lora.py + - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_qwen3_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py --ignore=lora/test_qwen35_densemodel_lora.py parallelism: 4 @@ -19,6 +19,7 @@ steps: num_devices: 4 source_file_dependencies: - vllm/lora + - vllm/model_executor/layers/fused_moe/ - tests/lora commands: # FIXIT: find out which code initialize cuda before running the test @@ -30,7 +31,7 @@ steps: # requires multi-GPU testing for validation. - pytest -v -s -x lora/test_chatglm3_tp.py - pytest -v -s -x lora/test_llama_tp.py - - pytest -v -s -x lora/test_llm_with_multi_loras.py + - pytest -v -s -x lora/test_qwen3_with_multi_loras.py - pytest -v -s -x lora/test_olmoe_tp.py - pytest -v -s -x lora/test_gptoss_tp.py - pytest -v -s -x lora/test_qwen35_densemodel_lora.py \ No newline at end of file diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml index da04c18017db..2a78201a9e47 100644 --- a/.buildkite/test_areas/misc.yaml +++ b/.buildkite/test_areas/misc.yaml @@ -210,6 +210,7 @@ steps: - label: Python-only Installation key: python-only-installation depends_on: ~ + optional: true timeout_in_minutes: 20 source_file_dependencies: - tests/standalone_tests/python_only_compile.sh @@ -282,6 +283,7 @@ steps: - tests/tokenizers_ - tests/reasoning - tests/tool_parsers + - tests/parser - tests/transformers_utils - tests/config device: cpu-small @@ -296,6 +298,7 @@ steps: - pytest -v -s tokenizers_ - pytest -v -s reasoning --ignore=reasoning/test_seedoss_reasoning_parser.py --ignore=reasoning/test_glm4_moe_reasoning_parser.py - pytest -v -s tool_parsers + - pytest -v -s parser - pytest -v -s transformers_utils - pytest -v -s config diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index a20c5e7e9dce..44cf10076ee7 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -6,8 +6,8 @@ /vllm/distributed/kv_transfer @NickLucche @ApostaC @orozery @xuechendi /vllm/lora @jeejeelee /vllm/model_executor/layers/attention @LucasWilkinson @MatthewBonanni -/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety -/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety +/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety @zyongye +/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety @zyongye /vllm/model_executor/layers/mamba @tdoublep @tomeras91 /vllm/model_executor/layers/mamba/gdn_linear_attn.py @tdoublep @ZJY0516 @vadiklyutiy /vllm/model_executor/layers/rotary_embedding.py @vadiklyutiy @@ -18,7 +18,8 @@ /vllm/kernels/helion @ProExpertProg @zou3519 /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa /vllm/vllm_flash_attn @LucasWilkinson @MatthewBonanni -CMakeLists.txt @tlrmchlsmth @LucasWilkinson +/CMakeLists.txt @tlrmchlsmth @LucasWilkinson @Harry-Chen +/cmake @tlrmchlsmth @LucasWilkinson @Harry-Chen # Any change to the VllmConfig changes can have a large user-facing impact, # so spam a lot of people @@ -70,6 +71,10 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /vllm/v1/worker/gpu @WoosukKwon @njhill /vllm/v1/worker/gpu/kv_connector.py @orozery +# CI & building +/.buildkite @Harry-Chen +/docker/Dockerfile @Harry-Chen + # Test ownership /.buildkite/lm-eval-harness @mgoin /tests/distributed/test_multi_node_assignment.py @youkaichao @@ -77,11 +82,11 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /tests/distributed/test_same_node.py @youkaichao /tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @aarnphm @NickLucche /tests/evals @mgoin @vadiklyutiy -/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256 +/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256 @zyongye /tests/kernels/ir @ProExpertProg @tjtanaa /tests/models @DarkLight1337 @ywang96 /tests/multimodal @DarkLight1337 @ywang96 @NickLucche -/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 @pavanimajety +/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 @pavanimajety @zyongye /tests/test_inputs.py @DarkLight1337 @ywang96 /tests/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm /tests/v1/structured_output @mgoin @russellb @aarnphm @@ -147,6 +152,12 @@ mkdocs.yaml @hmellor # MTP-specific files /vllm/model_executor/models/deepseek_mtp.py @luccafong +# DeepseekV4-specific files +/vllm/v1/attention/ops/deepseek_v4_ops @zyongye +/vllm/model_executor/layers/deepseek_compressor.py @zyongye +/vllm/model_executor/layers/deepseek_v4_attention.py @zyongye +/vllm/model_executor/layers/sparse_attn_indexer.py @zyongye + # Mistral-specific files /vllm/model_executor/models/mistral*.py @patrickvonplaten /vllm/model_executor/models/mixtral*.py @patrickvonplaten diff --git a/CMakeLists.txt b/CMakeLists.txt index 13788fa87437..fd6c7eeffd06 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,8 +13,12 @@ cmake_minimum_required(VERSION 3.26) # cmake --install . --component _C project(vllm_extensions LANGUAGES CXX) -set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CUDA_STANDARD 20) +set(CMAKE_CUDA_STANDARD_REQUIRED ON) +set(CMAKE_HIP_STANDARD 20) +set(CMAKE_HIP_STANDARD_REQUIRED ON) # CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py) @@ -105,6 +109,24 @@ else() set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.7;8.9;9.0") endif() +# +# spinloop extension (pure CXX; must stay above the non-CUDA device branch so +# CPU builds define the target before the early return) +# +set(VLLM_SPINLOOP_EXT_SRC "csrc/spinloop.cpp") +set(SPINLOOP_COMPILE_FLAGS "") +if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64") + list(APPEND SPINLOOP_COMPILE_FLAGS "-mmwaitx") +endif() +define_extension_target( + spinloop + DESTINATION vllm + LANGUAGE CXX + SOURCES ${VLLM_SPINLOOP_EXT_SRC} + COMPILE_FLAGS ${SPINLOOP_COMPILE_FLAGS} + USE_SABI 3.11 + WITH_SOABI) + # # Forward the non-CUDA device extensions to external CMake scripts. # diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index d27a5ea93dea..361f08b51054 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -1,7 +1,7 @@ include(FetchContent) set(CMAKE_CXX_STANDARD_REQUIRED ON) -set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_EXTENSIONS ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) diff --git a/cmake/external_projects/deepgemm.cmake b/cmake/external_projects/deepgemm.cmake index 0d7ea43fb7d0..07328c271388 100644 --- a/cmake/external_projects/deepgemm.cmake +++ b/cmake/external_projects/deepgemm.cmake @@ -76,7 +76,6 @@ if(DEEPGEMM_ARCHS) "${deepgemm_SOURCE_DIR}/third-party/fmt/include") target_compile_options(_deep_gemm_C PRIVATE - $<$:-std=c++17> $<$:-O3> $<$:-Wno-psabi> $<$:-Wno-deprecated-declarations>) diff --git a/csrc/cache.h b/csrc/cache.h index 821d5e719a44..a9e74b0dc2df 100644 --- a/csrc/cache.h +++ b/csrc/cache.h @@ -12,7 +12,8 @@ void swap_blocks(torch::Tensor& src, torch::Tensor& dst, void swap_blocks_batch(const torch::Tensor& src_ptrs, const torch::Tensor& dst_ptrs, - const torch::Tensor& sizes); + const torch::Tensor& sizes, + bool is_src_access_order_any); void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, torch::Tensor& key_cache, torch::Tensor& value_cache, diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 895490f45a79..9130dd2ccae7 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -77,7 +77,8 @@ void swap_blocks(torch::Tensor& src, torch::Tensor& dst, void swap_blocks_batch(const torch::Tensor& src_ptrs, const torch::Tensor& dst_ptrs, - const torch::Tensor& sizes) { + const torch::Tensor& sizes, + bool is_src_access_order_any) { TORCH_CHECK(src_ptrs.device().is_cpu(), "src_ptrs must be on CPU"); TORCH_CHECK(dst_ptrs.device().is_cpu(), "dst_ptrs must be on CPU"); TORCH_CHECK(sizes.device().is_cpu(), "sizes must be on CPU"); @@ -124,7 +125,12 @@ void swap_blocks_batch(const torch::Tensor& src_ptrs, if (batch_fn != nullptr) { CUmemcpyAttributes attr = {}; - attr.srcAccessOrder = CU_MEMCPY_SRC_ACCESS_ORDER_STREAM; + // ANY lets the DMA engine prefetch source bytes out of stream order, + // which is only safe when no GPU stream is concurrently writing the + // source. + attr.srcAccessOrder = is_src_access_order_any + ? CU_MEMCPY_SRC_ACCESS_ORDER_ANY + : CU_MEMCPY_SRC_ACCESS_ORDER_STREAM; size_t attrs_idx = 0; size_t fail_idx = 0; CUresult result = batch_fn(reinterpret_cast(dst_data), diff --git a/csrc/cache_kernels_fused.cu b/csrc/cache_kernels_fused.cu index be037b2fdec2..8687ebe1f14c 100644 --- a/csrc/cache_kernels_fused.cu +++ b/csrc/cache_kernels_fused.cu @@ -21,28 +21,33 @@ namespace vllm { // NOTE Be EXTRA careful with raw_kv_scalar_t, for __half and __nv_bfloat16 it's // using u16 as the backing type. -template +template __global__ void concat_and_cache_mla_rope_fused_kernel( const int64_t* __restrict__ positions, // [num_tokens] qk_t* __restrict__ q_pe, // [num_tokens, num_q_heads, rot_dim] qk_t* __restrict__ k_pe, // [num_tokens, rot_dim] const qk_t* __restrict__ kv_c, // [num_tokens, kv_lora_rank] - const qk_t* __restrict__ rope_cos_sin_cache, // [max_position, 2, - // rot_dim // 2] + const cos_sin_t* __restrict__ rope_cos_sin_cache, // [max_position, 2, + // rot_dim // 2] const int rot_dim, const int64_t q_pe_stride_token, const int64_t q_pe_stride_head, const int64_t k_pe_stride, const int64_t kv_c_stride, const int num_q_heads, cache_t* __restrict__ kv_cache, // [num_blocks, block_size, (kv_lora_rank + // rot_dim)] - const int64_t* __restrict__ kv_cache_slot_mapping, // [num_tokens] + const int64_t* __restrict__ slot_mapping, // [num_tokens] const int block_stride, const int entry_stride, const int kv_lora_rank, const int block_size, const float* kv_cache_quant_scale) { // Each thread block is responsible for one token. const int64_t token_idx = blockIdx.x; + const int64_t slot_idx = slot_mapping[token_idx]; + // NOTE: slot_idx can be -1 if the token is padded + if (slot_idx < 0) { + return; + } const int64_t pos = positions[token_idx]; - const qk_t* cos_sin_ptr = rope_cos_sin_cache + pos * rot_dim; + const cos_sin_t* cos_sin_ptr = rope_cos_sin_cache + pos * rot_dim; const int embed_dim = rot_dim / 2; @@ -54,8 +59,8 @@ __global__ void concat_and_cache_mla_rope_fused_kernel( // NOTE: Would be nice to have interleaved sin/cos so we could just load // both at the same time. - qk_t cos = VLLM_LDG(cos_sin_ptr + pair_idx); - qk_t sin = VLLM_LDG(cos_sin_ptr + pair_idx + embed_dim); + qk_t cos = static_cast(VLLM_LDG(cos_sin_ptr + pair_idx)); + qk_t sin = static_cast(VLLM_LDG(cos_sin_ptr + pair_idx + embed_dim)); qk_t* q_pe_head_ptr = q_pe + token_idx * q_pe_stride_token + head_idx * q_pe_stride_head; @@ -81,21 +86,15 @@ __global__ void concat_and_cache_mla_rope_fused_kernel( q_pe_head_ptr[pair_idx_y] = y_dst; } - const int64_t slot_idx = kv_cache_slot_mapping[token_idx]; const int64_t block_idx = slot_idx / block_size; const int64_t entry_idx = slot_idx % block_size; - // NOTE: slot_idx can be -1 if the token is padded - if (slot_idx < 0) { - return; - } - // K with 1 HEAD for (int i = threadIdx.x; i < embed_dim; i += blockDim.x) { int pair_idx = i; - qk_t cos = VLLM_LDG(cos_sin_ptr + pair_idx); - qk_t sin = VLLM_LDG(cos_sin_ptr + pair_idx + embed_dim); + qk_t cos = static_cast(VLLM_LDG(cos_sin_ptr + pair_idx)); + qk_t sin = static_cast(VLLM_LDG(cos_sin_ptr + pair_idx + embed_dim)); qk_t* k_pe_head_ptr = k_pe + token_idx * k_pe_stride; @@ -165,36 +164,43 @@ __global__ void concat_and_cache_mla_rope_fused_kernel( } // namespace vllm -#define CALL_CONCAT_AND_CACHE_MLA_ROPE_FUSED(RAW_KV_T, CACHE_T, KV_DTYPE) \ - do { \ - VLLM_DISPATCH_FLOATING_TYPES(q_pe.scalar_type(), "qk_scalar_type", [&] { \ - using qk_t = scalar_t; \ - if (rope_is_neox) { \ - vllm::concat_and_cache_mla_rope_fused_kernel \ - <<>>( \ - positions.data_ptr(), q_pe.data_ptr(), \ - k_pe.data_ptr(), kv_c.data_ptr(), \ - rope_cos_sin_cache.data_ptr(), rot_dim, \ - q_pe_stride_token, q_pe_stride_head, k_pe_stride, kv_c_stride, \ - num_q_heads, reinterpret_cast(kv_cache.data_ptr()), \ - kv_cache_slot_mapping.data_ptr(), block_stride, \ - entry_stride, kv_lora_rank, block_size, \ - kv_cache_quant_scale.data_ptr()); \ - } else { \ - vllm::concat_and_cache_mla_rope_fused_kernel \ - <<>>( \ - positions.data_ptr(), q_pe.data_ptr(), \ - k_pe.data_ptr(), kv_c.data_ptr(), \ - rope_cos_sin_cache.data_ptr(), rot_dim, \ - q_pe_stride_token, q_pe_stride_head, k_pe_stride, kv_c_stride, \ - num_q_heads, reinterpret_cast(kv_cache.data_ptr()), \ - kv_cache_slot_mapping.data_ptr(), block_stride, \ - entry_stride, kv_lora_rank, block_size, \ - kv_cache_quant_scale.data_ptr()); \ - } \ - }); \ +#define CALL_CONCAT_AND_CACHE_MLA_ROPE_FUSED(RAW_KV_T, CACHE_T, KV_DTYPE) \ + do { \ + VLLM_DISPATCH_FLOATING_TYPES(q_pe.scalar_type(), "qk_scalar_type", [&] { \ + using qk_t = scalar_t; \ + VLLM_DISPATCH_FLOATING_TYPES( \ + rope_cos_sin_cache.scalar_type(), "rope_cos_sin_cache_scalar_type", \ + [&] { \ + using cos_sin_t = scalar_t; \ + if (rope_is_neox) { \ + vllm::concat_and_cache_mla_rope_fused_kernel< \ + qk_t, cos_sin_t, true, RAW_KV_T, CACHE_T, KV_DTYPE> \ + <<>>( \ + positions.data_ptr(), q_pe.data_ptr(), \ + k_pe.data_ptr(), kv_c.data_ptr(), \ + rope_cos_sin_cache.data_ptr(), rot_dim, \ + q_pe_stride_token, q_pe_stride_head, k_pe_stride, \ + kv_c_stride, num_q_heads, \ + reinterpret_cast(kv_cache.data_ptr()), \ + slot_mapping.data_ptr(), block_stride, \ + entry_stride, kv_lora_rank, block_size, \ + kv_cache_quant_scale.data_ptr()); \ + } else { \ + vllm::concat_and_cache_mla_rope_fused_kernel< \ + qk_t, cos_sin_t, false, RAW_KV_T, CACHE_T, KV_DTYPE> \ + <<>>( \ + positions.data_ptr(), q_pe.data_ptr(), \ + k_pe.data_ptr(), kv_c.data_ptr(), \ + rope_cos_sin_cache.data_ptr(), rot_dim, \ + q_pe_stride_token, q_pe_stride_head, k_pe_stride, \ + kv_c_stride, num_q_heads, \ + reinterpret_cast(kv_cache.data_ptr()), \ + slot_mapping.data_ptr(), block_stride, \ + entry_stride, kv_lora_rank, block_size, \ + kv_cache_quant_scale.data_ptr()); \ + } \ + }); \ + }); \ } while (false) // Executes RoPE on q_pe and k_pe, then writes k_pe and kv_c in the kv cache. @@ -208,43 +214,52 @@ void concat_and_cache_mla_rope_fused( torch::Tensor& kv_c, // [num_tokens, kv_lora_rank] torch::Tensor& rope_cos_sin_cache, // [max_position, rot_dim] bool rope_is_neox, - torch::Tensor& - kv_cache_slot_mapping, // [num_tokens] or [num_actual_tokens] + torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens] torch::Tensor& kv_cache, // [num_blocks, block_size, (kv_lora_rank + rot_dim)] const std::string& kv_cache_dtype, torch::Tensor& kv_cache_quant_scale) { - const int64_t num_tokens = q_pe.size(0); + // NOTE(woosuk): In vLLM V1, query/key/position.size(0) can be different from + // slot_mapping.size(0) because of padding for CUDA graphs. + // In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because + // both include padding. + // In vLLM V1, however, key.size(0) can be larger than slot_mapping.size(0) + // since key includes padding for CUDA graphs, while slot_mapping does not. + // In this case, slot_mapping.size(0) represents the actual number of tokens + // before padding. + // For compatibility with both cases, we use slot_mapping.size(0) as the + // number of tokens. + int num_tokens = slot_mapping.size(0); + int num_padded_tokens = q_pe.size(0); + TORCH_CHECK_GE(num_padded_tokens, num_tokens); const int num_q_heads = q_pe.size(1); const int rot_dim = q_pe.size(2); const int kv_lora_rank = kv_c.size(1); - TORCH_CHECK(positions.size(0) >= - num_tokens); // CUDA Graphs might pad this for us + TORCH_CHECK_EQ(positions.size(0), num_padded_tokens); TORCH_CHECK_EQ(positions.dim(), 1); TORCH_CHECK_EQ(positions.scalar_type(), c10::ScalarType::Long); - TORCH_CHECK_EQ(q_pe.size(0), num_tokens); + TORCH_CHECK_EQ(q_pe.dim(), 3); + TORCH_CHECK_EQ(q_pe.size(0), num_padded_tokens); TORCH_CHECK_EQ(q_pe.size(1), num_q_heads); TORCH_CHECK_EQ(q_pe.size(2), rot_dim); - TORCH_CHECK_EQ(q_pe.dim(), 3); - TORCH_CHECK_EQ(k_pe.size(0), num_tokens); - TORCH_CHECK_EQ(k_pe.size(1), rot_dim); TORCH_CHECK_EQ(k_pe.dim(), 2); + TORCH_CHECK_EQ(k_pe.size(0), num_padded_tokens); + TORCH_CHECK_EQ(k_pe.size(1), rot_dim); TORCH_CHECK_EQ(k_pe.scalar_type(), q_pe.scalar_type()); - TORCH_CHECK_EQ(kv_c.size(0), num_tokens); - TORCH_CHECK_EQ(kv_c.size(1), kv_lora_rank); TORCH_CHECK_EQ(kv_c.dim(), 2); + TORCH_CHECK_EQ(kv_c.size(0), num_padded_tokens); + TORCH_CHECK_EQ(kv_c.size(1), kv_lora_rank); TORCH_CHECK_EQ(kv_c.scalar_type(), q_pe.scalar_type()); TORCH_CHECK_EQ(kv_c.dtype(), q_pe.dtype()); TORCH_CHECK_EQ(rope_cos_sin_cache.size(1), rot_dim); - TORCH_CHECK_EQ(rope_cos_sin_cache.scalar_type(), q_pe.scalar_type()); - TORCH_CHECK_EQ(kv_cache_slot_mapping.size(0), num_tokens); - TORCH_CHECK_EQ(kv_cache_slot_mapping.scalar_type(), c10::ScalarType::Long); + TORCH_CHECK_EQ(slot_mapping.size(0), num_tokens); + TORCH_CHECK_EQ(slot_mapping.scalar_type(), c10::ScalarType::Long); TORCH_CHECK_EQ(kv_cache.size(2), kv_lora_rank + rot_dim); TORCH_CHECK_EQ(kv_cache.dim(), 3); diff --git a/csrc/core/batch_invariant.hpp b/csrc/core/batch_invariant.hpp index fffe96b86857..8273bc74b1ef 100644 --- a/csrc/core/batch_invariant.hpp +++ b/csrc/core/batch_invariant.hpp @@ -1,7 +1,6 @@ #pragma once #include #include -#include namespace vllm { diff --git a/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu b/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu index e910103c4eae..84040a6a2218 100644 --- a/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu +++ b/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu @@ -1,5 +1,6 @@ #include "scaled_mm_kernels.hpp" #include "scaled_mm_sm100_fp8_dispatch.cuh" +#include "core/batch_invariant.hpp" namespace vllm { @@ -13,9 +14,17 @@ void cutlass_scaled_mm_sm100_fp8( STD_TORCH_CHECK(bias->scalar_type() == out.scalar_type(), "currently bias dtype must match output dtype ", out.scalar_type()); + if (vllm_is_batch_invariant()) { + return cutlass_scaled_mm_sm100_fp8_batch_invariant_epilogue( + out, a, b, a_scales, b_scales, *bias); + } return cutlass_scaled_mm_sm100_fp8_epilogue(out, a, b, a_scales, b_scales, *bias); } else { + if (vllm_is_batch_invariant()) { + return cutlass_scaled_mm_sm100_fp8_batch_invariant_epilogue( + out, a, b, a_scales, b_scales); + } return cutlass_scaled_mm_sm100_fp8_epilogue(out, a, b, a_scales, b_scales); } diff --git a/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8_dispatch.cuh b/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8_dispatch.cuh index 5cd55f0198c2..f790b3653d57 100644 --- a/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8_dispatch.cuh +++ b/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8_dispatch.cuh @@ -294,6 +294,34 @@ inline void cutlass_gemm_sm100_fp8_dispatch( } } +template +inline void cutlass_gemm_sm100_fp8_batch_invariant_dispatch( + torch::stable::Tensor& out, torch::stable::Tensor const& a, + torch::stable::Tensor const& b, torch::stable::Tensor const& a_scales, + torch::stable::Tensor const& b_scales, EpilogueArgs&&... args) { + static_assert(std::is_same()); + STD_TORCH_CHECK(a.scalar_type() == + torch::headeronly::ScalarType::Float8_e4m3fn); + STD_TORCH_CHECK(b.scalar_type() == + torch::headeronly::ScalarType::Float8_e4m3fn); + + using Cutlass3xGemmM64SwapAB = + typename sm100_fp8_config_M64_swap_ab::Cutlass3xGemm; + using Cutlass3xGemmM64 = + typename sm100_fp8_config_M64::Cutlass3xGemm; + + // keep the CUTLASS config independent of M for batch invariance + uint32_t const k = a.size(1); + if (k < 4096) { + return cutlass_gemm_caller_sm100_fp8( + out, a, b, a_scales, b_scales, std::forward(args)...); + } + return cutlass_gemm_caller_sm100_fp8( + out, a, b, b_scales, a_scales, std::forward(args)...); +} + template void cutlass_scaled_mm_sm100_fp8_epilogue(torch::stable::Tensor& out, torch::stable::Tensor const& a, @@ -320,4 +348,28 @@ void cutlass_scaled_mm_sm100_fp8_epilogue(torch::stable::Tensor& out, } } +template +void cutlass_scaled_mm_sm100_fp8_batch_invariant_epilogue( + torch::stable::Tensor& out, torch::stable::Tensor const& a, + torch::stable::Tensor const& b, torch::stable::Tensor const& a_scales, + torch::stable::Tensor const& b_scales, EpilogueArgs&&... epilogue_args) { + STD_TORCH_CHECK(a.scalar_type() == + torch::headeronly::ScalarType::Float8_e4m3fn); + STD_TORCH_CHECK(b.scalar_type() == + torch::headeronly::ScalarType::Float8_e4m3fn); + + if (out.scalar_type() == torch::headeronly::ScalarType::BFloat16) { + return cutlass_gemm_sm100_fp8_batch_invariant_dispatch< + cutlass::float_e4m3_t, cutlass::bfloat16_t, EnableBias>( + out, a, b, a_scales, b_scales, + std::forward(epilogue_args)...); + } else { + STD_TORCH_CHECK(out.scalar_type() == torch::headeronly::ScalarType::Half); + return cutlass_gemm_sm100_fp8_batch_invariant_dispatch< + cutlass::float_e4m3_t, cutlass::half_t, EnableBias>( + out, a, b, a_scales, b_scales, + std::forward(epilogue_args)...); + } +} + } // namespace vllm diff --git a/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu b/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu index fb84faa2a41a..972d6c626062 100644 --- a/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu +++ b/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu @@ -1,5 +1,6 @@ #include "scaled_mm_kernels.hpp" #include "scaled_mm_sm120_fp8_dispatch.cuh" +#include "core/batch_invariant.hpp" #include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp" namespace vllm { @@ -14,9 +15,17 @@ void cutlass_scaled_mm_sm120_fp8( STD_TORCH_CHECK(bias->scalar_type() == out.scalar_type(), "currently bias dtype must match output dtype ", out.scalar_type()); + if (vllm_is_batch_invariant()) { + return cutlass_scaled_mm_sm120_fp8_batch_invariant_epilogue< + c3x::ScaledEpilogueBias>(out, a, b, a_scales, b_scales, *bias); + } return cutlass_scaled_mm_sm120_fp8_epilogue( out, a, b, a_scales, b_scales, *bias); } else { + if (vllm_is_batch_invariant()) { + return cutlass_scaled_mm_sm120_fp8_batch_invariant_epilogue< + c3x::ScaledEpilogue>(out, a, b, a_scales, b_scales); + } return cutlass_scaled_mm_sm120_fp8_epilogue( out, a, b, a_scales, b_scales); } diff --git a/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8_dispatch.cuh b/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8_dispatch.cuh index 245f5c10fcad..226e4f7a6bdb 100644 --- a/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8_dispatch.cuh +++ b/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8_dispatch.cuh @@ -179,6 +179,26 @@ inline void cutlass_gemm_sm120_fp8_dispatch(torch::stable::Tensor& out, out, a, b, std::forward(args)...); } +template typename Epilogue, + typename... EpilogueArgs> +inline void cutlass_gemm_sm120_fp8_batch_invariant_dispatch( + torch::stable::Tensor& out, torch::stable::Tensor const& a, + torch::stable::Tensor const& b, EpilogueArgs&&... args) { + static_assert(std::is_same()); + STD_TORCH_CHECK(a.scalar_type() == + torch::headeronly::ScalarType::Float8_e4m3fn); + STD_TORCH_CHECK(b.scalar_type() == + torch::headeronly::ScalarType::Float8_e4m3fn); + + using Cutlass3xGemmM64 = + typename sm120_fp8_config_M64::Cutlass3xGemm; + + // keep the CUTLASS config independent of M for batch invariance + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); +} + template