diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml new file mode 100644 index 0000000000..5e31008138 --- /dev/null +++ b/.github/actionlint.yaml @@ -0,0 +1,3 @@ +self-hosted-runner: + labels: + - ucb-vllm-cicd-g2 \ No newline at end of file diff --git a/.github/workflows/matchers/actionlint.json b/.github/workflows/matchers/actionlint.json new file mode 100644 index 0000000000..4613e1617b --- /dev/null +++ b/.github/workflows/matchers/actionlint.json @@ -0,0 +1,17 @@ +{ + "problemMatcher": [ + { + "owner": "actionlint", + "pattern": [ + { + "regexp": "^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: (?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$", + "file": 1, + "line": 2, + "column": 3, + "message": 4, + "code": 5 + } + ] + } + ] +} diff --git a/.github/workflows/matchers/mypy.json b/.github/workflows/matchers/mypy.json new file mode 100644 index 0000000000..f048fce528 --- /dev/null +++ b/.github/workflows/matchers/mypy.json @@ -0,0 +1,16 @@ +{ + "problemMatcher": [ + { + "owner": "mypy", + "pattern": [ + { + "regexp": "^(.+):(\\d+):\\s(error|warning):\\s(.+)$", + "file": 1, + "line": 2, + "severity": 3, + "message": 4 + } + ] + } + ] +} diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 0000000000..6ab63a4027 --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,20 @@ +name: pre-commit + +on: + pull_request: + push: + branches: [main] + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 + with: + python-version: "3.12" + - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json" + - run: echo "::add-matcher::.github/workflows/matchers/mypy.json" + - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 + 
with: + extra_args: --all-files --hook-stage manual diff --git a/.gitignore b/.gitignore index 4a9f5518b4..0f414a587f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,6 @@ # version file generated by setuptools-scm /vllm_hpu/_version.py - # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8e13940353..9224999e5e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -47,13 +47,6 @@ repos: types: [python] additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic] stages: [manual] # Don't run in CI - - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward - name: Run mypy for Python 3.9 - entry: tools/mypy.sh 1 "3.9" - language: python - types: [python] - additional_dependencies: *mypy_deps - stages: [manual] # Only run in CI - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward name: Run mypy for Python 3.10 entry: tools/mypy.sh 1 "3.10" diff --git a/README.md b/README.md index 49716e7bdf..55e0b8e397 100644 --- a/README.md +++ b/README.md @@ -9,8 +9,6 @@ Intel-Gaudi

- - vLLM Gaudi plugin (vllm-gaudi) integrates Intel Gaudi accelerators with vLLM to optimize large language model inference. This plugin follows the [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162) and [[RFC]: Enhancing vLLM Plugin Architecture](https://github.com/vllm-project/vllm/issues/19161) principles, providing a modular interface for Intel Gaudi hardware. diff --git a/docs/README.md b/docs/README.md index ff93fbc031..c0b9069a57 100644 --- a/docs/README.md +++ b/docs/README.md @@ -22,4 +22,4 @@ This plugin follows the [[RFC]: Hardware pluggable](https://github.com/vllm-proj Learn more: 📚 [Intel Gaudi Documentation](https://docs.habana.ai/en/v1.21.1/index.html) -🚀 [vLLM Plugin System Overview](https://docs.vllm.ai/en/latest/design/plugin_system.html) \ No newline at end of file +🚀 [vLLM Plugin System Overview](https://docs.vllm.ai/en/latest/design/plugin_system.html) diff --git a/docs/api/README.md b/docs/api/README.md index 6115cf2d38..973b8233d6 100644 --- a/docs/api/README.md +++ b/docs/api/README.md @@ -1,5 +1,5 @@ # Summary [](){ #pkg_overview } -### Full package overview +## Full package overview ::: vllm_hpu diff --git a/docs/configuration/README.md b/docs/configuration/README.md index f8f6749f79..5f187c8c98 100644 --- a/docs/configuration/README.md +++ b/docs/configuration/README.md @@ -1,3 +1,3 @@ # Configuration Options -WIP \ No newline at end of file +WIP diff --git a/docs/configuration/long_context.md b/docs/configuration/long_context.md index cc307b6548..0ec99f4606 100644 --- a/docs/configuration/long_context.md +++ b/docs/configuration/long_context.md @@ -55,4 +55,4 @@ Sequence group cmpl-3cbf19b0c6d74b3f90b5d5db2ed2385e-0 is preempted by Preemptio ## Multi-Step Scheduling Feature Usage -Enabling Multi-Step Scheduling is recommended for better decode performance. Refer to vllm-project#6854 for more details. \ No newline at end of file +Enabling Multi-Step Scheduling is recommended for better decode performance. 
Refer to vllm-project#6854 for more details. diff --git a/docs/configuration/multi_node.md b/docs/configuration/multi_node.md index 06692cea65..eab902e21e 100644 --- a/docs/configuration/multi_node.md +++ b/docs/configuration/multi_node.md @@ -61,4 +61,4 @@ Please refer to this [collection](https://github.com/HabanaAI/Gaudi-tutorials/tr - llama-3.1-8b-instruct_gaudi3_1.20_contextlen-2k - llama-3.1-8b-instruct_gaudi3_1.20_contextlen-4k - llama-3.3-70b-instruct_gaudi3_1.20_contextlen-2k -- llama-3.3-70b-instruct_gaudi3_1.20_contextlen-4k \ No newline at end of file +- llama-3.3-70b-instruct_gaudi3_1.20_contextlen-4k diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index edf204480b..b6da15a893 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -1,3 +1,3 @@ # Optimization and Tuning -WIP \ No newline at end of file +WIP diff --git a/docs/design/plugin_system.md b/docs/design/plugin_system.md index 312767bb95..0db0db58d6 100644 --- a/docs/design/plugin_system.md +++ b/docs/design/plugin_system.md @@ -3,4 +3,4 @@ title: vLLM's Plugin System --- [](){ #plugin-system } -WIP \ No newline at end of file +WIP diff --git a/docs/dev_guide/README.md b/docs/dev_guide/README.md index 61522d97c7..3f34007e2e 100644 --- a/docs/dev_guide/README.md +++ b/docs/dev_guide/README.md @@ -1,3 +1,3 @@ # Developer Guide -WIP \ No newline at end of file +WIP diff --git a/docs/dev_guide/ci-failures.md b/docs/dev_guide/ci-failures.md index 3b24621834..96f897c5fb 100644 --- a/docs/dev_guide/ci-failures.md +++ b/docs/dev_guide/ci-failures.md @@ -1,3 +1,3 @@ # CI Failures -WIP \ No newline at end of file +WIP diff --git a/docs/dev_guide/profiling.md b/docs/dev_guide/profiling.md index 406990eeec..f8795463ba 100644 --- a/docs/dev_guide/profiling.md +++ b/docs/dev_guide/profiling.md @@ -1,3 +1,3 @@ # Profiling vLLM -WIP \ No newline at end of file +WIP diff --git a/docs/features/bucketing_mechanism.md 
b/docs/features/bucketing_mechanism.md index 5659d7acc7..88d1f7d82a 100644 --- a/docs/features/bucketing_mechanism.md +++ b/docs/features/bucketing_mechanism.md @@ -156,4 +156,4 @@ INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) u INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] INFO 08-02 17:38:43 hpu_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used) -``` \ No newline at end of file +``` diff --git a/docs/features/compatibility_matrix.md b/docs/features/compatibility_matrix.md index 6e9226815f..7d68a744f9 100644 --- a/docs/features/compatibility_matrix.md +++ b/docs/features/compatibility_matrix.md @@ -3,4 +3,4 @@ title: Compatibility Matrix --- [](){ #compatibility-matrix } -WIP \ No newline at end of file +WIP diff --git a/docs/features/quantization/inc.md b/docs/features/quantization/inc.md index dba5b612d0..d97a462f54 100644 --- a/docs/features/quantization/inc.md +++ b/docs/features/quantization/inc.md @@ -53,4 +53,4 @@ llm.llm_engine.model_executor.shutdown() ## Device for the Model's Weights Uploading The unquantized weights are first loaded onto the CPU, then quantized and transferred to the target device (HPU) for model execution. 
-This reduces the device memory footprint of model weights, as only quantized weights are stored in the device memory. \ No newline at end of file +This reduces the device memory footprint of model weights, as only quantized weights are stored in the device memory. diff --git a/docs/getting_started/installation.md b/docs/getting_started/installation.md index a6b83dd86e..8125c5365a 100644 --- a/docs/getting_started/installation.md +++ b/docs/getting_started/installation.md @@ -15,7 +15,6 @@ This guide provides instructions on running vLLM with Intel Gaudi devices. To achieve the best performance on HPU, please follow the methods outlined in the [Optimizing Training Platform Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). - ## Quick Start Using Dockerfile # --8<-- [start:docker_quickstart] Set up the container with the latest Intel Gaudi Software Suite release using the Dockerfile. @@ -46,12 +45,10 @@ Set up the container with the latest Intel Gaudi Software Suite release using th ### Environment Verification To verify that the Intel Gaudi software was correctly installed, run the following: -```{.console} -$ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible -$ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed -$ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed -$ pip list | grep neural # verify that neural-compressor is installed -``` + $ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible + $ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed + $ pip list | grep habana # verify that 
habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed + $ pip list | grep neural # verify that neural-compressor is installed Refer to [System Verification and Final Tests](https://docs.habana.ai/en/latest/Installation_Guide/System_Verification_and_Final_Tests.html) for more details. @@ -62,10 +59,8 @@ Refer to the [Intel Gaudi documentation](https://docs.habana.ai/en/latest/Instal Use the following commands to run a Docker image. Make sure to update the versions below as listed in the [Support Matrix](https://docs.habana.ai/en/latest/Support_Matrix/Support_Matrix.html): -```{.console} -docker pull vault.habana.ai/gaudi-docker/1.21.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest -docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.21.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest -``` + docker pull vault.habana.ai/gaudi-docker/1.21.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest + docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.21.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest ### Build and Install vLLM @@ -119,4 +114,3 @@ Currently, multiple ways are provided which can be used to install vLLM with Int cd vllm-hpu pip install -e . 
``` - diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md index 0354d1d375..feaf0409ab 100644 --- a/docs/getting_started/quickstart.md +++ b/docs/getting_started/quickstart.md @@ -19,7 +19,6 @@ This guide will help you quickly get started with vLLM to perform: To achieve the best performance on HPU, please follow the methods outlined in the [Optimizing Training Platform Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). - ## Quick Start Using Dockerfile --8<-- "docs/getting_started/installation.md:docker_quickstart" @@ -54,4 +53,4 @@ This guide will help you quickly get started with vLLM to perform: === "OpenAI Chat Completions API with vLLM" - WIP \ No newline at end of file + WIP diff --git a/docs/user_guide/README.md b/docs/user_guide/README.md index d3174ec0bb..72d69292c2 100644 --- a/docs/user_guide/README.md +++ b/docs/user_guide/README.md @@ -1,3 +1,3 @@ # Using vLLM x Intel Gaudi -WIP \ No newline at end of file +WIP diff --git a/docs/user_guide/faq.md b/docs/user_guide/faq.md index e4ad4322dd..ced0f390f1 100644 --- a/docs/user_guide/faq.md +++ b/docs/user_guide/faq.md @@ -3,4 +3,4 @@ title: Frequently Asked Questions --- [](){ #faq } -WIP \ No newline at end of file +WIP diff --git a/docs/user_guide/metrics.md b/docs/user_guide/metrics.md index 318558e455..44f5e49f7c 100644 --- a/docs/user_guide/metrics.md +++ b/docs/user_guide/metrics.md @@ -1,3 +1,3 @@ # Metrics -WIP \ No newline at end of file +WIP diff --git a/docs/user_guide/troubleshooting.md b/docs/user_guide/troubleshooting.md index 1b0190e4b9..b07b113b8e 100644 --- a/docs/user_guide/troubleshooting.md +++ b/docs/user_guide/troubleshooting.md @@ -3,4 +3,4 @@ title: Troubleshooting --- [](){ #troubleshooting } -WIP \ No newline at end of file +WIP diff --git a/docs/user_guide/v1_guide.md b/docs/user_guide/v1_guide.md index a0ccd86bc0..407f1206ee 100644 --- a/docs/user_guide/v1_guide.md +++ 
b/docs/user_guide/v1_guide.md @@ -1,3 +1,3 @@ # vLLM V1 Support -WIP \ No newline at end of file +WIP diff --git a/pyproject.toml b/pyproject.toml index 00a2a05bdf..130a3c1ae8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,6 @@ license = "Apache-2.0" readme = "README.md" description = "HPU plugin package for vLLM." classifiers = [ - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", @@ -42,11 +41,13 @@ include = ["vllm_hpu"] [tool.yapfignore] ignore_patterns = [ - "build/**", + "build/**", + "vllm_hpu/extension/**" # NOTE(kzawora): re-enable this once extension refactor is ready ] [tool.ruff] # Allow lines to be as long as 80. +extend-exclude = ["vllm_hpu/extension/**"] # NOTE(kzawora): re-enable this once extension refactor is ready line-length = 80 [tool.ruff.lint] @@ -79,10 +80,18 @@ ignore = [ ] [tool.mypy] +plugins = ['pydantic.mypy'] ignore_missing_imports = true +explicit_package_bases = true check_untyped_defs = true follow_imports = "silent" +# After fixing type errors resulting from follow_imports: "skip" -> "silent", +# move the directory here and remove it from tools/mypy.sh +files = [ + "vllm_hpu/*.py", +] + [tool.codespell] ignore-words-list = "dout, te, indicies, subtile, ElementE" diff --git a/setup.py b/setup.py index ea056549e2..1bca1a9fc3 100644 --- a/setup.py +++ b/setup.py @@ -1,10 +1,5 @@ -import importlib.util import logging import os -import subprocess -import sys -from sysconfig import get_paths -from typing import Dict, List from setuptools import setup, find_packages from setuptools_scm import get_version @@ -20,13 +15,15 @@ logger = logging.getLogger(__name__) ext_modules = [] + def get_path(*filepath) -> str: return os.path.join(ROOT_DIR, *filepath) -def get_requirements() -> List[str]: + +def get_requirements() -> list[str]: """Get Python package dependencies from requirements.txt.""" - def 
_read_requirements(filename: str) -> List[str]: + def _read_requirements(filename: str) -> list[str]: with open(get_path(filename)) as f: requirements = f.read().strip().split("\n") resolved_requirements = [] @@ -44,16 +41,17 @@ def _read_requirements(filename: str) -> List[str]: except ValueError: print("Failed to read requirements.txt in vllm_hpu.") return requirements - + + setup( name="vllm_hpu", version=VERSION, author="Intel", - long_description="HPU plugin package for vLLM.", + long_description="Intel Gaudi plugin package for vLLM.", long_description_content_type="text/markdown", - url="https://github.com/vllm-project/vllm-hpu", + url="https://github.com/vllm-project/vllm-gaudi", project_urls={ - "Homepage": "https://github.com/vllm-project/vllm-hpu", + "Homepage": "https://github.com/vllm-project/vllm-gaudi", }, classifiers=[ "Programming Language :: Python :: 3", @@ -68,4 +66,4 @@ def _read_requirements(filename: str) -> List[str]: "vllm.platform_plugins": ["hpu = vllm_hpu:register"], "vllm.general_plugins": ["hpu_custom_ops = vllm_hpu:register_ops"], }, -) \ No newline at end of file +) diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index d86e4d3842..6a1f07b1f2 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -120,7 +120,6 @@ def launch_simple(eval_config): generated_text = "" for output in outputs: generated_text += output.outputs[0].text - found_countries = [] european_countries = [ "Albania", "Andorra", "Armenia", "Austria", "Azerbaijan", "Belarus", @@ -134,7 +133,7 @@ def launch_simple(eval_config): "Spain", "Sweden", "Switzerland", "Turkey", "Ukraine", "United Kingdom", "Vatican City" ] - found_countries = [] + found_countries: list[str] = [] for country in european_countries: if country in generated_text: found_countries.append(country) diff --git a/tools/mypy.sh b/tools/mypy.sh index 9fc69dafcd..0f9c6a312d 
100755 --- a/tools/mypy.sh +++ b/tools/mypy.sh @@ -21,8 +21,9 @@ run_mypy() { } run_mypy # Note that this is less strict than CI +run_mypy tests run_mypy vllm_hpu/attention run_mypy vllm_hpu/distributed +#run_mypy vllm_hpu/extension # NOTE(kzawora): re-enable this once extension refactor is ready run_mypy vllm_hpu/ops -run_mypy vllm_hpu/worker run_mypy vllm_hpu/v1 diff --git a/vllm_hpu/__init__.py b/vllm_hpu/__init__.py index 700c632b9f..1fe7cdcf14 100644 --- a/vllm_hpu/__init__.py +++ b/vllm_hpu/__init__.py @@ -5,7 +5,8 @@ def register(): """Register the HPU platform.""" HpuPlatform.set_torch_compile() - if os.getenv("VLLM_WEIGHT_LOAD_FORCE_SYNC", "false").lower() in ("true", "1"): + if os.getenv("VLLM_WEIGHT_LOAD_FORCE_SYNC", + "false").lower() in ("true", "1"): HpuPlatform.set_synchronized_weight_loader() return "vllm_hpu.platform.HpuPlatform" diff --git a/vllm_hpu/attention/backends/hpu_attn.py b/vllm_hpu/attention/backends/hpu_attn.py index 0a11485338..f6cb36461a 100644 --- a/vllm_hpu/attention/backends/hpu_attn.py +++ b/vllm_hpu/attention/backends/hpu_attn.py @@ -279,6 +279,8 @@ def _forward_prefill( # type: ignore k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1) if not self.use_merged_prefill: + assert attn_metadata.seq_lens_tensor is not None, \ + "seq_lens_tensor must be provided for prefill attention" batch_size = attn_metadata.seq_lens_tensor.shape[0] else: batch_size = 1 @@ -492,7 +494,7 @@ def forward( attn_metadata: HPUAttentionMetadata, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: - """Forward pass with xFormers and PagedAttention. + """Forward pass with PagedAttention. 
Args: query: shape = [num_tokens, num_heads * head_size] @@ -522,6 +524,8 @@ def forward( else: batch_size = 1 else: + assert attn_metadata.block_mapping is not None, \ + "block_mapping must be provided for attention" batch_size = attn_metadata.block_mapping.shape[1] num_tokens, hidden_size = query.shape seq_len = num_tokens // batch_size diff --git a/vllm_hpu/attention/ops/hpu_paged_attn.py b/vllm_hpu/attention/ops/hpu_paged_attn.py index 6bc608022a..27775a3b4d 100644 --- a/vllm_hpu/attention/ops/hpu_paged_attn.py +++ b/vllm_hpu/attention/ops/hpu_paged_attn.py @@ -5,7 +5,7 @@ ############################################################################### from dataclasses import dataclass -from typing import List, Optional, Tuple +from typing import Optional import torch from vllm_hpu.extension import cache_ops, ops @@ -27,7 +27,7 @@ class HPUPagedAttentionMetadata: class HPUPagedAttention: @staticmethod - def get_supported_head_sizes() -> List[int]: + def get_supported_head_sizes() -> list[int]: return list(range(1, 257)) @staticmethod @@ -36,7 +36,7 @@ def get_kv_cache_shape( block_size: int, num_kv_heads: int, head_size: int, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: return (num_blocks * block_size, num_kv_heads, head_size) @staticmethod @@ -44,7 +44,7 @@ def split_kv_cache( kv_cache: torch.Tensor, num_kv_heads: int, head_size: int, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: key_cache = kv_cache[0] value_cache = kv_cache[1] return key_cache, value_cache @@ -66,8 +66,8 @@ def forward_decode(**kwargs) -> torch.Tensor: @staticmethod def swap_blocks( - src_kv_cache: Tuple[torch.Tensor, torch.Tensor], - dst_kv_cache: Tuple[torch.Tensor, torch.Tensor], + src_kv_cache: tuple[torch.Tensor, torch.Tensor], + dst_kv_cache: tuple[torch.Tensor, torch.Tensor], src_to_dsts: torch.Tensor, ) -> None: src_key_cache = src_kv_cache[0] @@ -80,7 +80,7 @@ def swap_blocks( @staticmethod def copy_blocks( - kv_caches: 
List[Tuple[torch.Tensor, torch.Tensor]], + kv_caches: list[tuple[torch.Tensor, torch.Tensor]], src_to_dsts: torch.Tensor, ) -> None: key_caches = [kv_cache[0] for kv_cache in kv_caches] diff --git a/vllm_hpu/distributed/device_communicators/hpu_communicator.py b/vllm_hpu/distributed/device_communicators/hpu_communicator.py index 34b706c5ef..6bdaa43b2a 100644 --- a/vllm_hpu/distributed/device_communicators/hpu_communicator.py +++ b/vllm_hpu/distributed/device_communicators/hpu_communicator.py @@ -3,9 +3,8 @@ import torch import torch.distributed as dist -from vllm.platforms import current_platform - -from vllm.distributed.device_communicators.base_device_communicator import DeviceCommunicatorBase +from vllm.distributed.device_communicators.base_device_communicator \ + import DeviceCommunicatorBase import habana_frameworks.torch as htorch # noqa: F401 @@ -41,4 +40,4 @@ def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor: (world_size * input_size[dim], ) + input_size[dim + 1:]) - return output_tensor \ No newline at end of file + return output_tensor diff --git a/vllm_hpu/ops/hpu_lora.py b/vllm_hpu/ops/hpu_lora.py index e106a117d2..9d254ff1c3 100644 --- a/vllm_hpu/ops/hpu_lora.py +++ b/vllm_hpu/ops/hpu_lora.py @@ -1,10 +1,11 @@ import torch import torch.nn.functional as F from vllm.model_executor.custom_op import CustomOp +from vllm.lora.layers import VocabParallelEmbeddingWithLoRA @CustomOp.register_oot(name='VocabParallelEmbeddingWithLoRA') -class HPUVocabParallelEmbeddingWithLoRA: +class HPUVocabParallelEmbeddingWithLoRA(VocabParallelEmbeddingWithLoRA): def forward_oot(self, x: torch.Tensor) -> torch.Tensor: # x need to reshaped into 2d as batch is there diff --git a/vllm_hpu/ops/hpu_rotary_embedding.py b/vllm_hpu/ops/hpu_rotary_embedding.py index cf64214155..14e426e7e4 100644 --- a/vllm_hpu/ops/hpu_rotary_embedding.py +++ b/vllm_hpu/ops/hpu_rotary_embedding.py @@ -669,7 +669,7 @@ def forward_oot( # type: ignore[override] key: torch.Tensor, ) 
-> tuple[torch.Tensor, torch.Tensor]: # Ensure the cache is on the right device. - self.cos_sin_cache = self.cos_sin_cache.to(query.device) + self.cos_sin_cache: torch.Tensor = self.cos_sin_cache.to(query.device) cos_cache, sin_cache = self.cos_sin_cache.chunk(2, dim=-1) # shape: [577, 1, 44] diff --git a/vllm_hpu/platform.py b/vllm_hpu/platform.py index b790f66897..f89ca3e16e 100644 --- a/vllm_hpu/platform.py +++ b/vllm_hpu/platform.py @@ -144,7 +144,7 @@ def set_torch_compile(cls) -> None: torch._dynamo.config.disable = True # NOTE multi-HPU inference with HPUGraphs (lazy-only) # requires enabling lazy collectives - # see https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html # noqa: E501 + # see https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html # noqa: E501 os.environ['PT_HPU_ENABLE_LAZY_COLLECTIVES'] = 'true' @classmethod @@ -156,12 +156,13 @@ def set_weight_attrs( ): """Set attributes on a weight tensor. - This method is used to set attributes on a weight tensor. This method - will not overwrite existing attributes. + This method is used to set attributes on a weight tensor. + This method will not overwrite existing attributes. Args: weight: The weight tensor. - weight_attrs: A dictionary of attributes to set on the weight tensor. + weight_attrs: A dictionary of attributes to set on the weight + tensor. """ if weight_attrs is None: return @@ -169,14 +170,18 @@ def set_weight_attrs( assert not hasattr(weight, key), ( f"Overwriting existing tensor attribute: {key}") - # NOTE(woosuk): During weight loading, we often do something like: + # NOTE(woosuk): During weight loading, we often do something + # like: # narrowed_tensor = param.data.narrow(0, offset, len) # narrowed_tensor.copy_(real_weight) - # expecting narrowed_tensor and param.data to share the same storage. 
- # However, on TPUs, narrowed_tensor will lazily propagate to the base - # tensor, which is param.data, leading to the redundant memory usage. - # This sometimes causes OOM errors during model loading. To avoid this, - # we sync the param tensor after its weight loader is called. + # expecting narrowed_tensor and param.data to share the same + # storage. + # However, on TPUs, narrowed_tensor will lazily propagate to + # the base tensor, which is param.data, leading to the + # redundant memory usage. + # This sometimes causes OOM errors during model loading. To + # avoid this, we sync the param tensor after its weight loader + # is called. # TODO(woosuk): Remove this hack once we have a better solution. # NOTE(ksmusz): Issue seen in HPU also, same hack applied. if key == "weight_loader": diff --git a/vllm_hpu/utils.py b/vllm_hpu/utils.py index 740db06484..5080d68143 100644 --- a/vllm_hpu/utils.py +++ b/vllm_hpu/utils.py @@ -1,9 +1,7 @@ from functools import cache import os from vllm.utils import make_tensor_with_pad, TORCH_DTYPE_TO_NUMPY_DTYPE -from typing import (TYPE_CHECKING, Any, Callable, Generic, Literal, NamedTuple, - Optional, Sequence, Tuple, Type, TypeVar, Union, cast, - overload) +from typing import (Optional, TypeVar, Union) import torch import numpy as np import numpy.typing as npt @@ -12,6 +10,7 @@ T = TypeVar("T") U = TypeVar("U") + @cache def is_fake_hpu() -> bool: return os.environ.get('VLLM_USE_FAKE_HPU', '0') != '0' @@ -109,4 +108,3 @@ def make_tensor_with_pad_align( tensor = tensor.pin_memory() return tensor - diff --git a/vllm_hpu/v1/worker/hpu_model_runner.py b/vllm_hpu/v1/worker/hpu_model_runner.py index fe66b1df7f..9d827c5455 100644 --- a/vllm_hpu/v1/worker/hpu_model_runner.py +++ b/vllm_hpu/v1/worker/hpu_model_runner.py @@ -814,7 +814,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> bool: self.input_batch.num_computed_tokens_cpu[req_index] = ( num_computed_tokens) self.input_batch.block_table.append_row(new_block_ids, 
req_index) - + # For the last rank, we don't need to update the token_ids_cpu # because the sampled tokens are already cached. if not is_last_rank: @@ -822,17 +822,21 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> bool: start_token_index = num_computed_tokens end_token_index = num_computed_tokens + len(new_token_ids) self.input_batch.token_ids_cpu[ - req_index, start_token_index:end_token_index] = new_token_ids - self.input_batch.num_tokens_no_spec[req_index] = end_token_index + req_index, + start_token_index:end_token_index] = new_token_ids + self.input_batch.num_tokens_no_spec[ + req_index] = end_token_index # Add spec_token_ids to token_ids_cpu. - spec_token_ids = scheduler_output.scheduled_spec_decode_tokens.get( - req_id, ()) + spec_token_ids = \ + scheduler_output.scheduled_spec_decode_tokens.get( + req_id, ()) if spec_token_ids: start_index = end_token_index end_token_index += len(spec_token_ids) self.input_batch.token_ids_cpu[ - req_index, start_index:end_token_index] = spec_token_ids - # NOTE(woosuk): `num_tokens` here may include spec decode tokens. + req_index, + start_index:end_token_index] = spec_token_ids + # NOTE(woosuk): `num_tokens` here may include spec decode tokens self.input_batch.num_tokens[req_index] = end_token_index # Check if the batch has changed. If not, we can skip copying the @@ -1669,7 +1673,8 @@ def execute_model( # NOTE(woosuk): As an exception, when using PP, the scheduler sends # the sampled tokens back, because there's no direct communication # between the first-stage worker and the last-stage worker. - for req_idx, sampled_ids in enumerate(postprocessed_sampled_token_ids[:num_reqs]): + for req_idx, sampled_ids in enumerate( + postprocessed_sampled_token_ids[:num_reqs]): if not sampled_ids: continue