vllm-project · kzawora-intel · Jul 11, 2025 · Jun 25, 2025 · Jun 26, 2025 · Jun 26, 2025
@@ -0,0 +1,3 @@
+self-hosted-runner:
+  labels:
+    - ucb-vllm-cicd-g2
@@ -0,0 +1,17 @@
+{
+  "problemMatcher": [
+    {
+      "owner": "actionlint",
+      "pattern": [
+        {
+          "regexp": "^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: (?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$",
+          "file": 1,
+          "line": 2,
+          "column": 3,
+          "message": 4,
+          "code": 5
+        }
+      ]
+    }
+  ]
+}
@@ -0,0 +1,16 @@
+{
+  "problemMatcher": [
+    {
+      "owner": "mypy",
+      "pattern": [
+        {
+          "regexp": "^(.+):(\\d+):\\s(error|warning):\\s(.+)$",
+          "file": 1,
+          "line": 2,
+          "severity": 3,
+          "message": 4
+        }
+      ]
+    }
+  ]
+}
@@ -0,0 +1,31 @@
+# Runs the repository's pre-commit hooks on every pull request and on pushes
+# to main.
+name: pre-commit
+
+on:
+  pull_request:
+  push:
+    branches: [main]
+
+# The job only reads the checked-out sources, so restrict the workflow token
+# to read-only repository contents instead of the repository default.
+permissions:
+  contents: read
+
+jobs:
+  pre-commit:
+    runs-on: ubuntu-latest
+    steps:
+    # Actions are pinned to a full commit SHA; the trailing comment records
+    # the release tag that the SHA corresponds to.
+    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+    - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
+      with:
+        python-version: "3.12"
+    # Register the problem matchers so actionlint and mypy findings printed by
+    # the hooks are picked up as annotations.
+    - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
+    - run: echo "::add-matcher::.github/workflows/matchers/mypy.json"
+    - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
+      with:
+        # Selects the "manual" stage, so hooks declared with
+        # `stages: [manual]` are run here as well.
+        extra_args: --all-files --hook-stage manual
@@ -1,7 +1,6 @@
 # version file generated by setuptools-scm
 /vllm_hpu/_version.py
 
-
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

@@ -47,13 +47,6 @@ repos:
     types: [python]
     additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
     stages: [manual] # Don't run in CI
-  - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
-    name: Run mypy for Python 3.9
-    entry: tools/mypy.sh 1 "3.9"
-    language: python
-    types: [python]
-    additional_dependencies: *mypy_deps
-    stages: [manual] # Only run in CI
   - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
     name: Run mypy for Python 3.10
     entry: tools/mypy.sh 1 "3.10"

@@ -9,8 +9,6 @@
   <img src="./docs/assets/logos/gaudi-logo.png" alt="Intel-Gaudi" width="30%">
 </p>
 
-
-
 vLLM Gaudi plugin (vllm-gaudi) integrates Intel Gaudi accelerators with vLLM to optimize large language model inference.
 
 This plugin follows the [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162) and [[RFC]: Enhancing vLLM Plugin Architecture](https://github.com/vllm-project/vllm/issues/19161) principles, providing a modular interface for Intel Gaudi hardware.

@@ -22,4 +22,4 @@ This plugin follows the [[RFC]: Hardware pluggable](https://github.com/vllm-proj
 Learn more:
 
 📚 [Intel Gaudi Documentation](https://docs.habana.ai/en/v1.21.1/index.html)  
-🚀 [vLLM Plugin System Overview](https://docs.vllm.ai/en/latest/design/plugin_system.html)
+🚀 [vLLM Plugin System Overview](https://docs.vllm.ai/en/latest/design/plugin_system.html)
@@ -1,5 +1,5 @@
 # Summary
 
 [](){ #pkg_overview }
-### Full package overview
+## Full package overview
 ::: vllm_hpu
@@ -1,3 +1,3 @@
 # Configuration Options
 
-WIP
+WIP
@@ -55,4 +55,4 @@ Sequence group cmpl-3cbf19b0c6d74b3f90b5d5db2ed2385e-0 is preempted by Preemptio
 
 ## Multi-Step Scheduling Feature Usage
 
-Enabling Multi-Step Scheduling is recommended for better decode performance. Refer to vllm-project#6854 for more details.
+Enabling Multi-Step Scheduling is recommended for better decode performance. Refer to vllm-project#6854 for more details.
@@ -61,4 +61,4 @@ Please refer to this [collection](https://github.com/HabanaAI/Gaudi-tutorials/tr
 - llama-3.1-8b-instruct_gaudi3_1.20_contextlen-2k
 - llama-3.1-8b-instruct_gaudi3_1.20_contextlen-4k
 - llama-3.3-70b-instruct_gaudi3_1.20_contextlen-2k
-- llama-3.3-70b-instruct_gaudi3_1.20_contextlen-4k
+- llama-3.3-70b-instruct_gaudi3_1.20_contextlen-4k
@@ -1,3 +1,3 @@
 # Optimization and Tuning
 
-WIP
+WIP
@@ -3,4 +3,4 @@ title: vLLM's Plugin System
 ---
 [](){ #plugin-system }
 
-WIP
+WIP
@@ -1,3 +1,3 @@
 # Developer Guide
 
-WIP
+WIP
@@ -1,3 +1,3 @@
 # CI Failures
 
-WIP
+WIP
@@ -1,3 +1,3 @@
 # Profiling vLLM
 
-WIP
+WIP
@@ -156,4 +156,4 @@ INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) u
 INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
 INFO 08-02 17:38:43 hpu_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory
 INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used)
-```
+```
@@ -3,4 +3,4 @@ title: Compatibility Matrix
 ---
 [](){ #compatibility-matrix }
 
-WIP
+WIP
@@ -53,4 +53,4 @@ llm.llm_engine.model_executor.shutdown()
 ## Device for the Model's Weights Uploading
 
 The unquantized weights are first loaded onto the CPU, then quantized and transferred to the target device (HPU) for model execution.
-This reduces the device memory footprint of model weights, as only quantized weights are stored in the device memory.
+This reduces the device memory footprint of model weights, as only quantized weights are stored in the device memory.
@@ -15,7 +15,6 @@ This guide provides instructions on running vLLM with Intel Gaudi devices.
     To achieve the best performance on HPU, please follow the methods outlined in the
     [Optimizing Training Platform Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html).
 
-
 ## Quick Start Using Dockerfile
 # --8<-- [start:docker_quickstart]
 Set up the container with the latest Intel Gaudi Software Suite release using the Dockerfile.
@@ -46,12 +45,10 @@ Set up the container with the latest Intel Gaudi Software Suite release using th
 ### Environment Verification
 To verify that the Intel Gaudi software was correctly installed, run the following:
 
-```{.console}
-$ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible
-$ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed
-$ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed
-$ pip list | grep neural # verify that neural-compressor is installed
-```
+    $ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible
+    $ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed
+    $ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed
+    $ pip list | grep neural # verify that neural-compressor is installed
 
 Refer to [System Verification and Final Tests](https://docs.habana.ai/en/latest/Installation_Guide/System_Verification_and_Final_Tests.html) for more details.
 
@@ -62,10 +59,8 @@ Refer to the [Intel Gaudi documentation](https://docs.habana.ai/en/latest/Instal
 
 Use the following commands to run a Docker image. Make sure to update the versions below as listed in the [Support Matrix](https://docs.habana.ai/en/latest/Support_Matrix/Support_Matrix.html):
 
-```{.console}
-docker pull vault.habana.ai/gaudi-docker/1.21.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
-docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.21.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
-```
+    docker pull vault.habana.ai/gaudi-docker/1.21.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
+    docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.21.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
 
 ### Build and Install vLLM
 
@@ -119,4 +114,3 @@ Currently, multiple ways are provided which can be used to install vLLM with Int
     cd vllm-hpu
     pip install -e .
     ```
-
@@ -19,7 +19,6 @@ This guide will help you quickly get started with vLLM to perform:
     To achieve the best performance on HPU, please follow the methods outlined in the
     [Optimizing Training Platform Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html).
 
-
 ## Quick Start Using Dockerfile
 
 --8<-- "docs/getting_started/installation.md:docker_quickstart"
@@ -54,4 +53,4 @@ This guide will help you quickly get started with vLLM to perform:
 
 === "OpenAI Chat Completions API with vLLM"
 
-    WIP
+    WIP
@@ -1,3 +1,3 @@
 # Using vLLM x Intel Gaudi
 
-WIP
+WIP
@@ -3,4 +3,4 @@ title: Frequently Asked Questions
 ---
 [](){ #faq }
 
-WIP
+WIP
@@ -1,3 +1,3 @@
 # Metrics
 
-WIP
+WIP
@@ -3,4 +3,4 @@ title: Troubleshooting
 ---
 [](){ #troubleshooting }
 
-WIP
+WIP
@@ -1,3 +1,3 @@
 # vLLM V1 Support
 
-WIP
+WIP
@@ -16,7 +16,6 @@ license = "Apache-2.0"
 readme = "README.md"
 description = "HPU plugin package for vLLM."
 classifiers = [
-    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
@@ -42,11 +41,13 @@ include = ["vllm_hpu"]
 
 [tool.yapfignore]
 ignore_patterns = [
-    "build/**",
+    "build/**",
+    "vllm_hpu/extension/**",  # NOTE(kzawora): re-enable this once extension refactor is ready
 ]
 
 [tool.ruff]
+extend-exclude = ["vllm_hpu/extension/**"]  # NOTE(kzawora): re-enable this once extension refactor is ready
 # Allow lines to be as long as 80.
 line-length = 80
 
 [tool.ruff.lint]
@@ -79,10 +80,18 @@ ignore = [
 ]
 
 [tool.mypy]
+plugins = ["pydantic.mypy"]
 ignore_missing_imports = true
+explicit_package_bases = true
 check_untyped_defs = true
 follow_imports = "silent"
 
+# After fixing type errors resulting from follow_imports: "skip" -> "silent",
+# move the directory here and remove it from tools/mypy.sh
+files = [
+    "vllm_hpu/*.py",
+]
+
 
 [tool.codespell]
 ignore-words-list = "dout, te, indicies, subtile, ElementE"

@@ -1,10 +1,5 @@
-import importlib.util
 import logging
 import os
-import subprocess
-import sys
-from sysconfig import get_paths
-from typing import Dict, List
 
 from setuptools import setup, find_packages
 from setuptools_scm import get_version
@@ -20,13 +15,15 @@
 logger = logging.getLogger(__name__)
 ext_modules = []
 
+
 def get_path(*filepath) -> str:
     return os.path.join(ROOT_DIR, *filepath)
 
-def get_requirements() -> List[str]:
+
+def get_requirements() -> list[str]:
     """Get Python package dependencies from requirements.txt."""
 
-    def _read_requirements(filename: str) -> List[str]:
+    def _read_requirements(filename: str) -> list[str]:
         with open(get_path(filename)) as f:
             requirements = f.read().strip().split("\n")
         resolved_requirements = []
@@ -44,16 +41,17 @@ def _read_requirements(filename: str) -> List[str]:
     except ValueError:
         print("Failed to read requirements.txt in vllm_hpu.")
     return requirements
-
+
+
 setup(
     name="vllm_hpu",
     version=VERSION,
     author="Intel",
-    long_description="HPU plugin package for vLLM.",
+    long_description="Intel Gaudi plugin package for vLLM.",
     long_description_content_type="text/markdown",
-    url="https://github.com/vllm-project/vllm-hpu",
+    url="https://github.com/vllm-project/vllm-gaudi",
     project_urls={
-        "Homepage": "https://github.com/vllm-project/vllm-hpu",
+        "Homepage": "https://github.com/vllm-project/vllm-gaudi",
     },
     classifiers=[
         "Programming Language :: Python :: 3",
@@ -68,4 +66,4 @@ def _read_requirements(filename: str) -> List[str]:
         "vllm.platform_plugins": ["hpu = vllm_hpu:register"],
         "vllm.general_plugins": ["hpu_custom_ops = vllm_hpu:register_ops"],
     },
-)
+)
@@ -120,7 +120,6 @@ def launch_simple(eval_config):
     generated_text = ""
     for output in outputs:
         generated_text += output.outputs[0].text
-    found_countries = []
 
     european_countries = [
         "Albania", "Andorra", "Armenia", "Austria", "Azerbaijan", "Belarus",
@@ -134,7 +133,7 @@ def launch_simple(eval_config):
         "Spain", "Sweden", "Switzerland", "Turkey", "Ukraine",
         "United Kingdom", "Vatican City"
     ]
-    found_countries = []
+    found_countries: list[str] = []
     for country in european_countries:
         if country in generated_text:
             found_countries.append(country)

@@ -21,8 +21,9 @@ run_mypy() {
 }
 
 run_mypy # Note that this is less strict than CI
+run_mypy tests
 run_mypy vllm_hpu/attention
 run_mypy vllm_hpu/distributed
+#run_mypy vllm_hpu/extension # NOTE(kzawora): re-enable this once extension refactor is ready
 run_mypy vllm_hpu/ops
-run_mypy vllm_hpu/worker
 run_mypy vllm_hpu/v1
-Original file line number
+Diff line change
@@ Expand Up / @@ -9,8 +9,6 @@ @@
       <img src="./docs/assets/logos/gaudi-logo.png" alt="Intel-Gaudi" width="30%">
     </p>
     vLLM Gaudi plugin (vllm-gaudi) integrates Intel Gaudi accelerators with vLLM to optimize large language model inference.
     This plugin follows the [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162) and [[RFC]: Enhancing vLLM Plugin Architecture](https://github.com/vllm-project/vllm/issues/19161) principles, providing a modular interface for Intel Gaudi hardware.
@@ Expand Down @@
Original file line number	Diff line number	Diff line change
Expand Up		@@ -55,4 +55,4 @@ Sequence group cmpl-3cbf19b0c6d74b3f90b5d5db2ed2385e-0 is preempted by Preemptio

		## Multi-Step Scheduling Feature Usage

		Enabling Multi-Step Scheduling is recommended for better decode performance. Refer to vllm-project#6854 for more details.
		Enabling Multi-Step Scheduling is recommended for better decode performance. Refer to vllm-project#6854 for more details.