Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/actionlint.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
self-hosted-runner:
labels:
- ucb-vllm-cicd-g2
17 changes: 17 additions & 0 deletions .github/workflows/matchers/actionlint.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"problemMatcher": [
{
"owner": "actionlint",
"pattern": [
{
"regexp": "^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: (?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$",
"file": 1,
"line": 2,
"column": 3,
"message": 4,
"code": 5
}
]
}
]
}
16 changes: 16 additions & 0 deletions .github/workflows/matchers/mypy.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"problemMatcher": [
{
"owner": "mypy",
"pattern": [
{
"regexp": "^(.+):(\\d+):\\s(error|warning):\\s(.+)$",
"file": 1,
"line": 2,
"severity": 3,
"message": 4
}
]
}
]
}
20 changes: 20 additions & 0 deletions .github/workflows/pre-commit.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
name: pre-commit

on:
pull_request:
push:
branches: [main]

jobs:
pre-commit:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
with:
python-version: "3.12"
- run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
- run: echo "::add-matcher::.github/workflows/matchers/mypy.json"
- uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
with:
extra_args: --all-files --hook-stage manual
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# version file generated by setuptools-scm
/vllm_hpu/_version.py


# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
7 changes: 0 additions & 7 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,13 +47,6 @@ repos:
types: [python]
additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
stages: [manual] # Runs only when the manual stage is requested (CI passes --hook-stage manual)
- id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.9
entry: tools/mypy.sh 1 "3.9"
language: python
types: [python]
additional_dependencies: *mypy_deps
stages: [manual] # Only run in CI
- id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.10
entry: tools/mypy.sh 1 "3.10"
Expand Down
2 changes: 0 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@
<img src="./docs/assets/logos/gaudi-logo.png" alt="Intel-Gaudi" width="30%">
</p>



vLLM Gaudi plugin (vllm-gaudi) integrates Intel Gaudi accelerators with vLLM to optimize large language model inference.

This plugin follows the [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162) and [[RFC]: Enhancing vLLM Plugin Architecture](https://github.com/vllm-project/vllm/issues/19161) principles, providing a modular interface for Intel Gaudi hardware.
Expand Down
2 changes: 1 addition & 1 deletion docs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,4 @@ This plugin follows the [[RFC]: Hardware pluggable](https://github.com/vllm-proj
Learn more:

📚 [Intel Gaudi Documentation](https://docs.habana.ai/en/v1.21.1/index.html)
🚀 [vLLM Plugin System Overview](https://docs.vllm.ai/en/latest/design/plugin_system.html)
🚀 [vLLM Plugin System Overview](https://docs.vllm.ai/en/latest/design/plugin_system.html)
2 changes: 1 addition & 1 deletion docs/api/README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Summary

[](){ #pkg_overview }
### Full package overview
## Full package overview
::: vllm_hpu
2 changes: 1 addition & 1 deletion docs/configuration/README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Configuration Options

WIP
WIP
2 changes: 1 addition & 1 deletion docs/configuration/long_context.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,4 @@ Sequence group cmpl-3cbf19b0c6d74b3f90b5d5db2ed2385e-0 is preempted by Preemptio

## Multi-Step Scheduling Feature Usage

Enabling Multi-Step Scheduling is recommended for better decode performance. Refer to vllm-project#6854 for more details.
Enabling Multi-Step Scheduling is recommended for better decode performance. Refer to vllm-project#6854 for more details.
2 changes: 1 addition & 1 deletion docs/configuration/multi_node.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,4 +61,4 @@ Please refer to this [collection](https://github.com/HabanaAI/Gaudi-tutorials/tr
- llama-3.1-8b-instruct_gaudi3_1.20_contextlen-2k
- llama-3.1-8b-instruct_gaudi3_1.20_contextlen-4k
- llama-3.3-70b-instruct_gaudi3_1.20_contextlen-2k
- llama-3.3-70b-instruct_gaudi3_1.20_contextlen-4k
- llama-3.3-70b-instruct_gaudi3_1.20_contextlen-4k
2 changes: 1 addition & 1 deletion docs/configuration/optimization.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Optimization and Tuning

WIP
WIP
2 changes: 1 addition & 1 deletion docs/design/plugin_system.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ title: vLLM's Plugin System
---
[](){ #plugin-system }

WIP
WIP
2 changes: 1 addition & 1 deletion docs/dev_guide/README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Developer Guide

WIP
WIP
2 changes: 1 addition & 1 deletion docs/dev_guide/ci-failures.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# CI Failures

WIP
WIP
2 changes: 1 addition & 1 deletion docs/dev_guide/profiling.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Profiling vLLM

WIP
WIP
2 changes: 1 addition & 1 deletion docs/features/bucketing_mechanism.md
Original file line number Diff line number Diff line change
Expand Up @@ -156,4 +156,4 @@ INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) u
INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
INFO 08-02 17:38:43 hpu_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory
INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used)
```
```
2 changes: 1 addition & 1 deletion docs/features/compatibility_matrix.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ title: Compatibility Matrix
---
[](){ #compatibility-matrix }

WIP
WIP
2 changes: 1 addition & 1 deletion docs/features/quantization/inc.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,4 +53,4 @@ llm.llm_engine.model_executor.shutdown()
## Device for the Model's Weights Uploading

The unquantized weights are first loaded onto the CPU, then quantized and transferred to the target device (HPU) for model execution.
This reduces the device memory footprint of model weights, as only quantized weights are stored in the device memory.
This reduces the device memory footprint of model weights, as only quantized weights are stored in the device memory.
18 changes: 6 additions & 12 deletions docs/getting_started/installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ This guide provides instructions on running vLLM with Intel Gaudi devices.
To achieve the best performance on HPU, please follow the methods outlined in the
[Optimizing Training Platform Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html).


## Quick Start Using Dockerfile
# --8<-- [start:docker_quickstart]
Set up the container with the latest Intel Gaudi Software Suite release using the Dockerfile.
Expand Down Expand Up @@ -46,12 +45,10 @@ Set up the container with the latest Intel Gaudi Software Suite release using th
### Environment Verification
To verify that the Intel Gaudi software was correctly installed, run the following:

```{.console}
$ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible
$ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed
$ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed
$ pip list | grep neural # verify that neural-compressor is installed
```
$ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible
$ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed
$ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed
$ pip list | grep neural # verify that neural-compressor is installed

Refer to [System Verification and Final Tests](https://docs.habana.ai/en/latest/Installation_Guide/System_Verification_and_Final_Tests.html) for more details.

Expand All @@ -62,10 +59,8 @@ Refer to the [Intel Gaudi documentation](https://docs.habana.ai/en/latest/Instal

Use the following commands to run a Docker image. Make sure to update the versions below as listed in the [Support Matrix](https://docs.habana.ai/en/latest/Support_Matrix/Support_Matrix.html):

```{.console}
docker pull vault.habana.ai/gaudi-docker/1.21.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.21.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
```
docker pull vault.habana.ai/gaudi-docker/1.21.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.21.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest

### Build and Install vLLM

Expand Down Expand Up @@ -119,4 +114,3 @@ Currently, multiple ways are provided which can be used to install vLLM with Int
cd vllm-hpu
pip install -e .
```

3 changes: 1 addition & 2 deletions docs/getting_started/quickstart.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ This guide will help you quickly get started with vLLM to perform:
To achieve the best performance on HPU, please follow the methods outlined in the
[Optimizing Training Platform Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html).


## Quick Start Using Dockerfile

--8<-- "docs/getting_started/installation.md:docker_quickstart"
Expand Down Expand Up @@ -54,4 +53,4 @@ This guide will help you quickly get started with vLLM to perform:

=== "OpenAI Chat Completions API with vLLM"

WIP
WIP
2 changes: 1 addition & 1 deletion docs/user_guide/README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Using vLLM x Intel Gaudi

WIP
WIP
2 changes: 1 addition & 1 deletion docs/user_guide/faq.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ title: Frequently Asked Questions
---
[](){ #faq }

WIP
WIP
2 changes: 1 addition & 1 deletion docs/user_guide/metrics.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Metrics

WIP
WIP
2 changes: 1 addition & 1 deletion docs/user_guide/troubleshooting.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ title: Troubleshooting
---
[](){ #troubleshooting }

WIP
WIP
2 changes: 1 addition & 1 deletion docs/user_guide/v1_guide.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# vLLM V1 Support

WIP
WIP
13 changes: 11 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ license = "Apache-2.0"
readme = "README.md"
description = "HPU plugin package for vLLM."
classifiers = [
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
Expand All @@ -42,11 +41,13 @@ include = ["vllm_hpu"]

[tool.yapfignore]
ignore_patterns = [
"build/**",
"build/**",
"vllm_hpu/extension/**" # NOTE(kzawora): re-enable this once extension refactor is ready
]

[tool.ruff]
# Allow lines to be as long as 80.
extend-exclude = ["vllm_hpu/extension/**"] # NOTE(kzawora): re-enable this once extension refactor is ready
line-length = 80

[tool.ruff.lint]
Expand Down Expand Up @@ -79,10 +80,18 @@ ignore = [
]

[tool.mypy]
plugins = ['pydantic.mypy']
ignore_missing_imports = true
explicit_package_bases = true
check_untyped_defs = true
follow_imports = "silent"

# After fixing type errors resulting from follow_imports: "skip" -> "silent",
# move the directory here and remove it from tools/mypy.sh
files = [
"vllm_hpu/*.py",
]


[tool.codespell]
ignore-words-list = "dout, te, indicies, subtile, ElementE"
Expand Down
22 changes: 10 additions & 12 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,5 @@
import importlib.util
import logging
import os
import subprocess
import sys
from sysconfig import get_paths
from typing import Dict, List

from setuptools import setup, find_packages
from setuptools_scm import get_version
Expand All @@ -20,13 +15,15 @@
logger = logging.getLogger(__name__)
ext_modules = []


def get_path(*filepath) -> str:
return os.path.join(ROOT_DIR, *filepath)

def get_requirements() -> List[str]:

def get_requirements() -> list[str]:
"""Get Python package dependencies from requirements.txt."""

def _read_requirements(filename: str) -> List[str]:
def _read_requirements(filename: str) -> list[str]:
with open(get_path(filename)) as f:
requirements = f.read().strip().split("\n")
resolved_requirements = []
Expand All @@ -44,16 +41,17 @@ def _read_requirements(filename: str) -> List[str]:
except ValueError:
print("Failed to read requirements.txt in vllm_hpu.")
return requirements



setup(
name="vllm_hpu",
version=VERSION,
author="Intel",
long_description="HPU plugin package for vLLM.",
long_description="Intel Gaudi plugin package for vLLM.",
long_description_content_type="text/markdown",
url="https://github.com/vllm-project/vllm-hpu",
url="https://github.com/vllm-project/vllm-gaudi",
project_urls={
"Homepage": "https://github.com/vllm-project/vllm-hpu",
"Homepage": "https://github.com/vllm-project/vllm-gaudi",
},
classifiers=[
"Programming Language :: Python :: 3",
Expand All @@ -68,4 +66,4 @@ def _read_requirements(filename: str) -> List[str]:
"vllm.platform_plugins": ["hpu = vllm_hpu:register"],
"vllm.general_plugins": ["hpu_custom_ops = vllm_hpu:register_ops"],
},
)
)
3 changes: 1 addition & 2 deletions tests/models/multimodal/generation/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,6 @@ def launch_simple(eval_config):
generated_text = ""
for output in outputs:
generated_text += output.outputs[0].text
found_countries = []

european_countries = [
"Albania", "Andorra", "Armenia", "Austria", "Azerbaijan", "Belarus",
Expand All @@ -134,7 +133,7 @@ def launch_simple(eval_config):
"Spain", "Sweden", "Switzerland", "Turkey", "Ukraine",
"United Kingdom", "Vatican City"
]
found_countries = []
found_countries: list[str] = []
for country in european_countries:
if country in generated_text:
found_countries.append(country)
Expand Down
3 changes: 2 additions & 1 deletion tools/mypy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,9 @@ run_mypy() {
}

run_mypy # Note that this is less strict than CI
run_mypy tests
run_mypy vllm_hpu/attention
run_mypy vllm_hpu/distributed
#run_mypy vllm_hpu/extension # NOTE(kzawora): re-enable this once extension refactor is ready
run_mypy vllm_hpu/ops
run_mypy vllm_hpu/worker
run_mypy vllm_hpu/v1
Loading