Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# version file generated by setuptools-scm
/vllm_hpu/_version.py
/vllm_gaudi/_version.py

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
2 changes: 1 addition & 1 deletion docs/.nav.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,5 +33,5 @@ nav:
- API Reference:
- Summary: api/README.md
- Contents:
- glob: api/vllm_hpu/*
- glob: api/vllm_gaudi/*
preserve_directory_names: true
2 changes: 1 addition & 1 deletion docs/api/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@

[](){ #pkg_overview }
## Full package overview
::: vllm_hpu
::: vllm_gaudi
2 changes: 1 addition & 1 deletion mkdocs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ plugins:
- awesome-nav
# For API reference generation
- api-autonav:
modules: ["vllm_hpu"]
modules: ["vllm_gaudi"]
api_root_uri: "api"
- mkdocstrings:
handlers:
Expand Down
12 changes: 6 additions & 6 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ build-backend = "setuptools.build_meta"


[project]
name = "vllm_hpu"
name = "vllm_gaudi"
authors = [{name = "Intel"}]
license = "Apache-2.0"
readme = "README.md"
Expand All @@ -29,25 +29,25 @@ requires-python = ">=3.9,<3.13"
dynamic = [ "version", "dependencies", "optional-dependencies"]

[project.urls]
Homepage = "https://github.com/HabanaAI/vllm-hpu-extension"
Homepage = "https://github.com/vllm-project/vllm-gaudi"


[tool.setuptools_scm]
# no extra settings needed, presence enables setuptools-scm

[tool.setuptools.packages.find]
where = ["."]
include = ["vllm_hpu"]
include = ["vllm_gaudi"]

[tool.yapfignore]
ignore_patterns = [
"build/**",
"vllm_hpu/extension/**" # NOTE(kzawora): re-enable this once extension refactor is ready
"vllm_gaudi/extension/**" # NOTE(kzawora): re-enable this once extension refactor is ready
]

[tool.ruff]
# Allow lines to be as long as 80.
extend-exclude = ["vllm_hpu/extension/**"] # NOTE(kzawora): re-enable this once extension refactor is ready
extend-exclude = ["vllm_gaudi/extension/**"] # NOTE(kzawora): re-enable this once extension refactor is ready
line-length = 80

[tool.ruff.lint]
Expand Down Expand Up @@ -89,7 +89,7 @@ follow_imports = "silent"
# After fixing type errors resulting from follow_imports: "skip" -> "silent",
# move the directory here and remove it from tools/mypy.sh
files = [
"vllm_hpu/*.py",
"vllm_gaudi/*.py",
]


Expand Down
10 changes: 5 additions & 5 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from setuptools_scm import get_version

try:
VERSION = get_version(write_to="vllm_hpu/_version.py")
VERSION = get_version(write_to="vllm_gaudi/_version.py")
except LookupError:
# The checkout action in github action CI does not checkout the tag. It
# only checks out the commit. In this case, we set a dummy version.
Expand Down Expand Up @@ -39,12 +39,12 @@ def _read_requirements(filename: str) -> list[str]:
try:
requirements = _read_requirements("requirements.txt")
except ValueError:
print("Failed to read requirements.txt in vllm_hpu.")
print("Failed to read requirements.txt in vllm_gaudi.")
return requirements


setup(
name="vllm_hpu",
name="vllm_gaudi",
version=VERSION,
author="Intel",
long_description="Intel Gaudi plugin package for vLLM.",
Expand All @@ -63,7 +63,7 @@ def _read_requirements(filename: str) -> list[str]:
ext_modules=ext_modules,
extras_require={},
entry_points={
"vllm.platform_plugins": ["hpu = vllm_hpu:register"],
"vllm.general_plugins": ["hpu_custom_ops = vllm_hpu:register_ops"],
"vllm.platform_plugins": ["hpu = vllm_gaudi:register"],
"vllm.general_plugins": ["hpu_custom_ops = vllm_gaudi:register_ops"],
},
)
10 changes: 5 additions & 5 deletions tools/mypy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ run_mypy() {

run_mypy # Note that this is less strict than CI
run_mypy tests
run_mypy vllm_hpu/attention
run_mypy vllm_hpu/distributed
#run_mypy vllm_hpu/extension # NOTE(kzawora): re-enable this once extension refactor is ready
run_mypy vllm_hpu/ops
run_mypy vllm_hpu/v1
run_mypy vllm_gaudi/attention
run_mypy vllm_gaudi/distributed
#run_mypy vllm_gaudi/extension # NOTE(kzawora): re-enable this once extension refactor is ready
run_mypy vllm_gaudi/ops
run_mypy vllm_gaudi/v1
6 changes: 3 additions & 3 deletions vllm_hpu/__init__.py → vllm_gaudi/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from vllm_hpu.platform import HpuPlatform
from vllm_gaudi.platform import HpuPlatform
import os


Expand All @@ -8,9 +8,9 @@ def register():
if os.getenv("VLLM_WEIGHT_LOAD_FORCE_SYNC",
"false").lower() in ("true", "1"):
HpuPlatform.set_synchronized_weight_loader()
return "vllm_hpu.platform.HpuPlatform"
return "vllm_gaudi.platform.HpuPlatform"


def register_ops():
"""Register custom ops for the HPU platform."""
import vllm_hpu.ops # noqa: F401
import vllm_gaudi.ops # noqa: F401
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -9,20 +9,20 @@
from typing import Any, Optional

import torch
import vllm_hpu.extension.kernels as kernels
import vllm_hpu.extension.ops as ops
from vllm_hpu.extension.runtime import get_config
from vllm_hpu.extension.utils import (FP8Matmul, Matmul, ModuleFusedSDPA,
Softmax, VLLMFP8KVCache, VLLMKVCache)
import vllm_gaudi.extension.kernels as kernels
import vllm_gaudi.extension.ops as ops
from vllm_gaudi.extension.runtime import get_config
from vllm_gaudi.extension.utils import (FP8Matmul, Matmul, ModuleFusedSDPA,
Softmax, VLLMFP8KVCache, VLLMKVCache)

from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
AttentionLayer,
AttentionMetadata, AttentionType)

from vllm.attention.backends.mla.common import MLACommonImpl
from vllm.attention.backends.utils import CommonAttentionState
from vllm_hpu.attention.ops.hpu_paged_attn import (HPUPagedAttention,
HPUPagedAttentionMetadata)
from vllm_gaudi.attention.ops.hpu_paged_attn import (HPUPagedAttention,
HPUPagedAttentionMetadata)
from vllm.logger import init_logger

logger = init_logger(__name__)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from typing import Optional

import torch
from vllm_hpu.extension import cache_ops, ops
from vllm_gaudi.extension import cache_ops, ops

# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`.
_PARTITION_SIZE = 512
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,9 @@ def get_bucketing_context():
use_exponential_bucketing = os.environ.get(
'VLLM_EXPONENTIAL_BUCKETING', 'true').lower() == 'true'
if use_exponential_bucketing:
from vllm_hpu.extension.bucketing.exponential import (
from vllm_gaudi.extension.bucketing.exponential import (
HPUExponentialBucketingContext as HPUBucketingContext)
else:
from vllm_hpu.extension.bucketing.linear import HPUBucketingContext
from vllm_gaudi.extension.bucketing.linear import HPUBucketingContext

return HPUBucketingContext
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from typing import Set, Tuple
from .common import WeakSingleton

from vllm_hpu.extension.runtime import get_config
from vllm_gaudi.extension.runtime import get_config

logger = logging.getLogger(__name__)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from typing import Tuple
from .common import WeakSingleton

from vllm_hpu.extension.runtime import get_config
from vllm_gaudi.extension.runtime import get_config

logger = logging.getLogger(__name__)

Expand Down
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def _get_hw(_):
return "gaudi2"
case htexp.synDeviceType.synDeviceGaudi3:
return "gaudi3"
from vllm_hpu.extension.utils import is_fake_hpu
from vllm_gaudi.extension.utils import is_fake_hpu
if is_fake_hpu():
return "cpu"
logger().warning(f'Unknown device type: {device_type}')
Expand All @@ -40,7 +40,7 @@ def _get_build(_):
if output.returncode == 0 and match:
return match.group('version')
# In cpu-test environment we don't have access to habana-torch-plugin
from vllm_hpu.extension.utils import is_fake_hpu
from vllm_gaudi.extension.utils import is_fake_hpu
result = '0.0.0.0' if is_fake_hpu() else None
logger().warning(f"Unable to detect habana-torch-plugin version! Returning: {result}")
return result
Expand All @@ -58,7 +58,7 @@ def set_vllm_config(cfg):
# t.compile is very picky about what functions we can call inside modules
# since this is the last step we can force recompilation of config to
# ensure all values are computed before entering the model
from vllm_hpu.extension.runtime import get_config
from vllm_gaudi.extension.runtime import get_config
get_config().finalize()


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
# LICENSE file in the root directory of this source tree.
###############################################################################

from vllm_hpu.extension.config import (Not, Hardware, VersionRange, ModelType,
from vllm_gaudi.extension.config import (Not, Hardware, VersionRange, ModelType,
Kernel, FirstEnabled, All, Value, Env,
Disabled, Engine, choice, boolean,
to_dict, split_values_and_flags)
from vllm_hpu.extension.kernels import fsdpa, block_softmax_adjustment
from vllm_gaudi.extension.kernels import fsdpa, block_softmax_adjustment


def get_user_flags():
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import torch.nn.functional as F
import math
import habana_frameworks.torch.core as htcore
from vllm_hpu.extension.runtime import get_config
from vllm_gaudi.extension.runtime import get_config
import habana_frameworks.torch.utils.experimental as htexp

is_hpu_gaudi2 = htexp._get_device_type() == htexp.synDeviceType.synDeviceGaudi2
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import uuid
from habana_frameworks.torch import torch

from vllm_hpu.extension.utils import is_fake_hpu
from vllm_gaudi.extension.utils import is_fake_hpu
from .logger import logger

class FileWriter(threading.Thread):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
###############################################################################


from vllm_hpu.extension.environment import get_environment
from vllm_hpu.extension.features import get_features, get_user_flags, get_experimental_flags
from vllm_hpu.extension.config import Config
from vllm_hpu.extension.logger import logger
from vllm_gaudi.extension.environment import get_environment
from vllm_gaudi.extension.features import get_features, get_user_flags, get_experimental_flags
from vllm_gaudi.extension.config import Config
from vllm_gaudi.extension.logger import logger


DETECTED = None
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@

import pytest

import vllm_hpu.extension.bucketing.linear as linear
from vllm_hpu.extension.bucketing.exponential import HPUExponentialBucketingContext
import vllm_gaudi.extension.bucketing.linear as linear
from vllm_gaudi.extension.bucketing.exponential import HPUExponentialBucketingContext


@pytest.fixture
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

import os
import pytest
from vllm_hpu.extension.config import VersionRange, Config, Kernel, Env, boolean, All, Not, Eq, Enabled, FirstEnabled, choice
from vllm_gaudi.extension.config import VersionRange, Config, Kernel, Env, boolean, All, Not, Eq, Enabled, FirstEnabled, choice


def with_cfg(fn):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import habana_frameworks.torch as htorch
import torch

from vllm_hpu.extension.runtime import get_config
from vllm_gaudi.extension.runtime import get_config


@lru_cache(maxsize=None)
Expand Down
6 changes: 6 additions & 0 deletions vllm_gaudi/ops/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
import vllm_gaudi.ops.hpu_fused_moe # noqa
import vllm_gaudi.ops.hpu_layernorm # noqa
import vllm_gaudi.ops.hpu_lora # noqa
import vllm_gaudi.ops.hpu_rotary_embedding # noqa
import vllm_gaudi.ops.hpu_compressed_tensors # noqa
import vllm_gaudi.ops.hpu_fp8 # noqa
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
compressed_tensors_moe)
from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe import ( # noqa: E501
CompressedTensorsW8A8Fp8MoEMethod)
import vllm_hpu.extension.ops as hpu_ops
from vllm_hpu.extension.ops import VllmMixtureOfExpertsOpFP8PerChannel
import vllm_gaudi.extension.ops as hpu_ops
from vllm_gaudi.extension.ops import VllmMixtureOfExpertsOpFP8PerChannel

SUPPORTED_STRATEGIES = [
QuantizationStrategy.CHANNEL, QuantizationStrategy.TENSOR
Expand Down
8 changes: 4 additions & 4 deletions vllm_hpu/ops/hpu_fp8.py → vllm_gaudi/ops/hpu_fp8.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from typing import Callable, Optional

import torch
from vllm_hpu import envs
from vllm_gaudi import envs
from vllm.model_executor.custom_op import CustomOp
from vllm.model_executor.layers.fused_moe.layer import FusedMoE

Expand All @@ -10,9 +10,9 @@
OrigFp8LinearMethod,
Fp8MoEMethod,
Fp8Config)
import vllm_hpu.extension.ops as hpu_ops
from vllm_hpu.extension.ops import (VllmMixtureOfExpertsOpFP8PerChannel,
VllmMixtureOfExpertsOpFP8)
import vllm_gaudi.extension.ops as hpu_ops
from vllm_gaudi.extension.ops import (VllmMixtureOfExpertsOpFP8PerChannel,
VllmMixtureOfExpertsOpFP8)


class Fp8LinearMethod(OrigFp8LinearMethod):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import torch
from vllm.model_executor.layers.fused_moe.layer import (
FusedMoE, UnquantizedFusedMoEMethod)
from vllm_hpu.extension.ops import (VllmMixtureOfExpertsOp)
from vllm_gaudi.extension.ops import (VllmMixtureOfExpertsOp)


@UnquantizedFusedMoEMethod.register_oot
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def forward_oot(
x: torch.Tensor,
residual: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
from vllm_hpu.extension.kernels import rms_norm
from vllm_gaudi.extension.kernels import rms_norm
HPUFusedRMSNorm = rms_norm()
if residual is not None:
orig_shape = x.shape
Expand Down
File renamed without changes.
Loading