From 203e2e7d0ed72ec01c43f078ff66d953ec363288 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 26 Jun 2025 14:34:47 +0300 Subject: [PATCH 1/5] Refactor vllm_hpu -> vllm_gaudi Signed-off-by: Konrad Zawora --- docs/.nav.yml | 2 +- mkdocs.yaml | 2 +- {vllm_hpu => vllm_gaudi}/__init__.py | 6 +++--- {vllm_hpu => vllm_gaudi}/_version.py | 0 {vllm_hpu => vllm_gaudi}/attention/__init__.py | 0 {vllm_hpu => vllm_gaudi}/attention/backends/__init__.py | 0 {vllm_hpu => vllm_gaudi}/attention/backends/hpu_attn.py | 0 {vllm_hpu => vllm_gaudi}/attention/ops/__init__.py | 0 {vllm_hpu => vllm_gaudi}/attention/ops/hpu_paged_attn.py | 0 {vllm_hpu => vllm_gaudi}/distributed/__init__.py | 0 .../distributed/device_communicators/__init__.py | 0 .../distributed/device_communicators/hpu_communicator.py | 0 {vllm_hpu => vllm_gaudi}/envs.py | 0 {vllm_hpu => vllm_gaudi}/extension/__init__.py | 0 {vllm_hpu => vllm_gaudi}/extension/awq_hpu.py | 0 {vllm_hpu => vllm_gaudi}/extension/bucketing/__init__.py | 0 {vllm_hpu => vllm_gaudi}/extension/bucketing/common.py | 0 {vllm_hpu => vllm_gaudi}/extension/bucketing/exponential.py | 0 {vllm_hpu => vllm_gaudi}/extension/bucketing/linear.py | 0 {vllm_hpu => vllm_gaudi}/extension/cache_ops.py | 0 {vllm_hpu => vllm_gaudi}/extension/config.py | 0 {vllm_hpu => vllm_gaudi}/extension/environment.py | 0 {vllm_hpu => vllm_gaudi}/extension/features.py | 0 {vllm_hpu => vllm_gaudi}/extension/gptq_hpu.py | 0 {vllm_hpu => vllm_gaudi}/extension/kernels.py | 0 {vllm_hpu => vllm_gaudi}/extension/logger.py | 0 {vllm_hpu => vllm_gaudi}/extension/ops.py | 0 {vllm_hpu => vllm_gaudi}/extension/profiler.py | 0 {vllm_hpu => vllm_gaudi}/extension/runtime.py | 0 {vllm_hpu => vllm_gaudi}/extension/scales.py | 0 {vllm_hpu => vllm_gaudi}/extension/test_bucketing.py | 4 ++-- {vllm_hpu => vllm_gaudi}/extension/test_flags.py | 0 {vllm_hpu => vllm_gaudi}/extension/utils.py | 0 {vllm_hpu => vllm_gaudi}/ops/__init__.py | 0 {vllm_hpu => vllm_gaudi}/ops/hpu_compressed_tensors.py | 0 {vllm_hpu => vllm_gaudi}/ops/hpu_fp8.py | 0 {vllm_hpu => vllm_gaudi}/ops/hpu_fused_moe.py | 0 {vllm_hpu => vllm_gaudi}/ops/hpu_layernorm.py | 0 {vllm_hpu => vllm_gaudi}/ops/hpu_lora.py | 0 {vllm_hpu => vllm_gaudi}/ops/hpu_rotary_embedding.py | 0 {vllm_hpu => vllm_gaudi}/platform.py | 6 +++--- {vllm_hpu => vllm_gaudi}/utils.py | 0 {vllm_hpu => vllm_gaudi}/v1/__init__.py | 0 {vllm_hpu => vllm_gaudi}/v1/attention/__init__.py | 0 {vllm_hpu => vllm_gaudi}/v1/attention/backends/__init__.py | 0 {vllm_hpu => vllm_gaudi}/v1/attention/backends/hpu_attn.py | 0 {vllm_hpu => vllm_gaudi}/v1/worker/__init__.py | 0 {vllm_hpu => vllm_gaudi}/v1/worker/hpu_input_batch.py | 0 {vllm_hpu => vllm_gaudi}/v1/worker/hpu_model_runner.py | 0 {vllm_hpu => vllm_gaudi}/v1/worker/hpu_worker.py | 0 50 files changed, 10 insertions(+), 10 deletions(-) rename {vllm_hpu => vllm_gaudi}/__init__.py (56%) rename {vllm_hpu => vllm_gaudi}/_version.py (100%) rename {vllm_hpu => vllm_gaudi}/attention/__init__.py (100%) rename {vllm_hpu => vllm_gaudi}/attention/backends/__init__.py (100%) rename {vllm_hpu => vllm_gaudi}/attention/backends/hpu_attn.py (100%) rename {vllm_hpu => vllm_gaudi}/attention/ops/__init__.py (100%) rename {vllm_hpu => vllm_gaudi}/attention/ops/hpu_paged_attn.py (100%) rename {vllm_hpu => vllm_gaudi}/distributed/__init__.py (100%) rename {vllm_hpu => vllm_gaudi}/distributed/device_communicators/__init__.py (100%) rename {vllm_hpu => vllm_gaudi}/distributed/device_communicators/hpu_communicator.py (100%) rename {vllm_hpu => vllm_gaudi}/envs.py (100%) rename {vllm_hpu => vllm_gaudi}/extension/__init__.py (100%) rename {vllm_hpu => vllm_gaudi}/extension/awq_hpu.py (100%) rename {vllm_hpu => vllm_gaudi}/extension/bucketing/__init__.py (100%) rename {vllm_hpu => vllm_gaudi}/extension/bucketing/common.py (100%) rename {vllm_hpu => vllm_gaudi}/extension/bucketing/exponential.py (100%) rename {vllm_hpu => vllm_gaudi}/extension/bucketing/linear.py (100%) rename {vllm_hpu => vllm_gaudi}/extension/cache_ops.py (100%) rename {vllm_hpu => vllm_gaudi}/extension/config.py (100%) rename {vllm_hpu => vllm_gaudi}/extension/environment.py (100%) rename {vllm_hpu => vllm_gaudi}/extension/features.py (100%) rename {vllm_hpu => vllm_gaudi}/extension/gptq_hpu.py (100%) rename {vllm_hpu => vllm_gaudi}/extension/kernels.py (100%) rename {vllm_hpu => vllm_gaudi}/extension/logger.py (100%) rename {vllm_hpu => vllm_gaudi}/extension/ops.py (100%) rename {vllm_hpu => vllm_gaudi}/extension/profiler.py (100%) rename {vllm_hpu => vllm_gaudi}/extension/runtime.py (100%) rename {vllm_hpu => vllm_gaudi}/extension/scales.py (100%) rename {vllm_hpu => vllm_gaudi}/extension/test_bucketing.py (98%) rename {vllm_hpu => vllm_gaudi}/extension/test_flags.py (100%) rename {vllm_hpu => vllm_gaudi}/extension/utils.py (100%) rename {vllm_hpu => vllm_gaudi}/ops/__init__.py (100%) rename {vllm_hpu => vllm_gaudi}/ops/hpu_compressed_tensors.py (100%) rename {vllm_hpu => vllm_gaudi}/ops/hpu_fp8.py (100%) rename {vllm_hpu => vllm_gaudi}/ops/hpu_fused_moe.py (100%) rename {vllm_hpu => vllm_gaudi}/ops/hpu_layernorm.py (100%) rename {vllm_hpu => vllm_gaudi}/ops/hpu_lora.py (100%) rename {vllm_hpu => vllm_gaudi}/ops/hpu_rotary_embedding.py (100%) rename {vllm_hpu => vllm_gaudi}/platform.py (95%) rename {vllm_hpu => vllm_gaudi}/utils.py (100%) rename {vllm_hpu => vllm_gaudi}/v1/__init__.py (100%) rename {vllm_hpu => vllm_gaudi}/v1/attention/__init__.py (100%) rename {vllm_hpu => vllm_gaudi}/v1/attention/backends/__init__.py (100%) rename {vllm_hpu => vllm_gaudi}/v1/attention/backends/hpu_attn.py (100%) rename {vllm_hpu => vllm_gaudi}/v1/worker/__init__.py (100%) rename {vllm_hpu => vllm_gaudi}/v1/worker/hpu_input_batch.py (100%) rename {vllm_hpu => vllm_gaudi}/v1/worker/hpu_model_runner.py (100%) rename {vllm_hpu => vllm_gaudi}/v1/worker/hpu_worker.py (100%) diff --git a/docs/.nav.yml b/docs/.nav.yml index ffbd20ec5e..42ace218e8 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -33,5 +33,5 @@ nav: - API Reference: - Summary: api/README.md - Contents: - - glob: api/vllm_hpu/* + - glob: api/vllm_gaudi/* preserve_directory_names: true \ No newline at end of file diff --git a/mkdocs.yaml b/mkdocs.yaml index e1ffa7c43b..4b141430cb 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -55,7 +55,7 @@ plugins: - awesome-nav # For API reference generation - api-autonav: - modules: ["vllm_hpu"] + modules: ["vllm_gaudi"] api_root_uri: "api" - mkdocstrings: handlers: diff --git a/vllm_hpu/__init__.py b/vllm_gaudi/__init__.py similarity index 56% rename from vllm_hpu/__init__.py rename to vllm_gaudi/__init__.py index 74eef05bf7..13c0b3f5ec 100644 --- a/vllm_hpu/__init__.py +++ b/vllm_gaudi/__init__.py @@ -1,12 +1,12 @@ -from vllm_hpu.platform import HpuPlatform +from vllm_gaudi.platform import HpuPlatform def register(): """Register the HPU platform.""" HpuPlatform.set_torch_compile() - return "vllm_hpu.platform.HpuPlatform" + return "vllm_gaudi.platform.HpuPlatform" def register_ops(): """Register custom ops for the HPU platform.""" - import vllm_hpu.ops # noqa: F401 + import vllm_gaudi.ops # noqa: F401 diff --git a/vllm_hpu/_version.py b/vllm_gaudi/_version.py similarity index 100% rename from vllm_hpu/_version.py rename to vllm_gaudi/_version.py diff --git a/vllm_hpu/attention/__init__.py b/vllm_gaudi/attention/__init__.py similarity index 100% rename from vllm_hpu/attention/__init__.py rename to vllm_gaudi/attention/__init__.py diff --git a/vllm_hpu/attention/backends/__init__.py b/vllm_gaudi/attention/backends/__init__.py similarity index 100% rename from vllm_hpu/attention/backends/__init__.py rename to vllm_gaudi/attention/backends/__init__.py diff --git a/vllm_hpu/attention/backends/hpu_attn.py b/vllm_gaudi/attention/backends/hpu_attn.py similarity index 100% rename from vllm_hpu/attention/backends/hpu_attn.py rename to vllm_gaudi/attention/backends/hpu_attn.py diff --git a/vllm_hpu/attention/ops/__init__.py b/vllm_gaudi/attention/ops/__init__.py similarity index 100% rename from vllm_hpu/attention/ops/__init__.py rename to vllm_gaudi/attention/ops/__init__.py diff --git a/vllm_hpu/attention/ops/hpu_paged_attn.py b/vllm_gaudi/attention/ops/hpu_paged_attn.py similarity index 100% rename from vllm_hpu/attention/ops/hpu_paged_attn.py rename to vllm_gaudi/attention/ops/hpu_paged_attn.py diff --git a/vllm_hpu/distributed/__init__.py b/vllm_gaudi/distributed/__init__.py similarity index 100% rename from vllm_hpu/distributed/__init__.py rename to vllm_gaudi/distributed/__init__.py diff --git a/vllm_hpu/distributed/device_communicators/__init__.py b/vllm_gaudi/distributed/device_communicators/__init__.py similarity index 100% rename from vllm_hpu/distributed/device_communicators/__init__.py rename to vllm_gaudi/distributed/device_communicators/__init__.py diff --git a/vllm_hpu/distributed/device_communicators/hpu_communicator.py b/vllm_gaudi/distributed/device_communicators/hpu_communicator.py similarity index 100% rename from vllm_hpu/distributed/device_communicators/hpu_communicator.py rename to vllm_gaudi/distributed/device_communicators/hpu_communicator.py diff --git a/vllm_hpu/envs.py b/vllm_gaudi/envs.py similarity index 100% rename from vllm_hpu/envs.py rename to vllm_gaudi/envs.py diff --git a/vllm_hpu/extension/__init__.py b/vllm_gaudi/extension/__init__.py similarity index 100% rename from vllm_hpu/extension/__init__.py rename to vllm_gaudi/extension/__init__.py diff --git a/vllm_hpu/extension/awq_hpu.py b/vllm_gaudi/extension/awq_hpu.py similarity index 100% rename from vllm_hpu/extension/awq_hpu.py rename to vllm_gaudi/extension/awq_hpu.py diff --git a/vllm_hpu/extension/bucketing/__init__.py b/vllm_gaudi/extension/bucketing/__init__.py similarity index 100% rename from vllm_hpu/extension/bucketing/__init__.py rename to vllm_gaudi/extension/bucketing/__init__.py diff --git a/vllm_hpu/extension/bucketing/common.py b/vllm_gaudi/extension/bucketing/common.py similarity index 100% rename from vllm_hpu/extension/bucketing/common.py rename to vllm_gaudi/extension/bucketing/common.py diff --git a/vllm_hpu/extension/bucketing/exponential.py b/vllm_gaudi/extension/bucketing/exponential.py similarity index 100% rename from vllm_hpu/extension/bucketing/exponential.py rename to vllm_gaudi/extension/bucketing/exponential.py diff --git a/vllm_hpu/extension/bucketing/linear.py b/vllm_gaudi/extension/bucketing/linear.py similarity index 100% rename from vllm_hpu/extension/bucketing/linear.py rename to vllm_gaudi/extension/bucketing/linear.py diff --git a/vllm_hpu/extension/cache_ops.py b/vllm_gaudi/extension/cache_ops.py similarity index 100% rename from vllm_hpu/extension/cache_ops.py rename to vllm_gaudi/extension/cache_ops.py diff --git a/vllm_hpu/extension/config.py b/vllm_gaudi/extension/config.py similarity index 100% rename from vllm_hpu/extension/config.py rename to vllm_gaudi/extension/config.py diff --git a/vllm_hpu/extension/environment.py b/vllm_gaudi/extension/environment.py similarity index 100% rename from vllm_hpu/extension/environment.py rename to vllm_gaudi/extension/environment.py diff --git a/vllm_hpu/extension/features.py b/vllm_gaudi/extension/features.py similarity index 100% rename from vllm_hpu/extension/features.py rename to vllm_gaudi/extension/features.py diff --git a/vllm_hpu/extension/gptq_hpu.py b/vllm_gaudi/extension/gptq_hpu.py similarity index 100% rename from vllm_hpu/extension/gptq_hpu.py rename to vllm_gaudi/extension/gptq_hpu.py diff --git a/vllm_hpu/extension/kernels.py b/vllm_gaudi/extension/kernels.py similarity index 100% rename from vllm_hpu/extension/kernels.py rename to vllm_gaudi/extension/kernels.py diff --git a/vllm_hpu/extension/logger.py b/vllm_gaudi/extension/logger.py similarity index 100% rename from vllm_hpu/extension/logger.py rename to vllm_gaudi/extension/logger.py diff --git a/vllm_hpu/extension/ops.py b/vllm_gaudi/extension/ops.py similarity index 100% rename from vllm_hpu/extension/ops.py rename to vllm_gaudi/extension/ops.py diff --git a/vllm_hpu/extension/profiler.py b/vllm_gaudi/extension/profiler.py similarity index 100% rename from vllm_hpu/extension/profiler.py rename to vllm_gaudi/extension/profiler.py diff --git a/vllm_hpu/extension/runtime.py b/vllm_gaudi/extension/runtime.py similarity index 100% rename from vllm_hpu/extension/runtime.py rename to vllm_gaudi/extension/runtime.py diff --git a/vllm_hpu/extension/scales.py b/vllm_gaudi/extension/scales.py similarity index 100% rename from vllm_hpu/extension/scales.py rename to vllm_gaudi/extension/scales.py diff --git a/vllm_hpu/extension/test_bucketing.py b/vllm_gaudi/extension/test_bucketing.py similarity index 98% rename from vllm_hpu/extension/test_bucketing.py rename to vllm_gaudi/extension/test_bucketing.py index f709265a02..8c315f6f6c 100644 --- a/vllm_hpu/extension/test_bucketing.py +++ b/vllm_gaudi/extension/test_bucketing.py @@ -8,8 +8,8 @@ import pytest -import vllm_hpu.extension.bucketing.linear as linear -from vllm_hpu.extension.bucketing.exponential import HPUExponentialBucketingContext +import vllm_gaudi.extension.bucketing.linear as linear +from vllm_gaudi.extension.bucketing.exponential import HPUExponentialBucketingContext @pytest.fixture diff --git a/vllm_hpu/extension/test_flags.py b/vllm_gaudi/extension/test_flags.py similarity index 100% rename from vllm_hpu/extension/test_flags.py rename to vllm_gaudi/extension/test_flags.py diff --git a/vllm_hpu/extension/utils.py b/vllm_gaudi/extension/utils.py similarity index 100% rename from vllm_hpu/extension/utils.py rename to vllm_gaudi/extension/utils.py diff --git a/vllm_hpu/ops/__init__.py b/vllm_gaudi/ops/__init__.py similarity index 100% rename from vllm_hpu/ops/__init__.py rename to vllm_gaudi/ops/__init__.py diff --git a/vllm_hpu/ops/hpu_compressed_tensors.py b/vllm_gaudi/ops/hpu_compressed_tensors.py similarity index 100% rename from vllm_hpu/ops/hpu_compressed_tensors.py rename to vllm_gaudi/ops/hpu_compressed_tensors.py diff --git a/vllm_hpu/ops/hpu_fp8.py b/vllm_gaudi/ops/hpu_fp8.py similarity index 100% rename from vllm_hpu/ops/hpu_fp8.py rename to vllm_gaudi/ops/hpu_fp8.py diff --git a/vllm_hpu/ops/hpu_fused_moe.py b/vllm_gaudi/ops/hpu_fused_moe.py similarity index 100% rename from vllm_hpu/ops/hpu_fused_moe.py rename to vllm_gaudi/ops/hpu_fused_moe.py diff --git a/vllm_hpu/ops/hpu_layernorm.py b/vllm_gaudi/ops/hpu_layernorm.py similarity index 100% rename from vllm_hpu/ops/hpu_layernorm.py rename to vllm_gaudi/ops/hpu_layernorm.py diff --git a/vllm_hpu/ops/hpu_lora.py b/vllm_gaudi/ops/hpu_lora.py similarity index 100% rename from vllm_hpu/ops/hpu_lora.py rename to vllm_gaudi/ops/hpu_lora.py diff --git a/vllm_hpu/ops/hpu_rotary_embedding.py b/vllm_gaudi/ops/hpu_rotary_embedding.py similarity index 100% rename from vllm_hpu/ops/hpu_rotary_embedding.py rename to vllm_gaudi/ops/hpu_rotary_embedding.py diff --git a/vllm_hpu/platform.py b/vllm_gaudi/platform.py similarity index 95% rename from vllm_hpu/platform.py rename to vllm_gaudi/platform.py index 431fdfd053..f0fb121ae0 100644 --- a/vllm_hpu/platform.py +++ b/vllm_gaudi/platform.py @@ -64,7 +64,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: if parallel_config.worker_cls == "auto": if envs.VLLM_USE_V1: parallel_config.worker_cls = \ - "vllm_hpu.v1.worker.hpu_worker.HPUWorker" + "vllm_gaudi.v1.worker.hpu_worker.HPUWorker" else: parallel_config.worker_cls = \ "vllm.worker.hpu_worker.HPUWorker" @@ -117,11 +117,11 @@ def is_pin_memory_available(cls): @classmethod def get_punica_wrapper(cls) -> str: - return "vllm_hpu.lora.punica_wrapper.punica_hpu.PunicaWrapperHPU" + return "vllm_gaudi.lora.punica_wrapper.punica_hpu.PunicaWrapperHPU" @classmethod def get_device_communicator_cls(cls) -> str: - return "vllm_hpu.distributed.device_communicators.hpu_communicator.HpuCommunicator" # noqa + return "vllm_gaudi.distributed.device_communicators.hpu_communicator.HpuCommunicator" # noqa @classmethod def supports_structured_output(cls) -> bool: diff --git a/vllm_hpu/utils.py b/vllm_gaudi/utils.py similarity index 100% rename from vllm_hpu/utils.py rename to vllm_gaudi/utils.py diff --git a/vllm_hpu/v1/__init__.py b/vllm_gaudi/v1/__init__.py similarity index 100% rename from vllm_hpu/v1/__init__.py rename to vllm_gaudi/v1/__init__.py diff --git a/vllm_hpu/v1/attention/__init__.py b/vllm_gaudi/v1/attention/__init__.py similarity index 100% rename from vllm_hpu/v1/attention/__init__.py rename to vllm_gaudi/v1/attention/__init__.py diff --git a/vllm_hpu/v1/attention/backends/__init__.py b/vllm_gaudi/v1/attention/backends/__init__.py similarity index 100% rename from vllm_hpu/v1/attention/backends/__init__.py rename to vllm_gaudi/v1/attention/backends/__init__.py diff --git a/vllm_hpu/v1/attention/backends/hpu_attn.py b/vllm_gaudi/v1/attention/backends/hpu_attn.py similarity index 100% rename from vllm_hpu/v1/attention/backends/hpu_attn.py rename to vllm_gaudi/v1/attention/backends/hpu_attn.py diff --git a/vllm_hpu/v1/worker/__init__.py b/vllm_gaudi/v1/worker/__init__.py similarity index 100% rename from vllm_hpu/v1/worker/__init__.py rename to vllm_gaudi/v1/worker/__init__.py diff --git a/vllm_hpu/v1/worker/hpu_input_batch.py b/vllm_gaudi/v1/worker/hpu_input_batch.py similarity index 100% rename from vllm_hpu/v1/worker/hpu_input_batch.py rename to vllm_gaudi/v1/worker/hpu_input_batch.py diff --git a/vllm_hpu/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py similarity index 100% rename from vllm_hpu/v1/worker/hpu_model_runner.py rename to vllm_gaudi/v1/worker/hpu_model_runner.py diff --git a/vllm_hpu/v1/worker/hpu_worker.py b/vllm_gaudi/v1/worker/hpu_worker.py similarity index 100% rename from vllm_hpu/v1/worker/hpu_worker.py rename to vllm_gaudi/v1/worker/hpu_worker.py From 39be26a9531ccd2aea4a761e87d855133e2424cb Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 11 Jul 2025 13:50:24 +0300 Subject: [PATCH 2/5] oh no Signed-off-by: Konrad Zawora --- pyproject.toml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 130a3c1ae8..c8e34d99a7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ build-backend = "setuptools.build_meta" [project] -name = "vllm_hpu" +name = "vllm_gaudi" authors = [{name = "Intel"}] license = "Apache-2.0" readme = "README.md" @@ -29,7 +29,7 @@ requires-python = ">=3.9,<3.13" dynamic = [ "version", "dependencies", "optional-dependencies"] [project.urls] -Homepage = "https://github.com/HabanaAI/vllm-hpu-extension" +Homepage = "https://github.com/vllm-project/vllm-gaudi" [tool.setuptools_scm] @@ -37,17 +37,17 @@ Homepage = "https://github.com/HabanaAI/vllm-hpu-extension" [tool.setuptools.packages.find] where = ["."] -include = ["vllm_hpu"] +include = ["vllm_gaudi"] [tool.yapfignore] ignore_patterns = [ "build/**", - "vllm_hpu/extension/**" # NOTE(kzawora): re-enable this once extension refactor is ready + "vllm_gaudi/extension/**" # NOTE(kzawora): re-enable this once extension refactor is ready ] [tool.ruff] # Allow lines to be as long as 80. -extend-exclude = ["vllm_hpu/extension/**"] # NOTE(kzawora): re-enable this once extension refactor is ready +extend-exclude = ["vllm_gaudi/extension/**"] # NOTE(kzawora): re-enable this once extension refactor is ready line-length = 80 [tool.ruff.lint] @@ -89,7 +89,7 @@ follow_imports = "silent" # After fixing type errors resulting from follow_imports: "skip" -> "silent", # move the directory here and remove it from tools/mypy.sh files = [ - "vllm_hpu/*.py", + "vllm_gaudi/*.py", ] From 27e249e0485a2d8be6fa272652ad929f9a2fa759 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 11 Jul 2025 13:55:47 +0300 Subject: [PATCH 3/5] extension refactor Signed-off-by: Konrad Zawora --- .gitignore | 2 +- vllm_gaudi/attention/backends/hpu_attn.py | 14 +++++++------- vllm_gaudi/attention/ops/hpu_paged_attn.py | 2 +- vllm_gaudi/extension/bucketing/common.py | 4 ++-- vllm_gaudi/extension/bucketing/exponential.py | 2 +- vllm_gaudi/extension/bucketing/linear.py | 2 +- vllm_gaudi/extension/environment.py | 6 +++--- vllm_gaudi/extension/features.py | 4 ++-- vllm_gaudi/extension/ops.py | 2 +- vllm_gaudi/extension/profiler.py | 2 +- vllm_gaudi/extension/runtime.py | 8 ++++---- vllm_gaudi/extension/test_flags.py | 2 +- vllm_gaudi/extension/utils.py | 2 +- vllm_gaudi/ops/hpu_compressed_tensors.py | 4 ++-- vllm_gaudi/ops/hpu_fp8.py | 6 +++--- vllm_gaudi/ops/hpu_fused_moe.py | 2 +- vllm_gaudi/ops/hpu_layernorm.py | 2 +- vllm_gaudi/v1/attention/backends/hpu_attn.py | 4 ++-- vllm_gaudi/v1/worker/hpu_model_runner.py | 14 +++++++------- vllm_gaudi/v1/worker/hpu_worker.py | 6 +++--- 20 files changed, 45 insertions(+), 45 deletions(-) diff --git a/.gitignore b/.gitignore index 0f414a587f..e2f5137fd8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ # version file generated by setuptools-scm -/vllm_hpu/_version.py +/vllm_gaudi/_version.py # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/vllm_gaudi/attention/backends/hpu_attn.py b/vllm_gaudi/attention/backends/hpu_attn.py index f6cb36461a..1d81c751b0 100644 --- a/vllm_gaudi/attention/backends/hpu_attn.py +++ b/vllm_gaudi/attention/backends/hpu_attn.py @@ -9,11 +9,11 @@ from typing import Any, Optional import torch -import vllm_hpu.extension.kernels as kernels -import vllm_hpu.extension.ops as ops -from vllm_hpu.extension.runtime import get_config -from vllm_hpu.extension.utils import (FP8Matmul, Matmul, ModuleFusedSDPA, - Softmax, VLLMFP8KVCache, VLLMKVCache) +import vllm_gaudi.extension.kernels as kernels +import vllm_gaudi.extension.ops as ops +from vllm_gaudi.extension.runtime import get_config +from vllm_gaudi.extension.utils import (FP8Matmul, Matmul, ModuleFusedSDPA, + Softmax, VLLMFP8KVCache, VLLMKVCache) from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionLayer, @@ -21,8 +21,8 @@ from vllm.attention.backends.mla.common import MLACommonImpl from vllm.attention.backends.utils import CommonAttentionState -from vllm_hpu.attention.ops.hpu_paged_attn import (HPUPagedAttention, - HPUPagedAttentionMetadata) +from vllm_gaudi.attention.ops.hpu_paged_attn import (HPUPagedAttention, + HPUPagedAttentionMetadata) from vllm.logger import init_logger logger = init_logger(__name__) diff --git a/vllm_gaudi/attention/ops/hpu_paged_attn.py b/vllm_gaudi/attention/ops/hpu_paged_attn.py index 27775a3b4d..c7df848a50 100644 --- a/vllm_gaudi/attention/ops/hpu_paged_attn.py +++ b/vllm_gaudi/attention/ops/hpu_paged_attn.py @@ -8,7 +8,7 @@ from typing import Optional import torch -from vllm_hpu.extension import cache_ops, ops +from vllm_gaudi.extension import cache_ops, ops # Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. _PARTITION_SIZE = 512 diff --git a/vllm_gaudi/extension/bucketing/common.py b/vllm_gaudi/extension/bucketing/common.py index 1729188974..c6d7e6227c 100644 --- a/vllm_gaudi/extension/bucketing/common.py +++ b/vllm_gaudi/extension/bucketing/common.py @@ -42,9 +42,9 @@ def get_bucketing_context(): use_exponential_bucketing = os.environ.get( 'VLLM_EXPONENTIAL_BUCKETING', 'true').lower() == 'true' if use_exponential_bucketing: - from vllm_hpu.extension.bucketing.exponential import ( + from vllm_gaudi.extension.bucketing.exponential import ( HPUExponentialBucketingContext as HPUBucketingContext) else: - from vllm_hpu.extension.bucketing.linear import HPUBucketingContext + from vllm_gaudi.extension.bucketing.linear import HPUBucketingContext return HPUBucketingContext \ No newline at end of file diff --git a/vllm_gaudi/extension/bucketing/exponential.py b/vllm_gaudi/extension/bucketing/exponential.py index 796fc06920..ee3e82593a 100644 --- a/vllm_gaudi/extension/bucketing/exponential.py +++ b/vllm_gaudi/extension/bucketing/exponential.py @@ -7,7 +7,7 @@ from typing import Set, Tuple from .common import WeakSingleton -from vllm_hpu.extension.runtime import get_config +from vllm_gaudi.extension.runtime import get_config logger = logging.getLogger(__name__) diff --git a/vllm_gaudi/extension/bucketing/linear.py b/vllm_gaudi/extension/bucketing/linear.py index 1bdd07d819..922529af80 100644 --- a/vllm_gaudi/extension/bucketing/linear.py +++ b/vllm_gaudi/extension/bucketing/linear.py @@ -6,7 +6,7 @@ from typing import Tuple from .common import WeakSingleton -from vllm_hpu.extension.runtime import get_config +from vllm_gaudi.extension.runtime import get_config logger = logging.getLogger(__name__) diff --git a/vllm_gaudi/extension/environment.py b/vllm_gaudi/extension/environment.py index 008d2f73e6..ba307d4db9 100644 --- a/vllm_gaudi/extension/environment.py +++ b/vllm_gaudi/extension/environment.py @@ -20,7 +20,7 @@ def _get_hw(_): return "gaudi2" case htexp.synDeviceType.synDeviceGaudi3: return "gaudi3" - from vllm_hpu.extension.utils import is_fake_hpu + from vllm_gaudi.extension.utils import is_fake_hpu if is_fake_hpu(): return "cpu" logger().warning(f'Unknown device type: {device_type}') @@ -40,7 +40,7 @@ def _get_build(_): if output.returncode == 0 and match: return match.group('version') # In cpu-test environment we don't have access to habana-torch-plugin - from vllm_hpu.extension.utils import is_fake_hpu + from vllm_gaudi.extension.utils import is_fake_hpu result = '0.0.0.0' if is_fake_hpu() else None logger().warning(f"Unable to detect habana-torch-plugin version! Returning: {result}") return result @@ -58,7 +58,7 @@ def set_vllm_config(cfg): # t.compile is very picky about what functions we can call inside modules # since this is the last step we can force recompilation of config to # ensure all values are computed before entering the model - from vllm_hpu.extension.runtime import get_config + from vllm_gaudi.extension.runtime import get_config get_config().finalize() diff --git a/vllm_gaudi/extension/features.py b/vllm_gaudi/extension/features.py index 5e43b2fa54..b2be15f3ce 100644 --- a/vllm_gaudi/extension/features.py +++ b/vllm_gaudi/extension/features.py @@ -5,11 +5,11 @@ # LICENSE file in the root directory of this source tree. ############################################################################### -from vllm_hpu.extension.config import (Not, Hardware, VersionRange, ModelType, +from vllm_gaudi.extension.config import (Not, Hardware, VersionRange, ModelType, Kernel, FirstEnabled, All, Value, Env, Disabled, Engine, choice, boolean, to_dict, split_values_and_flags) -from vllm_hpu.extension.kernels import fsdpa, block_softmax_adjustment +from vllm_gaudi.extension.kernels import fsdpa, block_softmax_adjustment def get_user_flags(): diff --git a/vllm_gaudi/extension/ops.py b/vllm_gaudi/extension/ops.py index ebfe65589f..bf5e577d60 100644 --- a/vllm_gaudi/extension/ops.py +++ b/vllm_gaudi/extension/ops.py @@ -12,7 +12,7 @@ import torch.nn.functional as F import math import habana_frameworks.torch.core as htcore -from vllm_hpu.extension.runtime import get_config +from vllm_gaudi.extension.runtime import get_config import habana_frameworks.torch.utils.experimental as htexp is_hpu_gaudi2 = htexp._get_device_type() == htexp.synDeviceType.synDeviceGaudi2 diff --git a/vllm_gaudi/extension/profiler.py b/vllm_gaudi/extension/profiler.py index 808fefe3fd..a32a1ca56d 100644 --- a/vllm_gaudi/extension/profiler.py +++ b/vllm_gaudi/extension/profiler.py @@ -15,7 +15,7 @@ import uuid from habana_frameworks.torch import torch -from vllm_hpu.extension.utils import is_fake_hpu +from vllm_gaudi.extension.utils import is_fake_hpu from .logger import logger class FileWriter(threading.Thread): diff --git a/vllm_gaudi/extension/runtime.py b/vllm_gaudi/extension/runtime.py index 9db3c157a4..5123592881 100644 --- a/vllm_gaudi/extension/runtime.py +++ b/vllm_gaudi/extension/runtime.py @@ -6,10 +6,10 @@ ############################################################################### -from vllm_hpu.extension.environment import get_environment -from vllm_hpu.extension.features import get_features, get_user_flags, get_experimental_flags -from vllm_hpu.extension.config import Config -from vllm_hpu.extension.logger import logger +from vllm_gaudi.extension.environment import get_environment +from vllm_gaudi.extension.features import get_features, get_user_flags, get_experimental_flags +from vllm_gaudi.extension.config import Config +from vllm_gaudi.extension.logger import logger DETECTED = None diff --git a/vllm_gaudi/extension/test_flags.py b/vllm_gaudi/extension/test_flags.py index 03c40316c4..8b1dce568e 100644 --- a/vllm_gaudi/extension/test_flags.py +++ b/vllm_gaudi/extension/test_flags.py @@ -7,7 +7,7 @@ import os import pytest -from vllm_hpu.extension.config import VersionRange, Config, Kernel, Env, boolean, All, Not, Eq, Enabled, FirstEnabled, choice +from vllm_gaudi.extension.config import VersionRange, Config, Kernel, Env, boolean, All, Not, Eq, Enabled, FirstEnabled, choice def with_cfg(fn): diff --git a/vllm_gaudi/extension/utils.py b/vllm_gaudi/extension/utils.py index 3b7bb4ebea..b458eb057c 100644 --- a/vllm_gaudi/extension/utils.py +++ b/vllm_gaudi/extension/utils.py @@ -11,7 +11,7 @@ import habana_frameworks.torch as htorch import torch -from vllm_hpu.extension.runtime import get_config +from vllm_gaudi.extension.runtime import get_config @lru_cache(maxsize=None) diff --git a/vllm_gaudi/ops/hpu_compressed_tensors.py b/vllm_gaudi/ops/hpu_compressed_tensors.py index 7291610dc2..238a3206a0 100644 --- a/vllm_gaudi/ops/hpu_compressed_tensors.py +++ b/vllm_gaudi/ops/hpu_compressed_tensors.py @@ -19,8 +19,8 @@ compressed_tensors_moe) from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe import ( # noqa: E501 CompressedTensorsW8A8Fp8MoEMethod) -import vllm_hpu.extension.ops as hpu_ops -from vllm_hpu.extension.ops import VllmMixtureOfExpertsOpFP8PerChannel +import vllm_gaudi.extension.ops as hpu_ops +from vllm_gaudi.extension.ops import VllmMixtureOfExpertsOpFP8PerChannel SUPPORTED_STRATEGIES = [ QuantizationStrategy.CHANNEL, QuantizationStrategy.TENSOR diff --git a/vllm_gaudi/ops/hpu_fp8.py b/vllm_gaudi/ops/hpu_fp8.py index 9ffe22e510..8aa43c69f6 100644 --- a/vllm_gaudi/ops/hpu_fp8.py +++ b/vllm_gaudi/ops/hpu_fp8.py @@ -10,9 +10,9 @@ OrigFp8LinearMethod, Fp8MoEMethod, Fp8Config) -import vllm_hpu.extension.ops as hpu_ops -from vllm_hpu.extension.ops import (VllmMixtureOfExpertsOpFP8PerChannel, - VllmMixtureOfExpertsOpFP8) +import vllm_gaudi.extension.ops as hpu_ops +from vllm_gaudi.extension.ops import (VllmMixtureOfExpertsOpFP8PerChannel, + VllmMixtureOfExpertsOpFP8) class Fp8LinearMethod(OrigFp8LinearMethod): diff --git a/vllm_gaudi/ops/hpu_fused_moe.py b/vllm_gaudi/ops/hpu_fused_moe.py index 4d06b1a2f5..2b115a3ba3 100644 --- a/vllm_gaudi/ops/hpu_fused_moe.py +++ b/vllm_gaudi/ops/hpu_fused_moe.py @@ -3,7 +3,7 @@ import torch from vllm.model_executor.layers.fused_moe.layer import ( FusedMoE, UnquantizedFusedMoEMethod) -from vllm_hpu.extension.ops import (VllmMixtureOfExpertsOp) +from vllm_gaudi.extension.ops import (VllmMixtureOfExpertsOp) @UnquantizedFusedMoEMethod.register_oot diff --git a/vllm_gaudi/ops/hpu_layernorm.py b/vllm_gaudi/ops/hpu_layernorm.py index 5e0cb75be1..1cc4d5f9c4 100644 --- a/vllm_gaudi/ops/hpu_layernorm.py +++ b/vllm_gaudi/ops/hpu_layernorm.py @@ -12,7 +12,7 @@ def forward_oot( x: torch.Tensor, residual: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: - from vllm_hpu.extension.kernels import rms_norm + from vllm_gaudi.extension.kernels import rms_norm HPUFusedRMSNorm = rms_norm() if residual is not None: orig_shape = x.shape diff --git a/vllm_gaudi/v1/attention/backends/hpu_attn.py b/vllm_gaudi/v1/attention/backends/hpu_attn.py index e3d83cb6ff..c20bc9f895 100644 --- a/vllm_gaudi/v1/attention/backends/hpu_attn.py +++ b/vllm_gaudi/v1/attention/backends/hpu_attn.py @@ -10,8 +10,8 @@ import torch from vllm.attention.backends.abstract import AttentionMetadata -from vllm_hpu.attention.backends.hpu_attn import (HPUAttentionBackend, - HPUAttentionMetadata) +from vllm_gaudi.attention.backends.hpu_attn import (HPUAttentionBackend, + HPUAttentionMetadata) from vllm.logger import init_logger logger = init_logger(__name__) diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py index 9d827c5455..7d7d3a05a2 100644 --- a/vllm_gaudi/v1/worker/hpu_model_runner.py +++ b/vllm_gaudi/v1/worker/hpu_model_runner.py @@ -14,9 +14,9 @@ import numpy as np import torch import torch.distributed -import vllm_hpu.extension.environment as environment -from vllm_hpu.extension.profiler import HabanaMemoryProfiler, format_bytes -from vllm_hpu.extension.runtime import get_config +import vllm_gaudi.extension.environment as environment +from vllm_gaudi.extension.profiler import HabanaMemoryProfiler, format_bytes +from vllm_gaudi.extension.runtime import get_config from vllm.attention.backends.abstract import AttentionType from vllm.attention.layer import Attention @@ -32,10 +32,10 @@ from vllm.model_executor.model_loader import get_model from vllm.sampling_params import SamplingType from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs -from vllm_hpu.utils import is_fake_hpu +from vllm_gaudi.utils import is_fake_hpu from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, LayerBlockType, cdiv, is_pin_memory_available) -from vllm_hpu.v1.attention.backends.hpu_attn import HPUAttentionMetadataV1 +from vllm_gaudi.v1.attention.backends.hpu_attn import HPUAttentionMetadataV1 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheSpec) from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, LogprobsTensors, @@ -43,13 +43,13 @@ from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.utils import bind_kv_cache from vllm.v1.worker.gpu_input_batch import CachedRequestState -from vllm_hpu.v1.worker.hpu_input_batch import InputBatch +from vllm_gaudi.v1.worker.hpu_input_batch import InputBatch from vllm.distributed.parallel_state import get_pp_group if TYPE_CHECKING: from vllm.v1.core.scheduler import SchedulerOutput -from vllm_hpu.extension.bucketing.common import get_bucketing_context +from vllm_gaudi.extension.bucketing.common import get_bucketing_context logger = init_logger(__name__) diff --git a/vllm_gaudi/v1/worker/hpu_worker.py b/vllm_gaudi/v1/worker/hpu_worker.py index 87b7cdac84..df45293ece 100644 --- a/vllm_gaudi/v1/worker/hpu_worker.py +++ b/vllm_gaudi/v1/worker/hpu_worker.py @@ -9,7 +9,7 @@ import torch import torch.distributed import torch.nn as nn -from vllm_hpu.extension.profiler import HabanaMemoryProfiler, format_bytes +from vllm_gaudi.extension.profiler import HabanaMemoryProfiler, format_bytes import vllm.envs as envs from vllm.config import ParallelConfig, VllmConfig @@ -18,12 +18,12 @@ from vllm.logger import init_logger from vllm.model_executor import set_random_seed from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE -from vllm_hpu.utils import is_fake_hpu +from vllm_gaudi.utils import is_fake_hpu from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheSpec) from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.utils import bind_kv_cache -from vllm_hpu.v1.worker.hpu_model_runner import HPUModelRunner, bool_helper +from vllm_gaudi.v1.worker.hpu_model_runner import HPUModelRunner, bool_helper logger = init_logger(__name__) From 313887ccf61909821d75e0e80756950b8d80e001 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 11 Jul 2025 13:56:58 +0300 Subject: [PATCH 4/5] mypy fix Signed-off-by: Konrad Zawora --- tools/mypy.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/mypy.sh b/tools/mypy.sh index 0f9c6a312d..7f21dc9aad 100755 --- a/tools/mypy.sh +++ b/tools/mypy.sh @@ -22,8 +22,8 @@ run_mypy() { run_mypy # Note that this is less strict than CI run_mypy tests -run_mypy vllm_hpu/attention -run_mypy vllm_hpu/distributed -#run_mypy vllm_hpu/extension # NOTE(kzawora): re-enable this once extension refactor is ready -run_mypy vllm_hpu/ops -run_mypy vllm_hpu/v1 +run_mypy vllm_gaudi/attention +run_mypy vllm_gaudi/distributed +#run_mypy vllm_gaudi/extension # NOTE(kzawora): re-enable this once extension refactor is ready +run_mypy vllm_gaudi/ops +run_mypy vllm_gaudi/v1 From fbd0f73f39765d04683acd78f1ed7e63b3a168c4 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 11 Jul 2025 14:04:42 +0300 Subject: [PATCH 5/5] fix leftovers Signed-off-by: Konrad Zawora --- docs/api/README.md | 2 +- setup.py | 10 +++++----- vllm_gaudi/ops/__init__.py | 12 ++++++------ vllm_gaudi/ops/hpu_fp8.py | 2 +- vllm_gaudi/platform.py | 5 +++-- 5 files changed, 16 insertions(+), 15 deletions(-) diff --git a/docs/api/README.md b/docs/api/README.md index 973b8233d6..4eda6e5a3f 100644 --- a/docs/api/README.md +++ b/docs/api/README.md @@ -2,4 +2,4 @@ [](){ #pkg_overview } ## Full package overview -::: vllm_hpu +::: vllm_gaudi diff --git a/setup.py b/setup.py index 1bca1a9fc3..517de49e5e 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ from setuptools_scm import get_version try: - VERSION = get_version(write_to="vllm_hpu/_version.py") + VERSION = get_version(write_to="vllm_gaudi/_version.py") except LookupError: # The checkout action in github action CI does not checkout the tag. It # only checks out the commit. In this case, we set a dummy version. @@ -39,12 +39,12 @@ def _read_requirements(filename: str) -> list[str]: try: requirements = _read_requirements("requirements.txt") except ValueError: - print("Failed to read requirements.txt in vllm_hpu.") + print("Failed to read requirements.txt in vllm_gaudi.") return requirements setup( - name="vllm_hpu", + name="vllm_gaudi", version=VERSION, author="Intel", long_description="Intel Gaudi plugin package for vLLM.", @@ -63,7 +63,7 @@ def _read_requirements(filename: str) -> list[str]: ext_modules=ext_modules, extras_require={}, entry_points={ - "vllm.platform_plugins": ["hpu = vllm_hpu:register"], - "vllm.general_plugins": ["hpu_custom_ops = vllm_hpu:register_ops"], + "vllm.platform_plugins": ["hpu = vllm_gaudi:register"], + "vllm.general_plugins": ["hpu_custom_ops = vllm_gaudi:register_ops"], }, ) diff --git a/vllm_gaudi/ops/__init__.py b/vllm_gaudi/ops/__init__.py index 8b983e676a..9e74fe0e50 100644 --- a/vllm_gaudi/ops/__init__.py +++ b/vllm_gaudi/ops/__init__.py @@ -1,6 +1,6 @@ -import vllm_hpu.ops.hpu_fused_moe # noqa -import vllm_hpu.ops.hpu_layernorm # noqa -import vllm_hpu.ops.hpu_lora # noqa -import vllm_hpu.ops.hpu_rotary_embedding # noqa -import vllm_hpu.ops.hpu_compressed_tensors # noqa -import vllm_hpu.ops.hpu_fp8 # noqa \ No newline at end of file +import vllm_gaudi.ops.hpu_fused_moe # noqa +import vllm_gaudi.ops.hpu_layernorm # noqa +import vllm_gaudi.ops.hpu_lora # noqa +import vllm_gaudi.ops.hpu_rotary_embedding # noqa +import vllm_gaudi.ops.hpu_compressed_tensors # noqa +import vllm_gaudi.ops.hpu_fp8 # noqa \ No newline at end of file diff --git a/vllm_gaudi/ops/hpu_fp8.py b/vllm_gaudi/ops/hpu_fp8.py index 8aa43c69f6..ec371bba99 100644 --- a/vllm_gaudi/ops/hpu_fp8.py +++ b/vllm_gaudi/ops/hpu_fp8.py @@ -1,7 +1,7 @@ from typing import Callable, Optional import torch -from vllm_hpu import envs +from vllm_gaudi import envs from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.fused_moe.layer import FusedMoE diff --git a/vllm_gaudi/platform.py b/vllm_gaudi/platform.py index ad12503659..0cc7f81340 100644 --- a/vllm_gaudi/platform.py +++ b/vllm_gaudi/platform.py @@ -37,10 +37,11 @@ def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, use_mla: bool) -> str: if use_v1 and not use_mla: logger.info("Using HPUAttentionV1 backend.") - return "vllm_hpu.attention.backends.hpu_attn.HPUAttentionBackend" + return "vllm_gaudi.attention.backends.hpu_attn.HPUAttentionBackend" if use_v1 and use_mla: logger.info("Using HPUAttentionMLA backend.") - return "vllm_hpu.attention.backends.hpu_attn.HPUMLAAttentionBackend" + return ("vllm_gaudi.attention.backends.hpu_attn." + "HPUMLAAttentionBackend") # Fall back to in-tree HPUAttention backend if use_mla: