diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml
new file mode 100644
index 0000000000..5e31008138
--- /dev/null
+++ b/.github/actionlint.yaml
@@ -0,0 +1,5 @@
+# Labels of self-hosted runners referenced by workflows; listing them here
+# stops actionlint from reporting them as unknown runner labels.
+self-hosted-runner:
+  labels:
+    - ucb-vllm-cicd-g2
diff --git a/.github/workflows/matchers/actionlint.json b/.github/workflows/matchers/actionlint.json
new file mode 100644
index 0000000000..4613e1617b
--- /dev/null
+++ b/.github/workflows/matchers/actionlint.json
@@ -0,0 +1,17 @@
+{
+  "problemMatcher": [
+    {
+      "owner": "actionlint",
+      "pattern": [
+        {
+          "regexp": "^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: (?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$",
+          "file": 1,
+          "line": 2,
+          "column": 3,
+          "message": 4,
+          "code": 5
+        }
+      ]
+    }
+  ]
+}
diff --git a/.github/workflows/matchers/mypy.json b/.github/workflows/matchers/mypy.json
new file mode 100644
index 0000000000..f048fce528
--- /dev/null
+++ b/.github/workflows/matchers/mypy.json
@@ -0,0 +1,16 @@
+{
+  "problemMatcher": [
+    {
+      "owner": "mypy",
+      "pattern": [
+        {
+          "regexp": "^(.+):(\\d+):\\s(error|warning):\\s(.+)$",
+          "file": 1,
+          "line": 2,
+          "severity": 3,
+          "message": 4
+        }
+      ]
+    }
+  ]
+}
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
new file mode 100644
index 0000000000..6ab63a4027
--- /dev/null
+++ b/.github/workflows/pre-commit.yml
@@ -0,0 +1,28 @@
+name: pre-commit
+
+on:
+  pull_request:
+  push:
+    branches: [main]
+
+# Least privilege: this workflow only needs to read the repository contents.
+permissions:
+  contents: read
+
+jobs:
+  pre-commit:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+    - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
+      with:
+        python-version: "3.12"
+    # Register problem matchers so actionlint and mypy findings are surfaced
+    # as GitHub annotations instead of only appearing in the raw log.
+    - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
+    - run: echo "::add-matcher::.github/workflows/matchers/mypy.json"
+    - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
+      with:
+        # Check every file and run in the "manual" stage so that hooks
+        # restricted to `stages: [manual]` are included.
+        extra_args: --all-files --hook-stage manual
diff --git a/.gitignore b/.gitignore
index 4a9f5518b4..0f414a587f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,6 @@
 # version file generated by setuptools-scm
 /vllm_hpu/_version.py
 
-
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 8e13940353..9224999e5e 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -47,13 +47,6 @@ repos:
     types: [python]
     additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
     stages: [manual] # Don't run in CI
-  - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
-    name: Run mypy for Python 3.9
-    entry: tools/mypy.sh 1 "3.9"
-    language: python
-    types: [python]
-    additional_dependencies: *mypy_deps
-    stages: [manual] # Only run in CI
   - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
     name: Run mypy for Python 3.10
     entry: tools/mypy.sh 1 "3.10"
diff --git a/README.md b/README.md
index 49716e7bdf..55e0b8e397 100644
--- a/README.md
+++ b/README.md
@@ -9,8 +9,6 @@
   <img src="./docs/assets/logos/gaudi-logo.png" alt="Intel-Gaudi" width="30%">
 </p>
 
-
-
 vLLM Gaudi plugin (vllm-gaudi) integrates Intel Gaudi accelerators with vLLM to optimize large language model inference.
 
 This plugin follows the [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162) and [[RFC]: Enhancing vLLM Plugin Architecture](https://github.com/vllm-project/vllm/issues/19161) principles, providing a modular interface for Intel Gaudi hardware.
diff --git a/docs/README.md b/docs/README.md
index ff93fbc031..c0b9069a57 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -22,4 +22,4 @@ This plugin follows the [[RFC]: Hardware pluggable](https://github.com/vllm-proj
 Learn more:
 
 📚 [Intel Gaudi Documentation](https://docs.habana.ai/en/v1.21.1/index.html)  
-🚀 [vLLM Plugin System Overview](https://docs.vllm.ai/en/latest/design/plugin_system.html)
\ No newline at end of file
+🚀 [vLLM Plugin System Overview](https://docs.vllm.ai/en/latest/design/plugin_system.html)
diff --git a/docs/api/README.md b/docs/api/README.md
index 6115cf2d38..973b8233d6 100644
--- a/docs/api/README.md
+++ b/docs/api/README.md
@@ -1,5 +1,5 @@
 # Summary
 
 [](){ #pkg_overview }
-### Full package overview
+## Full package overview
 ::: vllm_hpu
diff --git a/docs/configuration/README.md b/docs/configuration/README.md
index f8f6749f79..5f187c8c98 100644
--- a/docs/configuration/README.md
+++ b/docs/configuration/README.md
@@ -1,3 +1,3 @@
 # Configuration Options
 
-WIP
\ No newline at end of file
+WIP
diff --git a/docs/configuration/long_context.md b/docs/configuration/long_context.md
index cc307b6548..0ec99f4606 100644
--- a/docs/configuration/long_context.md
+++ b/docs/configuration/long_context.md
@@ -55,4 +55,4 @@ Sequence group cmpl-3cbf19b0c6d74b3f90b5d5db2ed2385e-0 is preempted by Preemptio
 
 ## Multi-Step Scheduling Feature Usage
 
-Enabling Multi-Step Scheduling is recommended for better decode performance. Refer to vllm-project#6854 for more details.
\ No newline at end of file
+Enabling Multi-Step Scheduling is recommended for better decode performance. Refer to vllm-project#6854 for more details.
diff --git a/docs/configuration/multi_node.md b/docs/configuration/multi_node.md
index 06692cea65..eab902e21e 100644
--- a/docs/configuration/multi_node.md
+++ b/docs/configuration/multi_node.md
@@ -61,4 +61,4 @@ Please refer to this [collection](https://github.com/HabanaAI/Gaudi-tutorials/tr
 - llama-3.1-8b-instruct_gaudi3_1.20_contextlen-2k
 - llama-3.1-8b-instruct_gaudi3_1.20_contextlen-4k
 - llama-3.3-70b-instruct_gaudi3_1.20_contextlen-2k
-- llama-3.3-70b-instruct_gaudi3_1.20_contextlen-4k
\ No newline at end of file
+- llama-3.3-70b-instruct_gaudi3_1.20_contextlen-4k
diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md
index edf204480b..b6da15a893 100644
--- a/docs/configuration/optimization.md
+++ b/docs/configuration/optimization.md
@@ -1,3 +1,3 @@
 # Optimization and Tuning
 
-WIP
\ No newline at end of file
+WIP
diff --git a/docs/design/plugin_system.md b/docs/design/plugin_system.md
index 312767bb95..0db0db58d6 100644
--- a/docs/design/plugin_system.md
+++ b/docs/design/plugin_system.md
@@ -3,4 +3,4 @@ title: vLLM's Plugin System
 ---
 [](){ #plugin-system }
 
-WIP
\ No newline at end of file
+WIP
diff --git a/docs/dev_guide/README.md b/docs/dev_guide/README.md
index 61522d97c7..3f34007e2e 100644
--- a/docs/dev_guide/README.md
+++ b/docs/dev_guide/README.md
@@ -1,3 +1,3 @@
 # Developer Guide
 
-WIP
\ No newline at end of file
+WIP
diff --git a/docs/dev_guide/ci-failures.md b/docs/dev_guide/ci-failures.md
index 3b24621834..96f897c5fb 100644
--- a/docs/dev_guide/ci-failures.md
+++ b/docs/dev_guide/ci-failures.md
@@ -1,3 +1,3 @@
 # CI Failures
 
-WIP
\ No newline at end of file
+WIP
diff --git a/docs/dev_guide/profiling.md b/docs/dev_guide/profiling.md
index 406990eeec..f8795463ba 100644
--- a/docs/dev_guide/profiling.md
+++ b/docs/dev_guide/profiling.md
@@ -1,3 +1,3 @@
 # Profiling vLLM
 
-WIP
\ No newline at end of file
+WIP
diff --git a/docs/features/bucketing_mechanism.md b/docs/features/bucketing_mechanism.md
index 5659d7acc7..88d1f7d82a 100644
--- a/docs/features/bucketing_mechanism.md
+++ b/docs/features/bucketing_mechanism.md
@@ -156,4 +156,4 @@ INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) u
 INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
 INFO 08-02 17:38:43 hpu_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory
 INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used)
-```
\ No newline at end of file
+```
diff --git a/docs/features/compatibility_matrix.md b/docs/features/compatibility_matrix.md
index 6e9226815f..7d68a744f9 100644
--- a/docs/features/compatibility_matrix.md
+++ b/docs/features/compatibility_matrix.md
@@ -3,4 +3,4 @@ title: Compatibility Matrix
 ---
 [](){ #compatibility-matrix }
 
-WIP
\ No newline at end of file
+WIP
diff --git a/docs/features/quantization/inc.md b/docs/features/quantization/inc.md
index dba5b612d0..d97a462f54 100644
--- a/docs/features/quantization/inc.md
+++ b/docs/features/quantization/inc.md
@@ -53,4 +53,4 @@ llm.llm_engine.model_executor.shutdown()
 ## Device for the Model's Weights Uploading
 
 The unquantized weights are first loaded onto the CPU, then quantized and transferred to the target device (HPU) for model execution.
-This reduces the device memory footprint of model weights, as only quantized weights are stored in the device memory.
\ No newline at end of file
+This reduces the device memory footprint of model weights, as only quantized weights are stored in the device memory.
diff --git a/docs/getting_started/installation.md b/docs/getting_started/installation.md
index a6b83dd86e..8125c5365a 100644
--- a/docs/getting_started/installation.md
+++ b/docs/getting_started/installation.md
@@ -15,7 +15,6 @@ This guide provides instructions on running vLLM with Intel Gaudi devices.
     To achieve the best performance on HPU, please follow the methods outlined in the
     [Optimizing Training Platform Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html).
 
-
 ## Quick Start Using Dockerfile
 # --8<-- [start:docker_quickstart]
 Set up the container with the latest Intel Gaudi Software Suite release using the Dockerfile.
@@ -46,12 +45,10 @@ Set up the container with the latest Intel Gaudi Software Suite release using th
 ### Environment Verification
 To verify that the Intel Gaudi software was correctly installed, run the following:
 
-```{.console}
-$ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible
-$ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed
-$ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed
-$ pip list | grep neural # verify that neural-compressor is installed
-```
+    $ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible
+    $ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed
+    $ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed
+    $ pip list | grep neural # verify that neural-compressor is installed
 
 Refer to [System Verification and Final Tests](https://docs.habana.ai/en/latest/Installation_Guide/System_Verification_and_Final_Tests.html) for more details.
 
@@ -62,10 +59,8 @@ Refer to the [Intel Gaudi documentation](https://docs.habana.ai/en/latest/Instal
 
 Use the following commands to run a Docker image. Make sure to update the versions below as listed in the [Support Matrix](https://docs.habana.ai/en/latest/Support_Matrix/Support_Matrix.html):
 
-```{.console}
-docker pull vault.habana.ai/gaudi-docker/1.21.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
-docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.21.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
-```
+    docker pull vault.habana.ai/gaudi-docker/1.21.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
+    docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.21.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
 
 ### Build and Install vLLM
 
@@ -119,4 +114,3 @@ Currently, multiple ways are provided which can be used to install vLLM with Int
     cd vllm-hpu
     pip install -e .
     ```
-
diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md
index 0354d1d375..feaf0409ab 100644
--- a/docs/getting_started/quickstart.md
+++ b/docs/getting_started/quickstart.md
@@ -19,7 +19,6 @@ This guide will help you quickly get started with vLLM to perform:
     To achieve the best performance on HPU, please follow the methods outlined in the
     [Optimizing Training Platform Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html).
 
-
 ## Quick Start Using Dockerfile
 
 --8<-- "docs/getting_started/installation.md:docker_quickstart"
@@ -54,4 +53,4 @@ This guide will help you quickly get started with vLLM to perform:
 
 === "OpenAI Chat Completions API with vLLM"
 
-    WIP
\ No newline at end of file
+    WIP
diff --git a/docs/user_guide/README.md b/docs/user_guide/README.md
index d3174ec0bb..72d69292c2 100644
--- a/docs/user_guide/README.md
+++ b/docs/user_guide/README.md
@@ -1,3 +1,3 @@
 # Using vLLM x Intel Gaudi
 
-WIP
\ No newline at end of file
+WIP
diff --git a/docs/user_guide/faq.md b/docs/user_guide/faq.md
index e4ad4322dd..ced0f390f1 100644
--- a/docs/user_guide/faq.md
+++ b/docs/user_guide/faq.md
@@ -3,4 +3,4 @@ title: Frequently Asked Questions
 ---
 [](){ #faq }
 
-WIP
\ No newline at end of file
+WIP
diff --git a/docs/user_guide/metrics.md b/docs/user_guide/metrics.md
index 318558e455..44f5e49f7c 100644
--- a/docs/user_guide/metrics.md
+++ b/docs/user_guide/metrics.md
@@ -1,3 +1,3 @@
 # Metrics
 
-WIP
\ No newline at end of file
+WIP
diff --git a/docs/user_guide/troubleshooting.md b/docs/user_guide/troubleshooting.md
index 1b0190e4b9..b07b113b8e 100644
--- a/docs/user_guide/troubleshooting.md
+++ b/docs/user_guide/troubleshooting.md
@@ -3,4 +3,4 @@ title: Troubleshooting
 ---
 [](){ #troubleshooting }
 
-WIP
\ No newline at end of file
+WIP
diff --git a/docs/user_guide/v1_guide.md b/docs/user_guide/v1_guide.md
index a0ccd86bc0..407f1206ee 100644
--- a/docs/user_guide/v1_guide.md
+++ b/docs/user_guide/v1_guide.md
@@ -1,3 +1,3 @@
 # vLLM V1 Support
 
-WIP
\ No newline at end of file
+WIP
diff --git a/pyproject.toml b/pyproject.toml
index 00a2a05bdf..130a3c1ae8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -16,7 +16,6 @@ license = "Apache-2.0"
 readme = "README.md"
 description = "HPU plugin package for vLLM."
 classifiers = [
-    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
@@ -42,11 +41,13 @@ include = ["vllm_hpu"]
 
 [tool.yapfignore]
 ignore_patterns = [
     "build/**",
+    "vllm_hpu/extension/**",  # NOTE(kzawora): re-enable this once extension refactor is ready
 ]
 
 [tool.ruff]
 # Allow lines to be as long as 80.
 line-length = 80
+extend-exclude = ["vllm_hpu/extension/**"]  # NOTE(kzawora): re-enable this once extension refactor is ready
 
 [tool.ruff.lint]
@@ -79,10 +80,18 @@ ignore = [
 ]
 
 [tool.mypy]
+plugins = ['pydantic.mypy']
 ignore_missing_imports = true
+explicit_package_bases = true
 check_untyped_defs = true
 follow_imports = "silent"
 
+# After fixing type errors resulting from follow_imports: "skip" -> "silent",
+# move the directory here and remove it from tools/mypy.sh
+files = [
+    "vllm_hpu/*.py",
+]
+
 
 [tool.codespell]
 ignore-words-list = "dout, te, indicies, subtile, ElementE"
diff --git a/setup.py b/setup.py
index ea056549e2..1bca1a9fc3 100644
--- a/setup.py
+++ b/setup.py
@@ -1,10 +1,5 @@
-import importlib.util
 import logging
 import os
-import subprocess
-import sys
-from sysconfig import get_paths
-from typing import Dict, List
 
 from setuptools import setup, find_packages
 from setuptools_scm import get_version
@@ -20,13 +15,15 @@
 logger = logging.getLogger(__name__)
 ext_modules = []
 
+
 def get_path(*filepath) -> str:
     return os.path.join(ROOT_DIR, *filepath)
 
-def get_requirements() -> List[str]:
+
+def get_requirements() -> list[str]:
     """Get Python package dependencies from requirements.txt."""
 
-    def _read_requirements(filename: str) -> List[str]:
+    def _read_requirements(filename: str) -> list[str]:
         with open(get_path(filename)) as f:
             requirements = f.read().strip().split("\n")
         resolved_requirements = []
@@ -44,16 +41,17 @@ def _read_requirements(filename: str) -> List[str]:
     except ValueError:
         print("Failed to read requirements.txt in vllm_hpu.")
     return requirements
-  
+
+
 setup(
     name="vllm_hpu",
     version=VERSION,
     author="Intel",
-    long_description="HPU plugin package for vLLM.",
+    long_description="Intel Gaudi plugin package for vLLM.",
     long_description_content_type="text/markdown",
-    url="https://github.com/vllm-project/vllm-hpu",
+    url="https://github.com/vllm-project/vllm-gaudi",
     project_urls={
-        "Homepage": "https://github.com/vllm-project/vllm-hpu",
+        "Homepage": "https://github.com/vllm-project/vllm-gaudi",
     },
     classifiers=[
         "Programming Language :: Python :: 3",
@@ -68,4 +66,4 @@ def _read_requirements(filename: str) -> List[str]:
         "vllm.platform_plugins": ["hpu = vllm_hpu:register"],
         "vllm.general_plugins": ["hpu_custom_ops = vllm_hpu:register_ops"],
     },
-)
\ No newline at end of file
+)
diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py
index d86e4d3842..6a1f07b1f2 100644
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -120,7 +120,6 @@ def launch_simple(eval_config):
     generated_text = ""
     for output in outputs:
         generated_text += output.outputs[0].text
-    found_countries = []
 
     european_countries = [
         "Albania", "Andorra", "Armenia", "Austria", "Azerbaijan", "Belarus",
@@ -134,7 +133,7 @@ def launch_simple(eval_config):
         "Spain", "Sweden", "Switzerland", "Turkey", "Ukraine",
         "United Kingdom", "Vatican City"
     ]
-    found_countries = []
+    found_countries: list[str] = []
     for country in european_countries:
         if country in generated_text:
             found_countries.append(country)
diff --git a/tools/mypy.sh b/tools/mypy.sh
index 9fc69dafcd..0f9c6a312d 100755
--- a/tools/mypy.sh
+++ b/tools/mypy.sh
@@ -21,8 +21,9 @@ run_mypy() {
 }
 
 run_mypy # Note that this is less strict than CI
+run_mypy tests
 run_mypy vllm_hpu/attention
 run_mypy vllm_hpu/distributed
+#run_mypy vllm_hpu/extension # NOTE(kzawora): re-enable this once extension refactor is ready
 run_mypy vllm_hpu/ops
-run_mypy vllm_hpu/worker
 run_mypy vllm_hpu/v1
diff --git a/vllm_hpu/__init__.py b/vllm_hpu/__init__.py
index 700c632b9f..1fe7cdcf14 100644
--- a/vllm_hpu/__init__.py
+++ b/vllm_hpu/__init__.py
@@ -5,7 +5,8 @@
 def register():
     """Register the HPU platform."""
     HpuPlatform.set_torch_compile()
-    if os.getenv("VLLM_WEIGHT_LOAD_FORCE_SYNC", "false").lower() in ("true", "1"):
+    if os.getenv("VLLM_WEIGHT_LOAD_FORCE_SYNC",
+                 "false").lower() in ("true", "1"):
         HpuPlatform.set_synchronized_weight_loader()
     return "vllm_hpu.platform.HpuPlatform"
 
diff --git a/vllm_hpu/attention/backends/hpu_attn.py b/vllm_hpu/attention/backends/hpu_attn.py
index 0a11485338..f6cb36461a 100644
--- a/vllm_hpu/attention/backends/hpu_attn.py
+++ b/vllm_hpu/attention/backends/hpu_attn.py
@@ -279,6 +279,8 @@ def _forward_prefill(  # type: ignore
         k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1)
 
         if not self.use_merged_prefill:
+            assert attn_metadata.seq_lens_tensor is not None, \
+                "seq_lens_tensor must be provided for prefill attention"
             batch_size = attn_metadata.seq_lens_tensor.shape[0]
         else:
             batch_size = 1
@@ -492,7 +494,7 @@ def forward(
         attn_metadata: HPUAttentionMetadata,
         output: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        """Forward pass with xFormers and PagedAttention.
+        """Forward pass with PagedAttention.
 
         Args:
             query: shape = [num_tokens, num_heads * head_size]
@@ -522,6 +524,8 @@ def forward(
                 else:
                     batch_size = 1
             else:
+                assert attn_metadata.block_mapping is not None, \
+                    "block_mapping must be provided for attention"
                 batch_size = attn_metadata.block_mapping.shape[1]
             num_tokens, hidden_size = query.shape
             seq_len = num_tokens // batch_size
diff --git a/vllm_hpu/attention/ops/hpu_paged_attn.py b/vllm_hpu/attention/ops/hpu_paged_attn.py
index 6bc608022a..27775a3b4d 100644
--- a/vllm_hpu/attention/ops/hpu_paged_attn.py
+++ b/vllm_hpu/attention/ops/hpu_paged_attn.py
@@ -5,7 +5,7 @@
 ###############################################################################
 
 from dataclasses import dataclass
-from typing import List, Optional, Tuple
+from typing import Optional
 
 import torch
 from vllm_hpu.extension import cache_ops, ops
@@ -27,7 +27,7 @@ class HPUPagedAttentionMetadata:
 class HPUPagedAttention:
 
     @staticmethod
-    def get_supported_head_sizes() -> List[int]:
+    def get_supported_head_sizes() -> list[int]:
         return list(range(1, 257))
 
     @staticmethod
@@ -36,7 +36,7 @@ def get_kv_cache_shape(
         block_size: int,
         num_kv_heads: int,
         head_size: int,
-    ) -> Tuple[int, ...]:
+    ) -> tuple[int, ...]:
         return (num_blocks * block_size, num_kv_heads, head_size)
 
     @staticmethod
@@ -44,7 +44,7 @@ def split_kv_cache(
         kv_cache: torch.Tensor,
         num_kv_heads: int,
         head_size: int,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         key_cache = kv_cache[0]
         value_cache = kv_cache[1]
         return key_cache, value_cache
@@ -66,8 +66,8 @@ def forward_decode(**kwargs) -> torch.Tensor:
 
     @staticmethod
     def swap_blocks(
-        src_kv_cache: Tuple[torch.Tensor, torch.Tensor],
-        dst_kv_cache: Tuple[torch.Tensor, torch.Tensor],
+        src_kv_cache: tuple[torch.Tensor, torch.Tensor],
+        dst_kv_cache: tuple[torch.Tensor, torch.Tensor],
         src_to_dsts: torch.Tensor,
     ) -> None:
         src_key_cache = src_kv_cache[0]
@@ -80,7 +80,7 @@ def swap_blocks(
 
     @staticmethod
     def copy_blocks(
-        kv_caches: List[Tuple[torch.Tensor, torch.Tensor]],
+        kv_caches: list[tuple[torch.Tensor, torch.Tensor]],
         src_to_dsts: torch.Tensor,
     ) -> None:
         key_caches = [kv_cache[0] for kv_cache in kv_caches]
diff --git a/vllm_hpu/distributed/device_communicators/hpu_communicator.py b/vllm_hpu/distributed/device_communicators/hpu_communicator.py
index 34b706c5ef..6bdaa43b2a 100644
--- a/vllm_hpu/distributed/device_communicators/hpu_communicator.py
+++ b/vllm_hpu/distributed/device_communicators/hpu_communicator.py
@@ -3,9 +3,8 @@
 import torch
 import torch.distributed as dist
 
-from vllm.platforms import current_platform
-
-from vllm.distributed.device_communicators.base_device_communicator import DeviceCommunicatorBase
+from vllm.distributed.device_communicators.base_device_communicator \
+    import DeviceCommunicatorBase
 
 import habana_frameworks.torch as htorch  # noqa: F401
 
@@ -41,4 +40,4 @@ def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
                                               (world_size *
                                                input_size[dim], ) +
                                               input_size[dim + 1:])
-        return output_tensor
\ No newline at end of file
+        return output_tensor
diff --git a/vllm_hpu/ops/hpu_lora.py b/vllm_hpu/ops/hpu_lora.py
index e106a117d2..9d254ff1c3 100644
--- a/vllm_hpu/ops/hpu_lora.py
+++ b/vllm_hpu/ops/hpu_lora.py
@@ -1,10 +1,11 @@
 import torch
 import torch.nn.functional as F
 from vllm.model_executor.custom_op import CustomOp
+from vllm.lora.layers import VocabParallelEmbeddingWithLoRA
 
 
 @CustomOp.register_oot(name='VocabParallelEmbeddingWithLoRA')
-class HPUVocabParallelEmbeddingWithLoRA:
+class HPUVocabParallelEmbeddingWithLoRA(VocabParallelEmbeddingWithLoRA):
 
     def forward_oot(self, x: torch.Tensor) -> torch.Tensor:
         # x need to reshaped into 2d as batch is there
diff --git a/vllm_hpu/ops/hpu_rotary_embedding.py b/vllm_hpu/ops/hpu_rotary_embedding.py
index cf64214155..14e426e7e4 100644
--- a/vllm_hpu/ops/hpu_rotary_embedding.py
+++ b/vllm_hpu/ops/hpu_rotary_embedding.py
@@ -669,7 +669,7 @@ def forward_oot(  # type: ignore[override]
         key: torch.Tensor,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Ensure the cache is on the right device.
-        self.cos_sin_cache = self.cos_sin_cache.to(query.device)
+        self.cos_sin_cache: torch.Tensor = self.cos_sin_cache.to(query.device)
         cos_cache, sin_cache = self.cos_sin_cache.chunk(2, dim=-1)
         # shape: [577, 1, 44]
 
diff --git a/vllm_hpu/platform.py b/vllm_hpu/platform.py
index b790f66897..f89ca3e16e 100644
--- a/vllm_hpu/platform.py
+++ b/vllm_hpu/platform.py
@@ -144,7 +144,7 @@ def set_torch_compile(cls) -> None:
             torch._dynamo.config.disable = True
             # NOTE multi-HPU inference with HPUGraphs (lazy-only)
             # requires enabling lazy collectives
-            # see https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html # noqa: E501
+            # see https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html  # noqa: E501
             os.environ['PT_HPU_ENABLE_LAZY_COLLECTIVES'] = 'true'
 
     @classmethod
@@ -156,12 +156,13 @@ def set_weight_attrs(
         ):
             """Set attributes on a weight tensor.
 
-            This method is used to set attributes on a weight tensor. This method
-            will not overwrite existing attributes.
+            This method is used to set attributes on a weight tensor.
+            This method will not overwrite existing attributes.
 
             Args:
                 weight: The weight tensor.
-                weight_attrs: A dictionary of attributes to set on the weight tensor.
+                weight_attrs: A dictionary of attributes to set on the weight
+                    tensor.
             """
             if weight_attrs is None:
                 return
@@ -169,14 +170,18 @@ def set_weight_attrs(
                 assert not hasattr(weight, key), (
                     f"Overwriting existing tensor attribute: {key}")
 
-                # NOTE(woosuk): During weight loading, we often do something like:
+                # NOTE(woosuk): During weight loading, we often do something
+                # like:
                 # narrowed_tensor = param.data.narrow(0, offset, len)
                 # narrowed_tensor.copy_(real_weight)
-                # expecting narrowed_tensor and param.data to share the same storage.
-                # However, on TPUs, narrowed_tensor will lazily propagate to the base
-                # tensor, which is param.data, leading to the redundant memory usage.
-                # This sometimes causes OOM errors during model loading. To avoid this,
-                # we sync the param tensor after its weight loader is called.
+                # expecting narrowed_tensor and param.data to share the same
+                # storage.
+                # However, on TPUs, narrowed_tensor will lazily propagate to
+                # the base tensor, which is param.data, leading to the
+                # redundant memory usage.
+                # This sometimes causes OOM errors during model loading. To
+                # avoid this, we sync the param tensor after its weight loader
+                # is called.
                 # TODO(woosuk): Remove this hack once we have a better solution.
                 # NOTE(ksmusz): Issue seen in HPU also, same hack applied.
                 if key == "weight_loader":
diff --git a/vllm_hpu/utils.py b/vllm_hpu/utils.py
index 740db06484..5080d68143 100644
--- a/vllm_hpu/utils.py
+++ b/vllm_hpu/utils.py
@@ -1,9 +1,7 @@
 from functools import cache
 import os
 from vllm.utils import make_tensor_with_pad, TORCH_DTYPE_TO_NUMPY_DTYPE
-from typing import (TYPE_CHECKING, Any, Callable, Generic, Literal, NamedTuple,
-                    Optional, Sequence, Tuple, Type, TypeVar, Union, cast,
-                    overload)
+from typing import (Optional, TypeVar, Union)
 import torch
 import numpy as np
 import numpy.typing as npt
@@ -12,6 +10,7 @@
 T = TypeVar("T")
 U = TypeVar("U")
 
+
 @cache
 def is_fake_hpu() -> bool:
     return os.environ.get('VLLM_USE_FAKE_HPU', '0') != '0'
@@ -109,4 +108,3 @@ def make_tensor_with_pad_align(
         tensor = tensor.pin_memory()
 
     return tensor
-
diff --git a/vllm_hpu/v1/worker/hpu_model_runner.py b/vllm_hpu/v1/worker/hpu_model_runner.py
index fe66b1df7f..9d827c5455 100644
--- a/vllm_hpu/v1/worker/hpu_model_runner.py
+++ b/vllm_hpu/v1/worker/hpu_model_runner.py
@@ -814,7 +814,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> bool:
             self.input_batch.num_computed_tokens_cpu[req_index] = (
                 num_computed_tokens)
             self.input_batch.block_table.append_row(new_block_ids, req_index)
-            
+
             # For the last rank, we don't need to update the token_ids_cpu
             # because the sampled tokens are already cached.
             if not is_last_rank:
@@ -822,17 +822,21 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> bool:
                 start_token_index = num_computed_tokens
                 end_token_index = num_computed_tokens + len(new_token_ids)
                 self.input_batch.token_ids_cpu[
-                    req_index, start_token_index:end_token_index] = new_token_ids
-                self.input_batch.num_tokens_no_spec[req_index] = end_token_index
+                    req_index,
+                    start_token_index:end_token_index] = new_token_ids
+                self.input_batch.num_tokens_no_spec[
+                    req_index] = end_token_index
                 # Add spec_token_ids to token_ids_cpu.
-                spec_token_ids = scheduler_output.scheduled_spec_decode_tokens.get(
-                    req_id, ())
+                spec_token_ids = \
+                    scheduler_output.scheduled_spec_decode_tokens.get(
+                        req_id, ())
                 if spec_token_ids:
                     start_index = end_token_index
                     end_token_index += len(spec_token_ids)
                     self.input_batch.token_ids_cpu[
-                        req_index, start_index:end_token_index] = spec_token_ids
-                # NOTE(woosuk): `num_tokens` here may include spec decode tokens.
+                        req_index,
+                        start_index:end_token_index] = spec_token_ids
+                # NOTE(woosuk): `num_tokens` here may include spec decode tokens
                 self.input_batch.num_tokens[req_index] = end_token_index
 
         # Check if the batch has changed. If not, we can skip copying the
@@ -1669,7 +1673,8 @@ def execute_model(
         # NOTE(woosuk): As an exception, when using PP, the scheduler sends
         # the sampled tokens back, because there's no direct communication
         # between the first-stage worker and the last-stage worker.
-        for req_idx, sampled_ids in enumerate(postprocessed_sampled_token_ids[:num_reqs]):
+        for req_idx, sampled_ids in enumerate(
+                postprocessed_sampled_token_ids[:num_reqs]):
             if not sampled_ids:
                 continue