Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/bot_pr_create.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ jobs:
steps:
- name: Get vLLM version
run: |
VLLM_COMMIT=2c24bc6996cb165fce92f780b388a5e39b3f4060
VLLM_COMMIT=46f8a982b191e3a3d3a1eccaf18b184c391ac2ac
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV

- name: Checkout repository
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/pr_test_full.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ jobs:
name: e2e-full
strategy:
matrix:
vllm_version: [2c24bc6996cb165fce92f780b388a5e39b3f4060, v0.13.0]
vllm_version: [46f8a982b191e3a3d3a1eccaf18b184c391ac2ac, v0.13.0]
needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
uses: ./.github/workflows/_e2e_test.yaml
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/pr_test_light.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ jobs:
lint:
uses: ./.github/workflows/_pre_commit.yml
with:
vllm: 2c24bc6996cb165fce92f780b388a5e39b3f4060
vllm: 46f8a982b191e3a3d3a1eccaf18b184c391ac2ac
changes:
runs-on: linux-aarch64-a2-0
outputs:
Expand Down Expand Up @@ -84,7 +84,7 @@ jobs:
if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
strategy:
matrix:
vllm_version: [2c24bc6996cb165fce92f780b388a5e39b3f4060, v0.13.0]
vllm_version: [46f8a982b191e3a3d3a1eccaf18b184c391ac2ac, v0.13.0]
uses: ./.github/workflows/_unit_test.yaml
with:
vllm: ${{ matrix.vllm_version }}
Expand All @@ -96,7 +96,7 @@ jobs:
name: e2e-light
strategy:
matrix:
vllm_version: [2c24bc6996cb165fce92f780b388a5e39b3f4060, v0.13.0]
vllm_version: [46f8a982b191e3a3d3a1eccaf18b184c391ac2ac, v0.13.0]
# Note (yikun): If CI resources are limited, we can split this job into two chained jobs
needs: [lint, changes]
# only trigger e2e test after lint passed and the change is e2e related with pull request.
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/schedule_codecov_refresh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ jobs:
name: refresh codecov
strategy:
matrix:
vllm_version: [2c24bc6996cb165fce92f780b388a5e39b3f4060]
vllm_version: [46f8a982b191e3a3d3a1eccaf18b184c391ac2ac]
uses: ./.github/workflows/_unit_test.yaml
with:
vllm: ${{ matrix.vllm_version }}
Expand Down
2 changes: 1 addition & 1 deletion docs/source/community/versioning_policy.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL

| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
|-------------|--------------|------------------|-------------|--------------------|
| main | 2c24bc6996cb165fce92f780b388a5e39b3f4060, v0.13.0 tag | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 |
| main | 46f8a982b191e3a3d3a1eccaf18b184c391ac2ac, v0.13.0 tag | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 |

## Release cadence

Expand Down
6 changes: 4 additions & 2 deletions vllm_ascend/attention/mla_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
from vllm.logger import logger
from vllm.model_executor.layers.linear import UnquantizedLinearMethod
from vllm.utils.math_utils import cdiv, round_down
from vllm.v1.attention.backends.mla.common import MLACommonMetadataBuilder
from vllm.v1.kv_cache_interface import AttentionSpec, MLAAttentionSpec

from vllm_ascend import envs
Expand Down Expand Up @@ -39,16 +38,19 @@
vllm_version_is, weak_ref_tensors)
from vllm_ascend.worker.npu_input_batch import NPUInputBatch


if TYPE_CHECKING:
from vllm.v1.core.sched.output import SchedulerOutput

# isort: off
if vllm_version_is('0.13.0'):
from vllm.v1.attention.backends.utils import AttentionCGSupport
from vllm.v1.attention.backends.mla.common import MLACommonMetadataBuilder # type: ignore
from vllm.v1.attention.backends.utils import AttentionCGSupport # type: ignore
from vllm.attention.backends.abstract import ( # type: ignore
AttentionBackend, MLAAttentionImpl)
from vllm.attention.backends.utils import PAD_SLOT_ID # type: ignore
else:
from vllm.model_executor.layers.attention.mla_attention import MLACommonMetadataBuilder
from vllm.v1.attention.backend import ( # type: ignore
AttentionBackend, AttentionCGSupport, MLAAttentionImpl)
from vllm.v1.attention.backends.utils import PAD_SLOT_ID # type: ignore
Expand Down
5 changes: 3 additions & 2 deletions vllm_ascend/attention/sfa_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
from vllm.logger import logger
from vllm.model_executor.layers.linear import UnquantizedLinearMethod
from vllm.triton_utils import HAS_TRITON
from vllm.v1.attention.backends.mla.common import MLACommonMetadataBuilder
from vllm.v1.kv_cache_interface import AttentionSpec

from vllm_ascend import envs
Expand Down Expand Up @@ -39,10 +38,12 @@
if TYPE_CHECKING:
from vllm.v1.core.sched.output import SchedulerOutput
if vllm_version_is('0.13.0'):
from vllm.v1.attention.backends.utils import AttentionCGSupport
from vllm.v1.attention.backends.mla.common import MLACommonMetadataBuilder # type: ignore
from vllm.v1.attention.backends.utils import AttentionCGSupport # type: ignore
from vllm.attention.backends.abstract import ( # type: ignore
AttentionBackend, MLAAttentionImpl)
else:
from vllm.model_executor.layers.attention.mla_attention import MLACommonMetadataBuilder
from vllm.v1.attention.backend import ( # type: ignore
AttentionBackend, AttentionCGSupport, MLAAttentionImpl)
# isort: on
Expand Down
2 changes: 1 addition & 1 deletion vllm_ascend/patch/worker/patch_v2_egale.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
import vllm
from vllm.v1.worker.gpu.input_batch import InputBatch
from vllm.v1.worker.gpu.sample.gumbel import gumbel_sample
from vllm.v1.worker.gpu.sample.metadata import SamplingMetadata
from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.worker.gpu.spec_decode.eagle import (prepare_eagle_decode,
prepare_eagle_inputs)

Expand Down
7 changes: 6 additions & 1 deletion vllm_ascend/worker/v2/aclgraph_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
import torch
import torch.nn as nn
from vllm.config import VllmConfig
from vllm.v1.attention.backends.utils import AttentionMetadataBuilder
from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.worker.gpu.block_table import BlockTables
from vllm.v1.worker.gpu.cudagraph_utils import CudaGraphManager
Expand All @@ -31,6 +30,12 @@
from vllm.v1.worker.gpu.input_batch import InputBuffers

from vllm_ascend.worker.v2.utils import torch_cuda_wrapper
from vllm_ascend.utils import vllm_version_is

if vllm_version_is('0.13.0'):
from vllm.v1.attention.backends.utils import AttentionMetadataBuilder
else:
from vllm.v1.attention.backend import AttentionMetadataBuilder


class AclGraphManager(CudaGraphManager):
Expand Down
7 changes: 6 additions & 1 deletion vllm_ascend/worker/v2/attn_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,18 @@
import numpy as np
import torch
from vllm.config import VllmConfig
from vllm.v1.attention.backends.utils import AttentionMetadataBuilder
from vllm.v1.kv_cache_interface import EncoderOnlyAttentionSpec, KVCacheConfig

from vllm_ascend.attention.attention_mask import AttentionMaskBuilder
from vllm_ascend.attention.attention_v1 import AscendAttentionState
from vllm_ascend.attention.utils import (AscendCommonAttentionMetadata,
AscendPrefillContextParallelMetadata)
from vllm_ascend.utils import vllm_version_is

if vllm_version_is('0.13.0'):
from vllm.v1.attention.backends.utils import AttentionMetadataBuilder
else:
from vllm.v1.attention.backend import AttentionMetadataBuilder

_ATTENTION_MASK_BUILDER = None

Expand Down
2 changes: 1 addition & 1 deletion vllm_ascend/worker/v2/sample/penalties.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

import torch
from vllm.triton_utils import tl, triton
from vllm.v1.worker.gpu.sample.metadata import SamplingMetadata
from vllm.v1.sample.metadata import SamplingMetadata


@triton.jit
Expand Down
2 changes: 1 addition & 1 deletion vllm_ascend/worker/v2/sample/sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

import torch
from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p
from vllm.v1.worker.gpu.sample.metadata import SamplingMetadata
from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.worker.gpu.sample.min_p import apply_min_p
from vllm.v1.worker.gpu.sample.sampler import Sampler

Expand Down
Loading