4 changes: 2 additions & 2 deletions .github/workflows/pr_test_full.yaml
@@ -80,7 +80,7 @@ jobs:
name: e2e-full
strategy:
matrix:
- vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8, v0.19.1]
+ vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8]
needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
uses: ./.github/workflows/_e2e_test.yaml
@@ -102,7 +102,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- vllm_version: [v0.19.1]
+ vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8]
needs: [parse-trigger]
if: ${{ needs.parse-trigger.outputs.allowed == 'true' }}
uses: ./.github/workflows/_e2e_test.yaml
4 changes: 2 additions & 2 deletions .github/workflows/pr_test_light.yaml
@@ -154,7 +154,7 @@ jobs:
if: ${{ needs.lint.result == 'success' && needs.changes.outputs.has_tests == 'true' }}
strategy:
matrix:
- vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8, v0.19.1]
+ vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8]
uses: ./.github/workflows/_optional_smart_e2e.yaml
with:
vllm: ${{ matrix.vllm_version }}
@@ -164,7 +164,7 @@
name: e2e-light
strategy:
matrix:
- vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8, v0.19.1]
+ vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8]
# Note (yikun): If CI resources are limited, we can split this job into two chained jobs
needs: [lint, changes]
# Only trigger the e2e test after lint has passed and the pull request's changes are e2e-related.
2 changes: 1 addition & 1 deletion .github/workflows/schedule_update_estimated_time.yaml
@@ -23,7 +23,7 @@ jobs:
name: e2e-test
strategy:
matrix:
- vllm_version: [v0.19.1]
+ vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8]
type: [full, light]
uses: ./.github/workflows/_e2e_test.yaml
with:
2 changes: 1 addition & 1 deletion .github/workflows/schedule_vllm_e2e_test.yaml
@@ -45,7 +45,7 @@ jobs:
fail-fast: false
matrix:
part: [0, 1, 2, 3]
- vllm: [v0.19.1]
+ vllm: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8]
container:
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.1-910b-ubuntu22.04-py3.11
env:
8 changes: 6 additions & 2 deletions Dockerfile
@@ -48,8 +48,12 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
- ARG VLLM_TAG=v0.19.1
- RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
+ # ARG VLLM_TAG=v0.19.1
+ # RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
+ ARG VLLM_COMMIT=d886c26d4d4fef7d079696beb4ece1cfb4b008a8
+ RUN git init /vllm-workspace/vllm && \
+     git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
+     git -C /vllm-workspace/vllm checkout FETCH_HEAD
# On x86, triton is installed by vLLM, but it does not work correctly on Ascend, so we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
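Note: `git clone --depth 1 -b` accepts only branch or tag names, not bare commit SHAs, so pinning to a specific vLLM commit requires the `git init` / shallow `git fetch <sha>` / `git checkout FETCH_HEAD` sequence shown above (GitHub's servers permit fetching an explicit commit). The same substitution is applied to every Dockerfile variant below.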
8 changes: 6 additions & 2 deletions Dockerfile.310p
@@ -33,8 +33,12 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
- ARG VLLM_TAG=v0.19.1
- RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
+ # ARG VLLM_TAG=v0.19.1
+ # RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
+ ARG VLLM_COMMIT=d886c26d4d4fef7d079696beb4ece1cfb4b008a8
+ RUN git init /vllm-workspace/vllm && \
+     git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
+     git -C /vllm-workspace/vllm checkout FETCH_HEAD
# On x86, triton is installed by vLLM, but it does not work correctly on Ascend, so we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
8 changes: 6 additions & 2 deletions Dockerfile.310p.openEuler
@@ -32,8 +32,12 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
- ARG VLLM_TAG=v0.19.1
- RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
+ # ARG VLLM_TAG=v0.19.1
+ # RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
+ ARG VLLM_COMMIT=d886c26d4d4fef7d079696beb4ece1cfb4b008a8
+ RUN git init /vllm-workspace/vllm && \
+     git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
+     git -C /vllm-workspace/vllm checkout FETCH_HEAD
# On x86, triton is installed by vLLM, but it does not work correctly on Ascend, so we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
8 changes: 6 additions & 2 deletions Dockerfile.a3
@@ -50,8 +50,12 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
- ARG VLLM_TAG=v0.19.1
- RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
+ # ARG VLLM_TAG=v0.19.1
+ # RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
+ ARG VLLM_COMMIT=d886c26d4d4fef7d079696beb4ece1cfb4b008a8
+ RUN git init /vllm-workspace/vllm && \
+     git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
+     git -C /vllm-workspace/vllm checkout FETCH_HEAD
# On x86, triton is installed by vLLM, but it does not work correctly on Ascend, so we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
8 changes: 6 additions & 2 deletions Dockerfile.a3.openEuler
@@ -49,8 +49,12 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
- ARG VLLM_TAG=v0.19.1
- RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
+ # ARG VLLM_TAG=v0.19.1
+ # RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
+ ARG VLLM_COMMIT=d886c26d4d4fef7d079696beb4ece1cfb4b008a8
+ RUN git init /vllm-workspace/vllm && \
+     git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
+     git -C /vllm-workspace/vllm checkout FETCH_HEAD
# On x86, triton is installed by vLLM, but it does not work correctly on Ascend, so we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
8 changes: 6 additions & 2 deletions Dockerfile.openEuler
@@ -49,8 +49,12 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
- ARG VLLM_TAG=v0.19.1
- RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
+ # ARG VLLM_TAG=v0.19.1
+ # RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
+ ARG VLLM_COMMIT=d886c26d4d4fef7d079696beb4ece1cfb4b008a8
+ RUN git init /vllm-workspace/vllm && \
+     git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
+     git -C /vllm-workspace/vllm checkout FETCH_HEAD
# On x86, triton is installed by vLLM, but it does not work correctly on Ascend, so we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
2 changes: 1 addition & 1 deletion docs/source/community/versioning_policy.md
@@ -62,7 +62,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL

| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu | Triton Ascend |
|-------------|--------------|------------------|-------------|--------------------|---------------|
- | main | {{main_vllm_commit}}, {{main_vllm_tag}} | {{main_python_version}} | {{main_cann_version}} | {{main_pytorch_torch_npu_version}} | {{main_triton_ascend_version}} |
+ | main | {{main_vllm_commit}} | {{main_python_version}} | {{main_cann_version}} | {{main_pytorch_torch_npu_version}} | {{main_triton_ascend_version}} |

## Release cadence

4 changes: 0 additions & 4 deletions tests/e2e/singlecard/model_runner_v2/test_basic.py
@@ -22,15 +22,13 @@
from vllm import SamplingParams

from tests.e2e.conftest import VllmRunner
- from vllm_ascend.utils import vllm_version_is

MODELS = ["Qwen/Qwen3-0.6B", "vllm-ascend/DeepSeek-V2-Lite-W8A8"]

MAIN_MODELS = ["LLM-Research/Meta-Llama-3.1-8B-Instruct"]
EGALE_MODELS = ["vllm-ascend/EAGLE-LLaMA3.1-Instruct-8B"]


- @pytest.mark.skipif(vllm_version_is("0.19.1"), reason="no need to support model_runner for v0.19.1")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("enforce_eager", [True])
@@ -65,7 +63,6 @@ def test_qwen3_dense_eager_mode(
runner.model.generate(prompts, sampling_params)


- @pytest.mark.skipif(vllm_version_is("0.19.1"), reason="no need to support model_runner for v0.19.1")
@pytest.mark.parametrize("model", MAIN_MODELS)
@pytest.mark.parametrize("eagle_model", EGALE_MODELS)
@pytest.mark.parametrize("max_tokens", [32])
@@ -104,7 +101,6 @@ def test_egale_spec_decoding(
runner.model.generate(prompts, sampling_params)


- @pytest.mark.skipif(vllm_version_is("0.19.1"), reason="no need to support model_runner for v0.19.1")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("enforce_eager", [False])
5 changes: 0 additions & 5 deletions tests/e2e/singlecard/spec_decode/test_v1_spec_decode.py
@@ -625,11 +625,6 @@ def test_dflash_acceptance(
method: str,
num_speculative_tokens: int,
):
- from vllm_ascend.utils import vllm_version_is
-
- if vllm_version_is("0.19.1"):
-     pytest.skip("Dflash tests are not supported on vLLM version 0.19.1")
-
main_model_name = DFLASH[method]["main"]
spec_model_name = DFLASH[method]["spec"]

13 changes: 2 additions & 11 deletions tests/ut/ops/test_mla.py
@@ -8,7 +8,6 @@

from tests.ut.base import TestBase
from vllm_ascend.ops.mla import AscendMultiHeadLatentAttention, IndexerWrapper
- from vllm_ascend.utils import vllm_version_is


class TestIndexerWrapper(TestBase):
@@ -19,11 +18,7 @@ def test_initialization(self):
mock_indexer.topk_tokens = 2048
mock_indexer.q_lora_rank = 1536
mock_indexer.wq_b = nn.Linear(128, 128)
- if vllm_version_is("0.19.1"):
-     mock_indexer.wk = nn.Linear(128, 128)
-     mock_indexer.weights_proj = nn.Linear(128, 128)
- else:
-     mock_indexer.wk_weights_proj = nn.Linear(128, 128)
+ mock_indexer.wk_weights_proj = nn.Linear(128, 128)
mock_indexer.k_norm = nn.LayerNorm(128)
mock_indexer.softmax_scale = 0.123
mock_indexer.topk_indices_buffer = torch.randn(10)
@@ -36,11 +31,7 @@ def test_initialization(self):
self.assertEqual(wrapper.topk_tokens, 2048)
self.assertEqual(wrapper.q_lora_rank, 1536)
self.assertIs(wrapper.wq_b, mock_indexer.wq_b)
- if vllm_version_is("0.19.1"):
-     self.assertIs(wrapper.wk, mock_indexer.wk)
-     self.assertIs(wrapper.weights_proj, mock_indexer.weights_proj)
- else:
-     self.assertIs(wrapper.wk_weights_proj, mock_indexer.wk_weights_proj)
+ self.assertIs(wrapper.wk_weights_proj, mock_indexer.wk_weights_proj)
self.assertIs(wrapper.k_norm, mock_indexer.k_norm)
self.assertEqual(wrapper.softmax_scale, 0.123)

11 changes: 4 additions & 7 deletions vllm_ascend/_310p/fused_moe/fused_moe.py
@@ -27,7 +27,6 @@
from vllm_ascend.ops.fused_moe.moe_comm_method import FusedExpertsResult, _MoECommMethods
from vllm_ascend.ops.fused_moe.moe_runtime_args import build_fused_experts_input
from vllm_ascend.quantization.quant_type import QuantType
- from vllm_ascend.utils import vllm_version_is

from .experts_selector import select_experts
from .moe_comm_method import AllGatherCommImpl310
@@ -164,14 +163,13 @@ def __init__(self, *args, **kwargs):

from vllm_ascend.ops.fused_moe.fused_moe import AscendMoERunner

- is_legacy = vllm_version_is("0.19.1")
self.runner = AscendMoERunner(
- self if is_legacy else self.layer_name,
+ self.layer_name,
self.moe_config,
self.router,
self._routed_input_transform,
- self.gate if is_legacy else kwargs.pop("gate", None),
- self.shared_experts if is_legacy else kwargs.pop("shared_experts", None),
+ kwargs.pop("gate", None),
+ kwargs.pop("shared_experts", None),
self.quant_method,
self.reduce_results,
self.vllm_config.parallel_config.enable_dbo,
@@ -285,9 +283,8 @@ def __init__(
# which at this point is still the stale runner built with shared_experts=None.
from vllm_ascend.ops.fused_moe.fused_moe import AscendMoERunner

- is_legacy = vllm_version_is("0.19.1")
self.runner = AscendMoERunner(
- self if is_legacy else self.layer_name,
+ self.layer_name,
self.moe_config,
self.router,
self._routed_input_transform,
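The runner now receives `gate` and `shared_experts` as optional keyword arguments instead of reading them from `self`. A tiny sketch of the `kwargs.pop(key, None)` idiom used above (function name hypothetical, not part of vllm_ascend):

```python
def build_runner(*args, **kwargs):
    # Consume optional keyword arguments up front; absent keys fall back to None.
    gate = kwargs.pop("gate", None)
    shared_experts = kwargs.pop("shared_experts", None)
    return gate, shared_experts, kwargs  # remaining kwargs are left untouched

assert build_runner(gate="g") == ("g", None, {})
assert build_runner() == (None, None, {})
```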
6 changes: 1 addition & 5 deletions vllm_ascend/ascend_forward_context.py
@@ -20,7 +20,6 @@
is_drafter_moe_model,
is_moe_model,
speculative_enable_dispatch_gmm_combine_decode,
- vllm_version_is,
)


@@ -156,10 +155,7 @@ def set_ascend_forward_context(
dp_world_size = get_dp_group().world_size
if dp_world_size > 1 and forward_context.dp_metadata is not None:
dp_meta = forward_context.dp_metadata
- if vllm_version_is("0.19.1"):
-     max_tokens_across_dp = dp_meta.max_tokens_across_dp_cpu.item()
- else:
-     max_tokens_across_dp = dp_meta.num_tokens_across_dp_cpu.max().item()
+ max_tokens_across_dp = dp_meta.num_tokens_across_dp_cpu.max().item()
if forward_context.flash_comm_v1_enabled or forward_context.flashcomm_v2_enabled:
padded_length = (max_tokens_across_dp + tp_world_size - 1) // tp_world_size * tp_world_size
pad_size = padded_length - num_tokens
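The surviving branch takes the maximum of the per-rank token counts; the padding lines visible above then round that value up to a multiple of the TP world size before computing `pad_size`. A minimal standalone sketch of that arithmetic (function name hypothetical, not part of vllm_ascend):

```python
def padded_length(max_tokens_across_dp: int, tp_world_size: int) -> int:
    # Round up to the next multiple of tp_world_size, as in the hunk above.
    return (max_tokens_across_dp + tp_world_size - 1) // tp_world_size * tp_world_size

assert padded_length(10, 4) == 12  # 10 tokens, TP=4 -> pad to 12; pad_size = 12 - num_tokens
assert padded_length(8, 4) == 8    # already aligned, no extra padding needed
```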
9 changes: 2 additions & 7 deletions vllm_ascend/attention/context_parallel/sfa_cp.py
@@ -12,7 +12,6 @@
from vllm_ascend.attention.sfa_v1 import AscendSFAImpl, AscendSFAMetadata, AscendSFAMetadataBuilder
from vllm_ascend.attention.utils import AscendCommonAttentionMetadata, enabling_mlapo, split_decodes_and_prefills
from vllm_ascend.ops.triton.rope import rope_forward_triton_siso
- from vllm_ascend.utils import vllm_version_is

M = TypeVar("M", bound=AscendSFAMetadata)

@@ -414,12 +413,8 @@ def indexer_select_post_process(
actual_seq_lengths_query: torch.Tensor,
actual_seq_lengths_key: torch.Tensor,
):
- if vllm_version_is("0.19.1"):
-     weights, _ = self.weights_proj(x)
- else:
-     kw, _ = self.wk_weights_proj(x)
-     weights = kw[:, self.head_dim :]
-
+ kw, _ = self.wk_weights_proj(x)
+ weights = kw[:, self.head_dim :]
q_li, _ = self.wq_b(q_c) # [b,s,1536] @ [1536,64*128] = [b,s,64*128]
q_li = q_li.view(-1, self.n_head, self.head_dim) # [n_toks,64,128]
if HAS_TRITON:
22 changes: 5 additions & 17 deletions vllm_ascend/attention/sfa_v1.py
@@ -55,7 +55,6 @@
enable_dsa_cp_with_o_proj_tp,
get_weight_prefetch_method,
maybe_trans_nz,
- vllm_version_is,
)
from vllm_ascend.worker.npu_input_batch import NPUInputBatch

@@ -439,11 +438,7 @@ def __init__(
self.n_head: int = self.indexer.n_head # 64
self.head_dim: int = self.indexer.head_dim # 128
self.wq_b = self.indexer.wq_b
- if vllm_version_is("0.19.1"):
-     self.wk = self.indexer.wk
-     self.weights_proj = self.indexer.weights_proj
- else:
-     self.wk_weights_proj = self.indexer.wk_weights_proj
+ self.wk_weights_proj = self.indexer.wk_weights_proj
self.k_norm = self.indexer.k_norm
self.cp_size = 1
self.is_rope_neox_style = True
@@ -912,11 +907,8 @@ def indexer_select_pre_process(
cos: torch.Tensor,
sin: torch.Tensor,
):
- if vllm_version_is("0.19.1"):
-     k_li, _ = self.wk(x)  # [b,s,7168] @ [7168,128] = [b,s,128]
- else:
-     kw, _ = self.wk_weights_proj(x)
-     k_li = kw[:, : self.head_dim]
+ kw, _ = self.wk_weights_proj(x)
+ k_li = kw[:, : self.head_dim]
k_li = self.k_norm(k_li).unsqueeze(1)
k_li = k_li.view(-1, 1, self.head_dim)

@@ -961,12 +953,8 @@ def indexer_select_post_process(
actual_seq_lengths_query: torch.Tensor,
actual_seq_lengths_key: torch.Tensor,
):
- if vllm_version_is("0.19.1"):
-     weights, _ = self.weights_proj(x)
- else:
-     kw, _ = self.wk_weights_proj(x)
-     weights = kw[:, self.head_dim :]
-
+ kw, _ = self.wk_weights_proj(x)
+ weights = kw[:, self.head_dim :]
q_li, _ = self.wq_b(q_c) # [b,s,1536] @ [1536,64*128] = [b,s,64*128]
q_li = q_li.view(-1, self.n_head, self.head_dim) # [n_toks,64,128]
if HAS_TRITON:
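Across sfa_cp.py and sfa_v1.py, the separate `wk` and `weights_proj` projections used on v0.19.1 are replaced by a single fused `wk_weights_proj` whose output is sliced at `head_dim`. A minimal sketch of that slicing, assuming the fused output width is `head_dim + n_head` (an assumption for illustration; in the diffs, `kw, _ = self.wk_weights_proj(x)` unpacks vLLM's `(output, bias)` return convention, which the plain `nn.Linear` below does not reproduce):

```python
import torch
from torch import nn

hidden_size, head_dim, n_head = 7168, 128, 64  # sizes taken from the shape comments above

# Hypothetical stand-in for the fused layer; the output width head_dim + n_head
# is an assumption for this sketch, not taken from the vLLM source.
wk_weights_proj = nn.Linear(hidden_size, head_dim + n_head, bias=False)

x = torch.randn(10, hidden_size)  # [n_toks, 7168]
kw = wk_weights_proj(x)           # one matmul instead of two
k_li = kw[:, :head_dim]           # key projection, formerly self.wk(x)
weights = kw[:, head_dim:]        # indexer weights, formerly self.weights_proj(x)

assert k_li.shape == (10, head_dim)
assert weights.shape == (10, n_head)
```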