2 changes: 1 addition & 1 deletion .github/workflows/dockerfiles/Dockerfile.lint
@@ -27,7 +27,7 @@ RUN apt-get update -y && \
 
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
 # For lint purposes we actually need main-to-main matching.
-ARG VLLM_COMMIT=5af684c31912232e5c89484c2e8259e0fac6c55b
+ARG VLLM_COMMIT=6f786f2c506cb07f4566771fdc62e640e2c4a176
 RUN git init /vllm-workspace/vllm && \
     git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
     git -C /vllm-workspace/vllm checkout FETCH_HEAD
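The Dockerfile fetches one pinned commit instead of cloning the full vLLM history: git init, a depth-1 fetch of the hash, then a checkout of FETCH_HEAD. The same hash is fanned out to the workflow matrices below. A minimal Python sketch of that pattern; the destination path is an arbitrary choice for illustration:

# Sketch of the pinned shallow-fetch pattern from the Dockerfile above:
# fetch only the one commit we need, never the full history.
import subprocess

VLLM_REPO = "https://github.com/vllm-project/vllm.git"
VLLM_COMMIT = "6f786f2c506cb07f4566771fdc62e640e2c4a176"

def checkout_pinned_commit(repo: str, commit: str, dest: str) -> None:
    """git init an empty repo, fetch a single commit at depth 1, check it out."""
    subprocess.run(["git", "init", dest], check=True)
    subprocess.run(["git", "-C", dest, "fetch", "--depth", "1", repo, commit], check=True)
    subprocess.run(["git", "-C", dest, "checkout", "FETCH_HEAD"], check=True)

checkout_pinned_commit(VLLM_REPO, VLLM_COMMIT, "/tmp/vllm-pinned")  # illustrative path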
2 changes: 1 addition & 1 deletion .github/workflows/pr_test_full.yaml
@@ -80,7 +80,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [5af684c31912232e5c89484c2e8259e0fac6c55b, v0.19.0]
+        vllm_version: [6f786f2c506cb07f4566771fdc62e640e2c4a176, v0.19.0]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
     uses: ./.github/workflows/_e2e_test.yaml
6 changes: 3 additions & 3 deletions .github/workflows/pr_test_light.yaml
@@ -41,7 +41,7 @@ jobs:
   lint:
     uses: ./.github/workflows/_pre_commit.yml
     with:
-      vllm: 5af684c31912232e5c89484c2e8259e0fac6c55b
+      vllm: 6f786f2c506cb07f4566771fdc62e640e2c4a176
   changes:
     runs-on: linux-aarch64-a2b3-0
     outputs:
@@ -92,7 +92,7 @@ jobs:
     if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
     strategy:
       matrix:
-        vllm_version: [5af684c31912232e5c89484c2e8259e0fac6c55b, v0.19.0]
+        vllm_version: [6f786f2c506cb07f4566771fdc62e640e2c4a176, v0.19.0]
     uses: ./.github/workflows/_unit_test.yaml
     with:
       vllm: ${{ matrix.vllm_version }}
@@ -104,7 +104,7 @@
     name: e2e-light
     strategy:
       matrix:
-        vllm_version: [5af684c31912232e5c89484c2e8259e0fac6c55b, v0.19.0]
+        vllm_version: [6f786f2c506cb07f4566771fdc62e640e2c4a176, v0.19.0]
     # Note (yikun): If CI resources are limited we can split this job into two chained jobs
     needs: [lint, changes]
     # Only trigger the e2e test after lint has passed and the pull request's change is e2e-related.
203 changes: 0 additions & 203 deletions .github/workflows/schedule_test_benchmarks.yaml

This file was deleted.

2 changes: 1 addition & 1 deletion .github/workflows/schedule_update_estimated_time.yaml
@@ -23,7 +23,7 @@ jobs:
     name: e2e-test
     strategy:
       matrix:
-        vllm_version: [v0.18.0]
+        vllm_version: [v0.19.0]
         type: [full, light]
     uses: ./.github/workflows/_e2e_test.yaml
     with:
2 changes: 1 addition & 1 deletion docs/source/conf.py
@@ -80,7 +80,7 @@
     # CANN image tag
     "cann_image_tag": "8.5.1-910b-ubuntu22.04-py3.11",
     # vLLM commit hash for main branch
-    "main_vllm_commit": "5af684c31912232e5c89484c2e8259e0fac6c55b",
+    "main_vllm_commit": "6f786f2c506cb07f4566771fdc62e640e2c4a176",
     # vLLM tag for main branch
     "main_vllm_tag": "v0.19.0",
     # Python version for main branch
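conf.py keeps the pinned commit, tag, and CANN image in one dictionary so the docs reference a single source of truth. How those values reach the pages is not shown in this diff; the sketch below assumes a MyST-style substitution mapping:

# Sketch only: the myst_substitutions mechanism is an assumption about the
# docs setup; the pinned values themselves are the ones from this diff.
pins = {
    "cann_image_tag": "8.5.1-910b-ubuntu22.04-py3.11",
    "main_vllm_commit": "6f786f2c506cb07f4566771fdc62e640e2c4a176",
    "main_vllm_tag": "v0.19.0",
}
# Pages could then write {{ main_vllm_commit }} instead of hard-coding the hash.
myst_substitutions = dict(pins)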
2 changes: 2 additions & 0 deletions tests/e2e/singlecard/model_runner_v2/test_basic.py
@@ -22,6 +22,7 @@
 from vllm import SamplingParams
 
 from tests.e2e.conftest import VllmRunner
+from vllm_ascend.utils import vllm_version_is
 
 MODELS = ["Qwen/Qwen3-0.6B"]
 
@@ -63,6 +64,7 @@ def test_qwen3_dense_eager_mode(
         runner.model.generate(prompts, sampling_params)
 
 
+@pytest.mark.skipif(vllm_version_is("0.19.0"), reason="no need to support model_runner for v0.19.0")
 @pytest.mark.parametrize("model", MAIN_MODELS)
 @pytest.mark.parametrize("eagle_model", EGALE_MODELS)
 @pytest.mark.parametrize("max_tokens", [32])
3 changes: 2 additions & 1 deletion vllm_ascend/patch/worker/patch_gdn_attn.py
@@ -23,6 +23,7 @@
     _validate_cu_seqlens,
     build_chunk_meta_device,
 )
+from vllm_ascend.utils import is_310p
 
 _GDN_CHUNK_SIZE = 64
 # Keep this aligned with solve_tril.LARGE_BLOCK_T in ops/triton/fla/solve_tril.py.
@@ -596,7 +597,7 @@ def _patched_build(
     return attn_metadata
 
 
-if not _IS_PATCHED:
+if not _IS_PATCHED and not is_310p():
     gdn_attn.GDNChunkedPrefillMetadata = GDNChunkedPrefillMetadata
     gdn_attn.GDNCausalConv1dHostMetadata = GDNCausalConv1dHostMetadata
     gdn_attn.GDNPrefillFallbackMeta = GDNPrefillFallbackMeta
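The patch module swaps its GDN metadata classes into vLLM's gdn_attn module at import time; the added is_310p() check keeps 310P devices on the unpatched path. A self-contained sketch of the guard pattern, with placeholder names standing in for the real module and capability check:

# Sketch of the idempotent, hardware-gated monkey-patch pattern above.
# upstream, fast_build, and supports_fast_path are placeholders; the real
# code patches vLLM's gdn_attn module and gates on is_310p().
import types

upstream = types.SimpleNamespace(build=lambda: "generic")

def supports_fast_path() -> bool:
    return True  # stand-in for the real hardware check

def fast_build() -> str:
    return "npu-optimized"

_IS_PATCHED = False
if not _IS_PATCHED and supports_fast_path():
    upstream.build = fast_build  # replace the upstream symbol exactly once
    _IS_PATCHED = True

assert upstream.build() == "npu-optimized"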
2 changes: 1 addition & 1 deletion vllm_ascend/quantization/modelslim_config.py
@@ -404,7 +404,7 @@ def from_config(cls, config: dict[str, Any]) -> "AscendModelSlimConfig":
         return cls(config)
 
     @classmethod
-    def override_quantization_method(cls, hf_quant_cfg, user_quant) -> str | None:
+    def override_quantization_method(cls, hf_quant_cfg, user_quant, hf_config: Any = None) -> str | None:
         if hf_quant_cfg is not None:
             quant_method = hf_quant_cfg.get("quant_method", None)
             if not quant_method and torch.npu.is_available():
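Upstream vLLM now passes an extra hf_config argument to override_quantization_method; defaulting it to None keeps this plugin compatible with both the old two-argument and the new three-argument call sites. A sketch of that compatibility pattern (the body is illustrative, not the real modelslim logic):

from typing import Any

class ConfigSketch:
    # Sketch of the widened hook: hf_config defaults to None so older vLLM
    # call sites that omit it keep working. The body is illustrative only.
    @classmethod
    def override_quantization_method(cls, hf_quant_cfg, user_quant,
                                     hf_config: Any = None) -> str | None:
        if hf_quant_cfg is None:
            return None
        return hf_quant_cfg.get("quant_method", None)

# Both the old and the new call shapes succeed:
assert ConfigSketch.override_quantization_method({"quant_method": "w8a8"}, None) == "w8a8"
assert ConfigSketch.override_quantization_method({"quant_method": "w8a8"}, None, hf_config={}) == "w8a8"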
13 changes: 10 additions & 3 deletions vllm_ascend/worker/model_runner_v1.py
@@ -2449,7 +2449,6 @@ def _dummy_run(
         if create_mixed_batch:
             raise NotImplementedError("create_mixed_batch is used for warmup deepgemm, vllm-ascend does not need it")
         elif uniform_decode:
-            assert not create_mixed_batch
             num_reqs = min(max_num_reqs, cdiv(num_tokens, max_query_len))
             num_scheduled_tokens_list = [max_query_len] * num_reqs
             if num_tokens % max_query_len != 0:
@@ -3473,12 +3472,20 @@ def _check_and_update_cudagraph_mode(
         with update_pass_config(self):
             super()._check_and_update_cudagraph_mode(attention_backends, kv_cache_groups)
 
+        capture_descs = self.cudagraph_dispatcher.get_capture_descs()
+        capture_sizes = sorted({
+            desc.num_tokens
+            for _, descs in capture_descs
+            for desc in descs
+        })
+
         # NOTE: Since aclgraph_batch_sizes cannot be determined until here,
         # we set the graph params right before initializing the keys.
         if self.use_aclgraph:
-            set_graph_params(self.cudagraph_batch_sizes)
+            set_graph_params(capture_sizes)
         if self.speculative_config:
-            set_draft_graph_params(self.cudagraph_batch_sizes)
+            set_draft_graph_params(capture_sizes)
 
     def capture_model(self) -> None:
         gpu_model_runner_cls = next((cls for cls in self.__class__.__mro__ if cls.__name__ == "GPUModelRunner"), None)
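Instead of reading self.cudagraph_batch_sizes, the runner now derives graph capture sizes from the dispatcher's capture descriptors, deduplicating on num_tokens and sorting ascending. A self-contained sketch of that collection step; CaptureDesc and the sample data are stand-ins for the dispatcher's real descriptor type:

from dataclasses import dataclass

@dataclass(frozen=True)
class CaptureDesc:
    num_tokens: int

capture_descs = [
    ("decode", [CaptureDesc(1), CaptureDesc(8), CaptureDesc(64)]),
    ("mixed", [CaptureDesc(8), CaptureDesc(256)]),
]

# Flatten every descriptor, deduplicate via a set comprehension, sort ascending.
capture_sizes = sorted({
    desc.num_tokens
    for _, descs in capture_descs
    for desc in descs
})

assert capture_sizes == [1, 8, 64, 256]  # duplicates collapsed, ascending order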
2 changes: 2 additions & 0 deletions vllm_ascend/worker/v2/spec_decode/eagle/speculator.py
@@ -66,6 +66,7 @@ def propose(
         dummy_run: bool = False,
         skip_attn_for_dummy_run: bool = False,
         mm_inputs: tuple[list[torch.Tensor], torch.Tensor] | None = None,
+        is_profile: Any = None,
     ):
         """Override GPU EagleSpeculator.propose for Ascend NPUs,
         because npu attention metadata needs more information,
@@ -92,6 +93,7 @@
             dummy_run,
             skip_attn_for_dummy_run,
             mm_inputs,
+            is_profile=is_profile,
         )
 
     def generate_draft(
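The Ascend override gains the same is_profile parameter that the upstream EagleSpeculator.propose signature added, and forwards it by keyword so the two signatures stay in lockstep. A sketch of the passthrough pattern with illustrative class names:

from typing import Any

class GPUSpeculatorSketch:  # stand-in for the upstream base class
    def propose(self, target_hidden: str, dummy_run: bool = False,
                is_profile: Any = None) -> str:
        return f"draft(dummy_run={dummy_run}, is_profile={is_profile})"

class NPUSpeculatorSketch(GPUSpeculatorSketch):  # stand-in for the Ascend override
    def propose(self, target_hidden: str, dummy_run: bool = False,
                is_profile: Any = None) -> str:
        # NPU-specific metadata preparation would happen here; forwarding the
        # new argument by keyword means a future upstream reordering cannot
        # silently shift positional arguments.
        return super().propose(target_hidden, dummy_run, is_profile=is_profile)

print(NPUSpeculatorSketch().propose("h", is_profile=True))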