2 changes: 1 addition & 1 deletion .github/workflows/bot_pr_create.yaml
@@ -37,7 +37,7 @@ jobs:
steps:
- name: Get vLLM version
run: |
-VLLM_COMMIT=d68209402ddab3f54a09bc1f4de9a9495a283b60
+VLLM_COMMIT=8ebf271bb6d1e7e9b1a55be73d755ef1a57dbbe5
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV

- name: Checkout repository
2 changes: 1 addition & 1 deletion .github/workflows/pr_test_full.yaml
@@ -75,7 +75,7 @@ jobs:
name: e2e-full
strategy:
matrix:
-vllm_version: [d68209402ddab3f54a09bc1f4de9a9495a283b60, v0.14.0]
+vllm_version: [8ebf271bb6d1e7e9b1a55be73d755ef1a57dbbe5, v0.14.0]
needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
uses: ./.github/workflows/_e2e_test.yaml
6 changes: 3 additions & 3 deletions .github/workflows/pr_test_light.yaml
@@ -41,7 +41,7 @@ jobs:
lint:
uses: ./.github/workflows/_pre_commit.yml
with:
-vllm: d68209402ddab3f54a09bc1f4de9a9495a283b60
+vllm: 8ebf271bb6d1e7e9b1a55be73d755ef1a57dbbe5
changes:
runs-on: linux-aarch64-a2-0
outputs:
@@ -84,7 +84,7 @@ jobs:
if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
strategy:
matrix:
-vllm_version: [d68209402ddab3f54a09bc1f4de9a9495a283b60, v0.14.0]
+vllm_version: [8ebf271bb6d1e7e9b1a55be73d755ef1a57dbbe5, v0.14.0]
uses: ./.github/workflows/_unit_test.yaml
with:
vllm: ${{ matrix.vllm_version }}
@@ -96,7 +96,7 @@ jobs:
name: e2e-light
strategy:
matrix:
-vllm_version: [d68209402ddab3f54a09bc1f4de9a9495a283b60, v0.14.0]
+vllm_version: [8ebf271bb6d1e7e9b1a55be73d755ef1a57dbbe5, v0.14.0]
# Note (yikun): If CI resources are limited, we can split this job into two chained jobs
needs: [lint, changes]
# Only trigger the e2e test after lint has passed and the change is e2e-related to the pull request.
2 changes: 1 addition & 1 deletion .github/workflows/schedule_codecov_refresh.yaml
@@ -33,7 +33,7 @@ jobs:
name: refresh codecov
strategy:
matrix:
-vllm_version: [d68209402ddab3f54a09bc1f4de9a9495a283b60]
+vllm_version: [8ebf271bb6d1e7e9b1a55be73d755ef1a57dbbe5]
uses: ./.github/workflows/_unit_test.yaml
with:
vllm: ${{ matrix.vllm_version }}
2 changes: 1 addition & 1 deletion docs/source/community/versioning_policy.md
@@ -53,7 +53,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL

| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
|-------------|--------------|------------------|-------------|--------------------|
-| main | d68209402ddab3f54a09bc1f4de9a9495a283b60, v0.14.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
+| main | 8ebf271bb6d1e7e9b1a55be73d755ef1a57dbbe5, v0.14.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |

## Release cadence

9 changes: 7 additions & 2 deletions tests/ut/attention/test_mla_v1.py
@@ -17,6 +17,7 @@
AscendMLAPrefillMetadata,
ChunkedContextMetadata)
from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
+from vllm_ascend.utils import vllm_version_is


class TestAscendMLABackend(TestBase):
@@ -226,7 +227,9 @@ def mock_parent_init(self, kv_cache_spec, layer_names, vllm_config,
)

self.parent_init_patcher = patch(
"vllm.v1.attention.backends.mla.common.MLACommonMetadataBuilder.__init__",
("vllm.v1.attention.backends.mla.common.MLACommonMetadataBuilder.__init__"
if vllm_version_is('0.14.0') else
"vllm.model_executor.layers.attention.mla_attention.MLACommonMetadataBuilder.__init__"),
mock_parent_init)
self.parent_init_patcher.start()

@@ -452,7 +455,9 @@ def mock_parent_init(self, kv_cache_spec, layer_names, vllm_config,
)

self.parent_init_patcher = patch(
"vllm.v1.attention.backends.mla.common.MLACommonMetadataBuilder.__init__",
("vllm.v1.attention.backends.mla.common.MLACommonMetadataBuilder.__init__"
if vllm_version_is('0.14.0') else
"vllm.model_executor.layers.attention.mla_attention.MLACommonMetadataBuilder.__init__"),
mock_parent_init)
self.parent_init_patcher.start()
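
The parent-class patch target above is switched with vllm_ascend's vllm_version_is helper. As a rough illustration, the helper can be thought of as a plain string comparison against the installed vLLM release. A minimal sketch under that assumption (the real implementation in vllm_ascend/utils.py may normalize or cache the version):

    # Hypothetical sketch of vllm_version_is; the actual helper in
    # vllm_ascend/utils.py may differ in details.
    from vllm import __version__ as VLLM_VERSION

    def vllm_version_is(target_version: str) -> bool:
        # True when the installed vLLM release matches the pinned target,
        # e.g. vllm_version_is('0.14.0').
        return VLLM_VERSION == target_version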

6 changes: 4 additions & 2 deletions tests/ut/attention/test_sfa_v1.py
@@ -12,7 +12,7 @@
from vllm_ascend.attention.sfa_v1 import (AscendSFABackend, AscendSFAImpl,
AscendSFAMetadata,
AscendSFAMetadataBuilder)
-from vllm_ascend.utils import enable_dsa_cp
+from vllm_ascend.utils import enable_dsa_cp, vllm_version_is


class TestAscendSFABackend(TestBase):
@@ -117,7 +117,9 @@ def mock_parent_init(self, kv_cache_spec, layer_names, vllm_config,
)

self.parent_init_patcher = patch(
"vllm.v1.attention.backends.mla.common.MLACommonMetadataBuilder.__init__",
("vllm.v1.attention.backends.mla.common.MLACommonMetadataBuilder.__init__"
if vllm_version_is('0.14.0') else
"vllm.model_executor.layers.attention.mla_attention.MLACommonMetadataBuilder.__init__"),
mock_parent_init)
self.parent_init_patcher.start()

21 changes: 19 additions & 2 deletions tests/ut/eplb/core/test_eplb_utils.py
@@ -9,6 +9,7 @@

from vllm_ascend.ascend_config import init_ascend_config
from vllm_ascend.eplb.core.eplb_utils import init_eplb_config
+from vllm_ascend.utils import vllm_version_is
# isort: on


@@ -20,8 +21,24 @@ def setUp(self, mock_fix_incompatible_config):
"refresh": True,
"eplb_config": {"dynamic_eplb": True, "num_redundant_experts": 2},
}
-moe_parallel_config = FusedMoEParallelConfig(2, 0, 1, 2, 1, 1, 1, 1, True, "hccl")
-moe_config = FusedMoEConfig(8, 8, 8192, 5, moe_parallel_config, torch.float16)
+if vllm_version_is('0.14.0'):
+    moe_parallel_config = FusedMoEParallelConfig(2, 0, 1, 2, 1, 1, 1, 1, True, "hccl")
+    moe_config = FusedMoEConfig(8, 8, 8192, 5, moe_parallel_config, torch.float16)
+else:
+    from vllm.model_executor.layers.fused_moe.config import RoutingMethodType
+    moe_parallel_config = FusedMoEParallelConfig(2, 0, 1, 2, 1, 1, 1, 1, True, "hccl", enable_eplb=True)
+    moe_config = FusedMoEConfig(
+        num_experts=8,
+        experts_per_token=8,
+        hidden_dim=8192,
+        intermediate_size_per_partition=5,
+        num_local_experts=8,
+        activation="silu",
+        device="npu",
+        routing_method=RoutingMethodType.Simulated,
+        moe_parallel_config=moe_parallel_config,
+        in_dtype=torch.float16,
+    )
+    moe_config.supports_eplb = True
self.vllm_config = vllm_config
self.moe_config = moe_config
19 changes: 12 additions & 7 deletions tests/ut/spec_decode/test_eagle_proposer.py
@@ -51,6 +51,7 @@ def tearDown(self):
def test_initialization_eagle_graph(self):
self.vllm_config.speculative_config.method = "eagle"
self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 4096
+self.vllm_config.speculative_config.draft_model_config.uses_mrope = False
self.vllm_config.compilation_config.mode = CompilationMode.VLLM_COMPILE
self.vllm_config.model_config.enforce_eager = False
self.vllm_config.model_config.uses_mrope = False
@@ -65,10 +66,11 @@ def test_initialization_eagle_graph(self):
self.assertEqual(proposer.hidden_size, 4096)
self.assertTrue(proposer.use_cuda_graph)

-self.assertEqual(proposer.input_ids.shape, (1024, ))
-self.assertEqual(proposer.positions.shape, (1024, ))
-self.assertEqual(proposer.hidden_states.shape, (1024, 4096))
-self.assertEqual(proposer.arange.shape, (1024, ))
+expected_max_num_tokens = proposer.max_num_tokens
+self.assertEqual(proposer.input_ids.shape, (expected_max_num_tokens, ))
+self.assertEqual(proposer.positions.shape, (expected_max_num_tokens, ))
+self.assertEqual(proposer.hidden_states.shape, (expected_max_num_tokens, 4096))
+self.assertEqual(proposer.arange.shape, (expected_max_num_tokens, ))

def test_initialization_eagle3_enforce_eager(self):
self.vllm_config.speculative_config.method = "eagle3"
@@ -83,7 +85,8 @@ def test_initialization_eagle3_enforce_eager(self):

self.assertEqual(proposer.hidden_size, 2048)
self.assertFalse(proposer.use_cuda_graph)
-self.assertEqual(proposer.hidden_states.shape, (1024, 2048))
+expected_max_num_tokens = proposer.max_num_tokens
+self.assertEqual(proposer.hidden_states.shape, (expected_max_num_tokens, 2048))

def test_initialization_eagle3_full_graph_async(self):
self.vllm_config.speculative_config.method = "eagle3"
@@ -100,7 +103,8 @@ def test_initialization_eagle3_full_graph_async(self):

self.assertEqual(proposer.hidden_size, 2048)
self.assertTrue(proposer.use_cuda_graph)
-self.assertEqual(proposer.hidden_states.shape, (1024, 2048))
+expected_max_num_tokens = proposer.max_num_tokens
+self.assertEqual(proposer.hidden_states.shape, (expected_max_num_tokens, 2048))

def test_initialization_mtp_full_graph_async(self):
self.vllm_config.speculative_config.method = "mtp"
@@ -117,7 +121,8 @@ def test_initialization_mtp_full_graph_async(self):

self.assertEqual(proposer.hidden_size, 2048)
self.assertFalse(proposer.use_cuda_graph)
-self.assertEqual(proposer.hidden_states.shape, (1024, 2048))
+expected_max_num_tokens = proposer.max_num_tokens
+self.assertEqual(proposer.hidden_states.shape, (expected_max_num_tokens, 2048))


class TestEagleProposerLoadModel(TestBase):
1 change: 1 addition & 0 deletions tests/ut/spec_decode/test_mtp_proposer.py
@@ -33,6 +33,7 @@ def vllm_config(self):
config.speculative_config.method = "mtp"
config.speculative_config.draft_model_config = MagicMock()
config.speculative_config.draft_model_config.get_hidden_size.return_value = 4096
+config.speculative_config.draft_model_config.uses_mrope = False
config.speculative_config.speculative_token_tree = str([
(i + 1) * (0, ) for i in range(2)
])
25 changes: 16 additions & 9 deletions vllm_ascend/ascend_forward_context.py
@@ -19,6 +19,7 @@
is_drafter_moe_model,
is_moe_model,
speculative_enable_dispatch_gmm_combine_decode,
+vllm_version_is,
)


@@ -42,20 +43,26 @@ def set_ascend_forward_context(
batch_descriptor: BatchDescriptor | None = None,
model_instance: torch.nn.Module = None,
is_draft_model=False,
+skip_compiled: bool = False,
):
"""A context manager that stores the current forward context,
can be attention metadata, etc.
We add some additional param into forward_context.
"""
-    with set_forward_context(
-        attn_metadata,
-        vllm_config,
-        virtual_engine=virtual_engine,
-        num_tokens=num_tokens,
-        num_tokens_across_dp=num_tokens_across_dp,
-        cudagraph_runtime_mode=aclgraph_runtime_mode,
-        batch_descriptor=batch_descriptor,
-    ):
+    forward_context_kwargs = {
+        "attn_metadata": attn_metadata,
+        "vllm_config": vllm_config,
+        "virtual_engine": virtual_engine,
+        "num_tokens": num_tokens,
+        "num_tokens_across_dp": num_tokens_across_dp,
+        "cudagraph_runtime_mode": aclgraph_runtime_mode,
+        "batch_descriptor": batch_descriptor,
+    }
+
+    if not vllm_version_is("0.14.0"):
+        forward_context_kwargs["skip_compiled"] = skip_compiled
+
+    with set_forward_context(**forward_context_kwargs):
forward_context = get_forward_context()

from vllm_ascend.ops.fused_moe.moe_comm_method import get_moe_comm_method
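
For illustration, a hypothetical call site for the refactored context manager is sketched below; the argument values and surrounding names (vllm_config, model, input_ids, positions) are invented for this example, and skip_compiled is accepted but simply not forwarded on vLLM 0.14.0:

    # Hypothetical usage sketch; values are illustrative only.
    with set_ascend_forward_context(
            attn_metadata=None,
            vllm_config=vllm_config,
            num_tokens=16,
            skip_compiled=True,  # forwarded only on post-0.14.0 vLLM
    ):
        hidden_states = model(input_ids, positions)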
10 changes: 8 additions & 2 deletions vllm_ascend/attention/mla_v1.py
@@ -12,7 +12,6 @@
from vllm.utils.math_utils import cdiv, round_down
from vllm.v1.attention.backend import ( # type: ignore
AttentionBackend, AttentionCGSupport, MLAAttentionImpl)
-from vllm.v1.attention.backends.mla.common import MLACommonMetadataBuilder
from vllm.v1.attention.backends.utils import PAD_SLOT_ID # type: ignore
from vllm.v1.kv_cache_interface import AttentionSpec, MLAAttentionSpec

@@ -38,12 +37,19 @@
from vllm_ascend.ops.weight_prefetch import maybe_npu_prefetch
from vllm_ascend.quantization.w8a8 import AscendW8A8LinearMethod
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, maybe_trans_nz,
-                               weak_ref_tensors)
+                               weak_ref_tensors, vllm_version_is)
from vllm_ascend.worker.npu_input_batch import NPUInputBatch


if TYPE_CHECKING:
from vllm.v1.core.sched.output import SchedulerOutput

+# isort: off
+if vllm_version_is('0.14.0'):
+    from vllm.v1.attention.backends.mla.common import MLACommonMetadataBuilder  # type: ignore
+else:
+    from vllm.model_executor.layers.attention.mla_attention import MLACommonMetadataBuilder
+# isort: on

MAX_O_PROJ_PREFETCH_SIZE = 16 * 1024 * 1024
BUILD_METADATA_STEP_PREFILL = 0
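
On the design choice: the explicit vllm_version_is gate keeps both import paths visible and pins each to a known vLLM release. A version-agnostic alternative, shown purely as a hypothetical sketch rather than what this PR does, would probe the new module first and fall back on ImportError:

    # Hypothetical alternative to the explicit version gate above.
    try:
        from vllm.model_executor.layers.attention.mla_attention import (
            MLACommonMetadataBuilder)
    except ImportError:
        # Older layout, used by the vLLM v0.14.0 tag.
        from vllm.v1.attention.backends.mla.common import (
            MLACommonMetadataBuilder)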
8 changes: 6 additions & 2 deletions vllm_ascend/attention/sfa_v1.py
@@ -14,7 +14,6 @@
from vllm.triton_utils import HAS_TRITON
from vllm.v1.attention.backend import ( # type: ignore
AttentionBackend, AttentionCGSupport, MLAAttentionImpl)
-from vllm.v1.attention.backends.mla.common import MLACommonMetadataBuilder
from vllm.v1.kv_cache_interface import AttentionSpec

from vllm_ascend import envs
@@ -37,11 +36,16 @@
from vllm_ascend.ops.weight_prefetch import maybe_npu_prefetch
from vllm_ascend.quantization.w8a8 import AscendW8A8LinearMethod
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, _round_up, dispose_layer,
-                               enable_dsa_cp, enable_dsa_cp_with_layer_shard, maybe_trans_nz)
+                               enable_dsa_cp, enable_dsa_cp_with_layer_shard, maybe_trans_nz, vllm_version_is)
from vllm_ascend.worker.npu_input_batch import NPUInputBatch

if TYPE_CHECKING:
from vllm.v1.core.sched.output import SchedulerOutput
+if vllm_version_is('0.14.0'):
+    from vllm.v1.attention.backends.mla.common import MLACommonMetadataBuilder  # type: ignore
+else:
+    from vllm.model_executor.layers.attention.mla_attention import MLACommonMetadataBuilder
+# isort: on

# token count limits within bmm_transpose operator
BMM_TRANS_MAX_SUPPORTED_TOKENS = 1024