2 changes: 1 addition & 1 deletion .github/workflows/vllm_ascend_test_pr_full.yaml
@@ -74,7 +74,7 @@ jobs:
name: e2e-full
strategy:
matrix:
vllm_version: [ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9, v0.12.0]
vllm_version: [b75f826fca4febb17a76c12a45d5e315111c7618, v0.12.0]
needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
uses: ./.github/workflows/_e2e_test.yaml
6 changes: 3 additions & 3 deletions .github/workflows/vllm_ascend_test_pr_light.yaml
@@ -42,7 +42,7 @@ jobs:
lint:
uses: ./.github/workflows/_pre_commit.yml
with:
vllm: ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9
vllm: b75f826fca4febb17a76c12a45d5e315111c7618
changes:
runs-on: linux-aarch64-a2-0
outputs:
@@ -90,7 +90,7 @@ jobs:
SOC_VERSION: ascend910b1
strategy:
matrix:
vllm_version: [ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9, v0.12.0]
vllm_version: [b75f826fca4febb17a76c12a45d5e315111c7618, v0.12.0]

steps:
- name: Free up disk space
@@ -154,7 +154,7 @@ jobs:
name: e2e-light
strategy:
matrix:
vllm_version: [ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9, v0.12.0]
vllm_version: [b75f826fca4febb17a76c12a45d5e315111c7618, v0.12.0]
# Note (yikun): If CI resources are limited we can split this job into two chained jobs
needs: [lint, changes]
# Only trigger the e2e test after lint has passed and the pull request contains e2e-related changes.
2 changes: 1 addition & 1 deletion docs/source/community/versioning_policy.md
@@ -44,7 +44,7 @@ The table below is the release compatibility matrix for vLLM Ascend release.
For the main branch of vLLM Ascend, we usually keep it compatible with the latest vLLM release and a newer vLLM commit hash. Note that this table is updated regularly, so please check it from time to time.
| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
|-------------|--------------|------------------|-------------|--------------------|
| main | ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9, v0.12.0 tag | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 |
| main | b75f826fca4febb17a76c12a45d5e315111c7618, v0.12.0 tag | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 |

## Release cadence

1 change: 1 addition & 0 deletions tests/e2e/multicard/test_shared_expert_dp.py
@@ -8,6 +8,7 @@

MODELS = [
"deepseek-ai/DeepSeek-V2-Lite",
"deepseek-ai/DeepSeek-V2-Lite",
]
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

7 changes: 5 additions & 2 deletions tests/ut/compilation/test_acl_graph.py
@@ -803,7 +803,9 @@ def test_update_mla_dcp_pcp_params(self, _mock_graph_task_end):
(q_nope, q_pe, k_nope, k_pe, block_table, seq_lens, num_heads,
scale, num_kv_heads, out, lse))

update_mla_attn_dcp_pcp_params(self.update_stream, forward_context, 4)
with patch("torch_npu._C._npu_setStream", return_value=None):
update_mla_attn_dcp_pcp_params(self.update_stream, forward_context,
4)

_mock_graph_task_end.assert_called_once()

@@ -842,6 +844,7 @@ def test_update_attn_dcp_pcp_params(self, _mock_graph_task_end):
block_table, 128, actual_seq_lengths_kv, actual_seq_lengths_q,
out, lse, 2, 0, 0))

update_attn_dcp_pcp_params(self.update_stream, forward_context, 4)
with patch("torch_npu._C._npu_setStream", return_value=None):
update_attn_dcp_pcp_params(self.update_stream, forward_context, 4)

_mock_graph_task_end.assert_called_once()
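Both test updates wrap the call under test in a `patch` of `torch_npu._C._npu_setStream`, so the parameter-update path can be exercised on a host without a real NPU stream. A minimal sketch of the same stubbing pattern, assuming `torch_npu` is importable in the test environment (the helper under test below is hypothetical):

```python
from unittest.mock import patch

def set_update_stream(stream_id):
    # Hypothetical helper that switches to the given stream via the private
    # torch_npu binding; on real hardware this needs an NPU runtime.
    import torch_npu
    torch_npu._C._npu_setStream(stream_id)

def test_set_update_stream_without_npu():
    # Stub the private binding so the test runs on a CPU-only host.
    with patch("torch_npu._C._npu_setStream", return_value=None) as mock_set:
        set_update_stream(3)
    mock_set.assert_called_once_with(3)
```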
4 changes: 4 additions & 0 deletions tests/ut/spec_decode/test_eagle_proposer.py
@@ -95,6 +95,8 @@ def test_load_model_pp1(self, mock_pp_group, mock_get_model,
mock_model = MagicMock()
mock_model.model.embed_tokens = MagicMock()
mock_model.lm_head = MagicMock()
mock_model.multimodal_cpu_fields = None
mock_model.merge_by_field_config = None
mock_get_model.return_value = MagicMock()
self.proposer.name = SpecDcodeType.EAGLE

@@ -117,6 +119,8 @@ def test_load_model_pp_gt1(self, mock_pp_group, mock_get_model,

mock_model = MagicMock()
original_embed = MagicMock()
mock_model.multimodal_cpu_fields = None
mock_model.merge_by_field_config = None
mock_get_model.return_value = MagicMock(model=MagicMock(
embed_tokens=original_embed))

2 changes: 1 addition & 1 deletion vllm_ascend/eplb/utils.py
@@ -24,7 +24,7 @@


def get_expert_map(self, layer_id):
return self.model.layers[layer_id].mlp.experts.get_map()
return self.model.layers[layer_id].mlp.experts.expert_map


def get_log2phy_map(self, layer_id):
26 changes: 16 additions & 10 deletions vllm_ascend/ops/fused_moe/fused_moe.py
@@ -145,7 +145,7 @@ def __init__(self, *args, **kwargs):
AscendFusedMoE.moe_counter += 1
self.moe_instance_id = AscendFusedMoE.moe_counter

self.expert_map = None
self._expert_map = None
self.log2phy = None

if self.quant_config is None:
@@ -172,7 +172,7 @@ def __init__(self, *args, **kwargs):
dtype=vllm_config.model_config.dtype)

# init moe.
self.local_num_experts, self.expert_map, _ = determine_expert_map(
self.local_num_experts, self._expert_map, _ = determine_expert_map(
self.ep_size, self.ep_rank, self.global_num_experts)
# TODO: Temporary flag to indicate if static EPLB is enabled. This is a
# workaround to bypass a quantization check that fails with float weights.
@@ -188,7 +188,7 @@ def __init__(self, *args, **kwargs):
self.expert_load_balancer.get_global_redundant_expert_num())
self.global_num_experts = num_experts + self.global_redundant_expert_num
try:
self.local_num_experts, self.expert_map = (
self.local_num_experts, self._expert_map = (
self.expert_load_balancer.get_rank_placement_map(
self.moe_instance_id, self.ep_rank))
self.log2phy = self.expert_load_balancer.get_rank_log2phy_map(
@@ -204,16 +204,16 @@ def __init__(self, *args, **kwargs):
if self.dynamic_eplb:
self.log2phy = determine_default_log2phy_map(
self.global_num_experts, self.ep_size, self.ep_rank).npu()
if self.expert_map is not None and isinstance(self.expert_map,
torch.Tensor):
if self._expert_map is not None and isinstance(self._expert_map,
torch.Tensor):
logger.info_once(
"[EP Rank %s/%s] Expert parallelism is enabled. Local/global"
" number of experts: %s/%s. Experts local to global index map:"
" %s.", self.ep_rank, self.ep_size, self.local_num_experts,
self.global_num_experts,
get_compressed_expert_map(self.expert_map))
get_compressed_expert_map(self._expert_map))
local_num_experts = (torch.sum(
self.expert_map != -1) if self.expert_map is not None else
self._expert_map != -1) if self._expert_map is not None else
self.global_num_experts)
if self.dynamic_eplb:
self.moe_load = torch.zeros(local_num_experts,
@@ -264,10 +264,16 @@ def _get_quant_type(self) -> QuantType:
return QuantType.NONE

def update_expert_map(self, new_expert_map):
self.expert_map = new_expert_map
self._expert_map = new_expert_map

def get_map(self):
return self.expert_map
@property
def expert_map(self) -> torch.Tensor | None:
return self._expert_map

@expert_map.setter
def expert_map(self, new_expert_map):
# TODO(Potabk): Remove this setter once vLLM v0.12.0 support is dropped (it is kept only for backward compatibility with v0.12.0).
self._expert_map = new_expert_map

def get_log2phy_map(self):
return self.log2phy
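Together with the `get_expert_map` change in `vllm_ascend/eplb/utils.py` above, this turns `expert_map` into a property backed by the private `_expert_map`, so code written against vLLM v0.12.0 that assigns the attribute directly keeps working while internal code goes through `update_expert_map`. A minimal standalone sketch of the pattern (not the real `AscendFusedMoE` class):

```python
import torch

class MoELayerSketch:
    """Illustrative stand-in for AscendFusedMoE's expert-map handling."""

    def __init__(self, expert_map: torch.Tensor):
        self._expert_map = expert_map

    def update_expert_map(self, new_expert_map: torch.Tensor) -> None:
        # Path used internally, e.g. by dynamic EPLB.
        self._expert_map = new_expert_map

    @property
    def expert_map(self) -> torch.Tensor | None:
        return self._expert_map

    @expert_map.setter
    def expert_map(self, new_expert_map: torch.Tensor) -> None:
        # Kept only so vLLM v0.12.0-style direct assignment still works.
        self._expert_map = new_expert_map

layer = MoELayerSketch(torch.tensor([0, 1, -1, -1]))
layer.expert_map = torch.tensor([-1, -1, 0, 1])      # v0.12.0-style assignment
layer.update_expert_map(torch.tensor([0, 1, 2, 3]))  # explicit update path
print(layer.expert_map)                               # tensor([0, 1, 2, 3])
```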
7 changes: 6 additions & 1 deletion vllm_ascend/patch/platform/__init__.py
@@ -17,10 +17,15 @@
import os

import vllm_ascend.patch.platform.patch_distributed # noqa
import vllm_ascend.patch.platform.patch_ec_connector # noqa
import vllm_ascend.patch.platform.patch_mamba_config # noqa
import vllm_ascend.patch.platform.patch_sched_yield # noqa
from vllm_ascend.utils import vllm_version_is

if os.getenv("DYNAMIC_EPLB", "false").lower() in ("true", "1") or os.getenv(
"EXPERT_MAP_RECORD", "false") == "true":
import vllm_ascend.patch.platform.patch_multiproc_executor # noqa

if vllm_version_is("0.12.0"):
import vllm_ascend.patch.platform.patch_ec_connector012 # noqa
else:
import vllm_ascend.patch.platform.patch_ec_connector # noqa
13 changes: 6 additions & 7 deletions vllm_ascend/patch/platform/patch_ec_connector.py
@@ -1,16 +1,15 @@
import vllm.distributed.ec_transfer.ec_connector.shared_storage_connector
import vllm.distributed.ec_transfer.ec_connector.example_connector
from safetensors.torch import load_file
from vllm.distributed.ec_transfer.ec_connector.base import ECConnectorMetadata
from vllm.distributed.ec_transfer.ec_connector.shared_storage_connector import (
ECSharedStorageConnector, ECSharedStorageConnectorMetadata)
from vllm.distributed.ec_transfer.ec_connector.example_connector import (
ECConnectorMetadata, ECExampleConnector)
from vllm.logger import logger


class AscendECSharedStorageConnector(ECSharedStorageConnector):
class AscendECExampleConnector(ECExampleConnector):

def start_load_caches(self, encoder_cache, **kwargs) -> None:
metadata: ECConnectorMetadata = self._get_connector_metadata()
assert isinstance(metadata, ECSharedStorageConnectorMetadata)
assert isinstance(metadata, ECConnectorMetadata)
assert encoder_cache is not None
if metadata is None:
logger.warning((
@@ -29,4 +28,4 @@ def start_load_caches(self, encoder_cache, **kwargs) -> None:
mm_data.mm_hash)


vllm.distributed.ec_transfer.ec_connector.shared_storage_connector.ECSharedStorageConnector = AscendECSharedStorageConnector
vllm.distributed.ec_transfer.ec_connector.example_connector.ECExampleConnector = AscendECExampleConnector
33 changes: 33 additions & 0 deletions vllm_ascend/patch/platform/patch_ec_connector012.py
@@ -0,0 +1,33 @@
import vllm.distributed.ec_transfer.ec_connector.shared_storage_connector # type: ignore[import-not-found] # noqa
from safetensors.torch import load_file
from vllm.distributed.ec_transfer.ec_connector.base import \
ECConnectorMetadata # type: ignore[import-not-found] # noqa
from vllm.distributed.ec_transfer.ec_connector.shared_storage_connector import ( # type: ignore[import-not-found] # noqa
ECSharedStorageConnector, ECSharedStorageConnectorMetadata)
from vllm.logger import logger


class AscendECSharedStorageConnector(ECSharedStorageConnector):

def start_load_caches(self, encoder_cache, **kwargs) -> None:
metadata: ECConnectorMetadata = self._get_connector_metadata()
assert isinstance(metadata, ECSharedStorageConnectorMetadata)
assert encoder_cache is not None
if metadata is None:
logger.warning((
"In connector.start_load_caches, ",
"but the connector metadata is None",
))
return
# Load the EC for each mm data
for mm_data in metadata.mm_datas:
if mm_data.mm_hash in encoder_cache:
continue
filename = self._generate_filename_debug(mm_data.mm_hash)
ec_cache = load_file(filename)["ec_cache"].npu()
encoder_cache[mm_data.mm_hash] = ec_cache
logger.debug("Success load encoder cache for hash %s",
mm_data.mm_hash)


vllm.distributed.ec_transfer.ec_connector.shared_storage_connector.ECSharedStorageConnector = AscendECSharedStorageConnector
4 changes: 4 additions & 0 deletions vllm_ascend/platform.py
@@ -365,6 +365,10 @@ def get_attn_backend_cls(
use_mla,
has_sink=False,
use_sparse=False,
# NOTE: Keep these parameters in this exact order. Although we only use
# some of them so far, vLLM passes them positionally when calling this
# interface.
use_mm_prefix: bool = False,
attn_type: str | None = None,
):
# choose attention backend based on use_mla
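Because vLLM may call this hook positionally, `use_mm_prefix` has to sit exactly where the caller expects it, i.e. before `attn_type`. A trimmed-down sketch of why the order matters (only the tail of the real signature is reproduced; the caller below is illustrative, not vLLM's actual call site):

```python
def get_attn_backend_cls_sketch(use_mla, has_sink=False, use_sparse=False,
                                use_mm_prefix=False, attn_type=None):
    # Only the tail of the real signature is mirrored here.
    return {"use_mm_prefix": use_mm_prefix, "attn_type": attn_type}

# A caller that passes everything positionally, as the NOTE above warns:
print(get_attn_backend_cls_sketch(True, False, False, True, "decoder"))
# -> {'use_mm_prefix': True, 'attn_type': 'decoder'}
# If use_mm_prefix and attn_type were declared in the opposite order, the same
# positional call would silently bind True to attn_type instead.
```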
Empty file added vllm_ascend/pool/__init__.py
Empty file.
11 changes: 11 additions & 0 deletions vllm_ascend/pool/medatata.py
@@ -0,0 +1,11 @@
import torch


class PoolingStates:
# NOTE: This should be removed after we drop support of vLLM v0.12.0
def __init__(self):
# for chunked prefill with ALL pooling
self.hidden_states_cache: list[torch.Tensor] = []

def clean(self):
self.hidden_states_cache.clear()
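A hedged usage sketch for `PoolingStates`: with chunked prefill and ALL pooling, each prefill chunk's hidden states can be cached and concatenated once the whole prompt has been processed. The runner-side loop below is an assumption for illustration, not the actual model-runner code; it assumes the `PoolingStates` class defined above is importable.

```python
import torch

states = PoolingStates()

# Hypothetical per-chunk prefill loop: each chunk of a long prompt yields a
# [chunk_len, hidden_size] tensor that is cached until the prompt finishes.
for chunk_len in (512, 512, 256):
    hidden = torch.randn(chunk_len, 1024)
    states.hidden_states_cache.append(hidden)

# After the last chunk, ALL pooling can see the full sequence at once.
full_hidden = torch.cat(states.hidden_states_cache, dim=0)  # shape [1280, 1024]
states.clean()
```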
7 changes: 4 additions & 3 deletions vllm_ascend/utils.py
@@ -475,9 +475,10 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:

# Calculate maximum supported batch sizes considering model architecture
resources_per_graph = num_hidden_layers + 1
if vllm_config.speculative_config is not None:
draft_model_hf_config = vllm_config.speculative_config.draft_model_config.hf_config
resources_per_graph += draft_model_hf_config.num_hidden_layers + 1
# Suffix decoding provides a speculative_config without a draft_model_config, so only count draft-model layers when one is present.
if (spec := vllm_config.speculative_config) and \
(draft := spec.draft_model_config):
resources_per_graph += draft.hf_config.num_hidden_layers + 1

# TODO: Find out whether we need to take into account the pp_size
num_comm_groups = sum(size > 1 for size in [
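A small worked example of the updated accounting, assuming (as the surrounding code implies) that each hidden layer plus one extra slot costs one graph resource; suffix decoding supplies a `speculative_config` but no `draft_model_config`, so it no longer adds draft layers.

```python
# Illustrative numbers only.
num_hidden_layers = 32
resources_per_graph = num_hidden_layers + 1        # 33 for the target model

draft_num_hidden_layers = 1                        # e.g. an EAGLE-style draft head
has_draft_model_config = True                      # False for suffix decoding
if has_draft_model_config:
    resources_per_graph += draft_num_hidden_layers + 1   # 33 + 2 = 35

print(resources_per_graph)
```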