2 changes: 1 addition & 1 deletion .github/workflows/vllm_ascend_test.yaml
@@ -43,7 +43,7 @@ jobs:
max-parallel: 2
matrix:
os: [linux-arm64-npu-1, linux-arm64-npu-4]
vllm_version: [main, v0.8.5.post1]
vllm_version: [main, v0.9.0]
concurrency:
group: >
${{
2 changes: 1 addition & 1 deletion .github/workflows/vllm_ascend_test_long_term.yaml
@@ -41,7 +41,7 @@ jobs:
strategy:
max-parallel: 2
matrix:
vllm_version: [main, v0.8.5.post1]
vllm_version: [main, v0.9.0]
name: vLLM Ascend long term test
runs-on: linux-arm64-npu-1
container:
2 changes: 1 addition & 1 deletion .github/workflows/vllm_ascend_test_pd.yaml
@@ -40,7 +40,7 @@ jobs:
if: ${{ contains(github.event.pull_request.labels.*.name, 'pd-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'schedule' }}
strategy:
matrix:
vllm_verison: [main, v0.8.5.post1]
vllm_verison: [main, v0.9.0]
name: vLLM Ascend prefilling decoding disaggregation test
runs-on: linux-arm64-npu-static-8

2 changes: 1 addition & 1 deletion Dockerfile
@@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.8.5.post1
ARG VLLM_TAG=v0.9.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
2 changes: 1 addition & 1 deletion Dockerfile.openEuler
@@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.8.5.post1
ARG VLLM_TAG=v0.9.0

RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
6 changes: 3 additions & 3 deletions tests/long_term/spec_decode/e2e/conftest.py
@@ -26,9 +26,9 @@
from vllm import SamplingParams
from vllm.sequence import PromptLogprobs, SampleLogprobs

from ....model_utils import (TokensTextLogprobs,
TokensTextLogprobsPromptLogprobs,
check_logprobs_close, check_outputs_equal)
from tests.model_utils import (TokensTextLogprobs,
TokensTextLogprobsPromptLogprobs,
check_logprobs_close, check_outputs_equal)

PROMPTS = [
"Hello, my name is",
14 changes: 5 additions & 9 deletions vllm_ascend/attention/attention_v1.py
@@ -30,7 +30,6 @@
from vllm.v1.worker.gpu_input_batch import InputBatch

from vllm_ascend.ops.attention import vanilla_chunked_prefill
from vllm_ascend.utils import vllm_version_is


class AscendAttentionBackend(AttentionBackend):
@@ -142,14 +141,11 @@ def reorder_batch(self, input_batch: "InputBatch",

def build(self, num_reqs, num_actual_tokens, max_query_len,
common_prefix_len):
if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
block_table = (self.runner.input_batch.block_table.
get_device_tensor()[:num_reqs])
else:
block_table = self.runner.input_batch.block_table[
0].get_device_tensor()
block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
block_table[:num_reqs])

block_table = self.runner.input_batch.block_table[0].get_device_tensor(
)
block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
block_table[:num_reqs])

query_lens = self.runner.query_lens
seq_lens = self.runner.seq_lens_cpu[:num_reqs]
14 changes: 5 additions & 9 deletions vllm_ascend/attention/mla_v1.py
@@ -16,7 +16,6 @@

from vllm_ascend.attention.attention_v1 import AscendAttentionState
from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla
from vllm_ascend.utils import vllm_version_is
from vllm_ascend.worker.model_runner_v1 import NPUModelRunner

if TYPE_CHECKING:
@@ -239,14 +238,11 @@ def build(self,
# function. We should avoid GPU -> CPU sync as much as possible because
# it blocks on all previous kernels.
device = self.runner.device
if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
block_table = (self.runner.input_batch.block_table.
get_device_tensor()[:num_reqs])
else:
block_table = self.runner.input_batch.block_table[
0].get_device_tensor()
block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
block_table[:num_reqs])

block_table = self.runner.input_batch.block_table[0].get_device_tensor(
)
block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
block_table[:num_reqs])
slot_mapping = self.runner.slot_mapping_cpu[:num_actual_tokens].to(
device, non_blocking=True)
input_positions = self.runner.positions_cpu[:num_actual_tokens].to(
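The comment retained in the `mla_v1.py` hunk above ("We should avoid GPU -> CPU sync as much as possible") is the reason the surviving code keeps its `.to(device, non_blocking=True)` copies. A minimal, generic sketch of that pattern, written against plain PyTorch on CUDA rather than the runner's real NPU tensors (the names here are illustrative only):

```python
# Sketch of the asynchronous host-to-device copy pattern kept in build().
# Plain torch/CUDA stand-in; the real runner moves slot_mapping/positions to an NPU device.
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
    # The copy is only truly asynchronous when the source host tensor is pinned.
    slot_mapping_cpu = torch.arange(4096, dtype=torch.int64).pin_memory()
    slot_mapping = slot_mapping_cpu.to(device, non_blocking=True)  # returns immediately
    # ...enqueue kernels that consume slot_mapping on the same stream...
    torch.cuda.synchronize()  # block only once, when the host actually needs the result
```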
108 changes: 38 additions & 70 deletions vllm_ascend/ops/fused_moe.py
@@ -26,18 +26,10 @@
tensor_model_parallel_all_reduce)
from vllm.distributed.parallel_state import get_dp_group
from vllm.model_executor.layers.fused_moe.layer import (
FusedMoE, UnquantizedFusedMoEMethod, determine_expert_map)

from vllm_ascend.utils import vllm_version_is

if not (vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1")):
from vllm.model_executor.layers.fused_moe.layer import (
FusedMoEParallelConfig, MoEConfig)
else:
MoEConfig = None

from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig, QuantizeMethodBase)
FusedMoE, FusedMoEParallelConfig, MoEConfig, UnquantizedFusedMoEMethod,
determine_expert_map)
from vllm.model_executor.layers.quantization.base_config import \
QuantizationConfig

import vllm_ascend.envs as envs_ascend
from vllm_ascend.distributed.parallel_state import get_ep_group, get_etp_group
@@ -587,10 +579,8 @@ def select_experts(
class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):

def __init__(self, moe: MoEConfig = None):
if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
super().__init__()
else:
super().__init__(moe=moe)

super().__init__(moe=moe)
vllm_config = get_current_vllm_config()

ep_group = get_ep_group()
@@ -731,24 +721,17 @@ def __init__(
params_dtype = torch.get_default_dtype()

vllm_config = get_current_vllm_config()
if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
self.ep_size = get_ep_group().world_size
self.tp_size = get_etp_group().world_size
self.dp_size = (dp_size if dp_size is not None else
get_dp_group().world_size)
self.dp_rank = (0 if self.dp_size == 1 else
get_dp_group().rank_in_group)
else:
self.moe_parallel_config: FusedMoEParallelConfig = (
FusedMoEParallelConfig.make(
tp_size_=(tp_size if tp_size is not None else
get_tensor_model_parallel_world_size()),
dp_size_=(dp_size if dp_size is not None else
get_dp_group().world_size),
vllm_parallel_config=vllm_config.parallel_config))

self.moe_parallel_config.ep_size = get_ep_group().world_size
self.moe_parallel_config.tp_size = get_etp_group().world_size
self.moe_parallel_config: FusedMoEParallelConfig = (
FusedMoEParallelConfig.make(
tp_size_=(tp_size if tp_size is not None else
get_tensor_model_parallel_world_size()),
dp_size_=(dp_size if dp_size is not None else
get_dp_group().world_size),
vllm_parallel_config=vllm_config.parallel_config))

self.moe_parallel_config.ep_size = get_ep_group().world_size
self.moe_parallel_config.tp_size = get_etp_group().world_size

self.top_k = top_k
self.num_experts = num_experts
@@ -773,54 +756,39 @@ def __init__(
self.local_num_experts, self.expert_map = determine_expert_map(
self.ep_size,
get_ep_group().rank_in_group, self.global_num_experts)
if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
self.tp_rank = get_etp_group().rank_in_group
self.ep_rank = get_ep_group().rank_in_group
else:
self.moe_parallel_config.tp_rank = get_etp_group(
).rank_in_group
self.moe_parallel_config.ep_rank = get_ep_group().rank_in_group

self.moe_parallel_config.tp_rank = get_etp_group().rank_in_group
self.moe_parallel_config.ep_rank = get_ep_group().rank_in_group

else:
# Adjust TP size for DP attention
# haven't test its functionality yet, may remove in the future
if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
self.tp_rank = self.tp_size * self.dp_rank
self.ep_rank = 0
self.tp_size = self.tp_size * self.dp_size
self.ep_size = 1
else:
self.moe_parallel_config.tp_rank = self.tp_size * self.dp_rank
self.moe_parallel_config.ep_rank = 0
self.moe_parallel_config.tp_size = self.tp_size * self.dp_size
self.moe_parallel_config.ep_size = 1

self.moe_parallel_config.tp_rank = self.tp_size * self.dp_rank
self.moe_parallel_config.ep_rank = 0
self.moe_parallel_config.tp_size = self.tp_size * self.dp_size
self.moe_parallel_config.ep_size = 1

self.local_num_experts, self.expert_map = (self.global_num_experts,
None)
if self.scoring_func != "softmax" and not self.use_grouped_topk:
raise ValueError("Only softmax scoring function is supported for "
"non-grouped topk.")
if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
if quant_config is None:
self.quant_method: Optional[QuantizeMethodBase] = (
AscendUnquantizedFusedMoEMethod())
else:
self.quant_method = quant_config.get_quant_method(self, prefix)
else:
moe = MoEConfig(
num_experts=self.global_num_experts,
experts_per_token=top_k,
hidden_dim=hidden_size,
num_local_experts=self.local_num_experts,
moe_parallel_config=self.moe_parallel_config,
# TODO (bnell): this needs to be fixed for quantized types.
in_dtype=params_dtype,
)

if quant_config is None:
self.quant_method = AscendUnquantizedFusedMoEMethod(moe)
else:
self.quant_method = quant_config.get_quant_method(self, prefix)
moe = MoEConfig(
num_experts=self.global_num_experts,
experts_per_token=top_k,
hidden_dim=hidden_size,
num_local_experts=self.local_num_experts,
moe_parallel_config=self.moe_parallel_config,
# TODO (bnell): this needs to be fixed for quantized types.
in_dtype=params_dtype,
)

if quant_config is None:
self.quant_method = AscendUnquantizedFusedMoEMethod(moe)
else:
self.quant_method = quant_config.get_quant_method(self, prefix)

assert self.quant_method is not None

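For context on the `determine_expert_map` call that survives this refactor: it returns the number of experts owned by the current expert-parallel rank together with a global-to-local index map. A toy re-implementation of the idea, assuming experts divide evenly across EP ranks (this is not vllm's actual helper):

```python
# Toy illustration of expert-parallel sharding; NOT vllm's determine_expert_map.
import torch


def toy_expert_map(ep_size: int, ep_rank: int, global_num_experts: int):
    """Return (local_num_experts, expert_map); -1 marks experts owned by other ranks."""
    per_rank = global_num_experts // ep_size
    start = ep_rank * per_rank
    expert_map = torch.full((global_num_experts,), -1, dtype=torch.int32)
    expert_map[start:start + per_rank] = torch.arange(per_rank, dtype=torch.int32)
    return per_rank, expert_map


local_num_experts, expert_map = toy_expert_map(ep_size=4, ep_rank=1, global_num_experts=16)
# On rank 1, global experts 4..7 map to local slots 0..3; every other entry stays -1.
```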
11 changes: 2 additions & 9 deletions vllm_ascend/patch/__init__.py
@@ -24,16 +24,9 @@
# each worker's `__init__` function.
#
# Then in each kind of patch, there are three folders:
# - patch_0_8_5: contains the patches applied when vllm version is 0.8.5.
# - patch_0_9_0: contains the patches applied when vllm version is 0.9.0.
# - patch_main: contains the patches applied when vllm version is main branch.
# - patch_common: contains the patches applied in both 0.8.5 and main branch.
#
# In the future, with the vllm version upgrade, the new patch folder such as
# patch_0_8_5, patch_0_8_6, etc. will be added to manage the patch for different
# vllm version. And the patch_common will contain the patches applied in all the
# vllm version.
# Once the vllm version is too old that vllm-ascend will not support, the related
# patch folder will be removed as well.
# - patch_common: contains the patches applied in both 0.9.0 and main branch.
#
# Once a new patch is added in vllm-ascend, please add the patch description into this file as well.
# ----------------------------------------------------------------------------------
4 changes: 2 additions & 2 deletions vllm_ascend/patch/platform/__init__.py
@@ -17,8 +17,8 @@
from vllm_ascend.utils import vllm_version_is

# Import specific patches for different versions
if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
from vllm_ascend.patch.platform import patch_0_8_5 # noqa: F401
if vllm_version_is("0.9.0"):
from vllm_ascend.patch.platform import patch_0_9_0 # noqa: F401
from vllm_ascend.patch.platform import patch_common # noqa: F401
else:
from vllm_ascend.patch.platform import patch_common # noqa: F401
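Both patch `__init__` modules (platform above, worker below) gate their imports on `vllm_version_is` from `vllm_ascend.utils`, which is not shown in this diff. A hypothetical sketch of such a helper, assuming it simply compares the installed vllm distribution version string:

```python
# Hypothetical sketch only; the real vllm_ascend.utils.vllm_version_is may differ.
from importlib.metadata import version


def vllm_version_is(target: str) -> bool:
    """True when the installed vllm release string matches `target` exactly."""
    return version("vllm") == target


# Example: pick the patch package for the pinned release vs. the main branch.
if vllm_version_is("0.9.0"):
    print("would import patch_0_9_0 and patch_common")
else:
    print("would import patch_common only")
```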
4 changes: 2 additions & 2 deletions vllm_ascend/patch/worker/__init__.py
@@ -18,8 +18,8 @@
from vllm_ascend.utils import vllm_version_is

# Import specific patches for different versions
if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
from vllm_ascend.patch.worker import patch_0_8_5 # noqa: F401
if vllm_version_is("0.9.0"):
from vllm_ascend.patch.worker import patch_0_9_0 # noqa: F401
from vllm_ascend.patch.worker import patch_common # noqa: F401
else:
from vllm_ascend.patch.worker import patch_common # noqa: F401
17 changes: 5 additions & 12 deletions vllm_ascend/worker/model_runner.py
@@ -64,8 +64,6 @@
_init_attn_metadata_from_tensor_dict,
_init_sampling_metadata_from_tensor_dict)

from vllm_ascend.utils import vllm_version_is

if TYPE_CHECKING:
from vllm.attention.backends.abstract import AttentionBackend

@@ -1017,10 +1015,8 @@ def save_sharded_state(
pattern: Optional[str] = None,
max_size: Optional[int] = None,
) -> None:
if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
from vllm.model_executor.model_loader.loader import ShardedStateLoader # type: ignore[import] # isort: skip # noqa
else:
from vllm.model_executor.model_loader import ShardedStateLoader

from vllm.model_executor.model_loader import ShardedStateLoader
ShardedStateLoader.save_model(
self.model,
path,
@@ -1032,12 +1028,9 @@ def save_tensorized_model(
self,
tensorizer_config: TensorizerConfig,
) -> None:
if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
from vllm.model_executor.model_loader.loader import \
TensorizerLoader # type: ignore # noqa
else:
from vllm.model_executor.model_loader import \
TensorizerLoader # type: ignore # noqa

from vllm.model_executor.model_loader import \
TensorizerLoader # type: ignore # noqa
TensorizerLoader.save_model(
self.model,
tensorizer_config=tensorizer_config,