2 changes: 1 addition & 1 deletion .github/workflows/dockerfiles/Dockerfile.lint
@@ -27,7 +27,7 @@ RUN apt-get update -y && \

ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
# For lint purpose, actually we need make a main2main matching.
ARG VLLM_COMMIT=v0.19.0
ARG VLLM_COMMIT=5af684c31912232e5c89484c2e8259e0fac6c55b
RUN git init /vllm-workspace/vllm && \
git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
git -C /vllm-workspace/vllm checkout FETCH_HEAD
2 changes: 1 addition & 1 deletion .github/workflows/pr_test_full.yaml
@@ -80,7 +80,7 @@ jobs:
name: e2e-full
strategy:
matrix:
vllm_version: [v0.19.0]
vllm_version: [5af684c31912232e5c89484c2e8259e0fac6c55b, v0.19.0]
needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
uses: ./.github/workflows/_e2e_test.yaml
8 changes: 4 additions & 4 deletions .github/workflows/pr_test_light.yaml
@@ -41,7 +41,7 @@ jobs:
lint:
uses: ./.github/workflows/_pre_commit.yml
with:
vllm: v0.19.0
vllm: 5af684c31912232e5c89484c2e8259e0fac6c55b
changes:
runs-on: linux-aarch64-a2b3-0
outputs:
@@ -91,7 +91,7 @@ jobs:
if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
strategy:
matrix:
vllm_version: [v0.19.0]
vllm_version: [5af684c31912232e5c89484c2e8259e0fac6c55b, v0.19.0]
uses: ./.github/workflows/_unit_test.yaml
with:
vllm: ${{ matrix.vllm_version }}
@@ -103,7 +103,7 @@ jobs:
name: e2e-light
strategy:
matrix:
vllm_version: [v0.19.0]
vllm_version: [5af684c31912232e5c89484c2e8259e0fac6c55b, v0.19.0]
# Note (yikun): If CI resource are limited we can split job into two chain jobs
needs: [lint, changes]
# only trigger e2e test after lint passed and the change is e2e related with pull request.
@@ -116,4 +116,4 @@
type: light
secrets:
HW_OBS_AK: ${{ secrets.HW_OBS_AK }}
HW_OBS_SK: ${{ secrets.HW_OBS_SK }}
HW_OBS_SK: ${{ secrets.HW_OBS_SK }}
6 changes: 2 additions & 4 deletions Dockerfile
@@ -48,10 +48,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_COMMIT=v0.19.0
RUN git init /vllm-workspace/vllm && \
git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
git -C /vllm-workspace/vllm checkout FETCH_HEAD
ARG VLLM_TAG=v0.19.0
RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
6 changes: 2 additions & 4 deletions Dockerfile.310p
@@ -33,10 +33,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_COMMIT=v0.19.0
RUN git init /vllm-workspace/vllm && \
git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
git -C /vllm-workspace/vllm checkout FETCH_HEAD
ARG VLLM_TAG=v0.19.0
RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
6 changes: 2 additions & 4 deletions Dockerfile.310p.openEuler
@@ -32,10 +32,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_COMMIT=v0.19.0
RUN git init /vllm-workspace/vllm && \
git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
git -C /vllm-workspace/vllm checkout FETCH_HEAD
ARG VLLM_TAG=v0.19.0
RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
6 changes: 2 additions & 4 deletions Dockerfile.a3
@@ -50,10 +50,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_COMMIT=v0.19.0
RUN git init /vllm-workspace/vllm && \
git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
git -C /vllm-workspace/vllm checkout FETCH_HEAD
ARG VLLM_TAG=v0.19.0
RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
6 changes: 2 additions & 4 deletions Dockerfile.a3.openEuler
@@ -49,10 +49,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_COMMIT=v0.19.0
RUN git init /vllm-workspace/vllm && \
git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
git -C /vllm-workspace/vllm checkout FETCH_HEAD
ARG VLLM_TAG=v0.19.0
RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
6 changes: 2 additions & 4 deletions Dockerfile.openEuler
@@ -49,10 +49,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_COMMIT=v0.19.0
RUN git init /vllm-workspace/vllm && \
git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
git -C /vllm-workspace/vllm checkout FETCH_HEAD
ARG VLLM_TAG=v0.19.0
RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
12 changes: 12 additions & 0 deletions tests/ut/patch/worker/patch_common/test_patch_gdn_attn.py
@@ -8,11 +8,23 @@

import vllm_ascend.patch.worker.patch_gdn_attn as patch_gdn_attn
from vllm.config.compilation import CUDAGraphMode
from vllm.model_executor.layers.fla.ops import index as _fla_index
from vllm.v1.attention.backend import CommonAttentionMetadata
from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder
from vllm.v1.kv_cache_interface import MambaSpec


@pytest.fixture(autouse=True)
def _patch_triton_cdiv(monkeypatch):
if not hasattr(_fla_index.triton, "cdiv"):
monkeypatch.setattr(
_fla_index.triton,
"cdiv",
lambda a, b: (a + b - 1) // b,
raising=False,
)


@dataclass
class BatchSpec:
seq_lens: list[int]
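The autouse fixture above only installs a `cdiv` shim when the imported `triton` stub lacks one. For reference, a minimal standalone sketch of the same ceiling-division fallback — the function name and asserts here are illustrative, not part of the patch:

```python
import math


def cdiv(a: int, b: int) -> int:
    # Integer ceiling division via the usual (a + b - 1) // b idiom,
    # matching what the fixture patches in as a stand-in for triton.cdiv.
    return (a + b - 1) // b


assert cdiv(10, 4) == math.ceil(10 / 4) == 3  # rounds up
assert cdiv(12, 4) == 3                       # exact multiples are unchanged
```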
62 changes: 39 additions & 23 deletions vllm_ascend/_310p/fused_moe/fused_moe.py
@@ -27,6 +27,7 @@
from vllm_ascend.ops.fused_moe.moe_comm_method import FusedExpertsResult, _MoECommMethods
from vllm_ascend.ops.fused_moe.moe_runtime_args import build_fused_experts_input
from vllm_ascend.quantization.quant_type import QuantType
from vllm_ascend.utils import vllm_version_is

from .experts_selector import select_experts
from .moe_comm_method import AllGatherCommImpl310
@@ -36,6 +37,10 @@ class AscendUnquantizedFusedMoEMethod310(UnquantizedFusedMoEMethod):
def __init__(self, moe: FusedMoEConfig = None):
super().__init__(moe=moe)

@property
def is_monolithic(self) -> bool:
return False

def process_weights_after_loading(self, layer):
super().process_weights_after_loading(layer)

@@ -156,21 +161,20 @@ def __init__(self, *args, **kwargs):
self.quant_type = self.get_quant_type()

_MoECommMethods[MoECommType.ALLGATHER] = AllGatherCommImpl310(self.moe_config)
self.runner = self._init_runner()

def _init_runner(self):
from vllm_ascend.ops.fused_moe.fused_moe import AscendMoERunner

return AscendMoERunner(
layer=self,
moe_config=self.moe_config,
router=self.router,
routed_input_transform=self._routed_input_transform,
gate=self.gate,
shared_experts=self.shared_experts,
quant_method=self.quant_method,
reduce_results=self.reduce_results,
enable_dbo=self.vllm_config.parallel_config.enable_dbo,
is_legacy = vllm_version_is("0.19.0")
self.runner = AscendMoERunner(
self if is_legacy else self.layer_name,
self.moe_config,
self.router,
self._routed_input_transform,
self.gate if is_legacy else kwargs.pop("gate", None),
self.shared_experts if is_legacy else kwargs.pop("shared_experts", None),
self.quant_method,
self.reduce_results,
self.vllm_config.parallel_config.enable_dbo,
)

def init_experts_map(self, moe_config):
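The positional call above is easy to misread, so here is an illustrative sketch of the version-gated pattern it follows — `ToyRunner`, `version_is`, and `build_runner` are toy stand-ins, not the real vLLM/vllm-ascend signatures. On vLLM 0.19.0 the runner receives the layer object and the layer's own gate/shared-experts attributes, while on newer commits it receives the layer name and pulls gate/shared_experts out of the constructor kwargs:

```python
# Toy sketch of the version-gated construction; not real vllm/vllm_ascend APIs.
from typing import Any, Optional


def version_is(version: str) -> bool:
    # Stand-in for vllm_ascend.utils.vllm_version_is.
    return version == "0.19.0"


class ToyRunner:
    def __init__(self, layer_or_name: Any, gate: Optional[Any], shared_experts: Optional[Any]):
        self.layer_or_name = layer_or_name
        self.gate = gate
        self.shared_experts = shared_experts


def build_runner(layer: Any, kwargs: dict) -> ToyRunner:
    is_legacy = version_is("0.19.0")
    return ToyRunner(
        # Legacy vLLM takes the layer itself; newer vLLM takes its registered name.
        layer if is_legacy else getattr(layer, "layer_name", None),
        # Legacy reads gate/shared_experts from the layer; newer passes them as kwargs.
        getattr(layer, "gate", None) if is_legacy else kwargs.pop("gate", None),
        getattr(layer, "shared_experts", None) if is_legacy else kwargs.pop("shared_experts", None),
    )
```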
@@ -276,7 +280,23 @@ def __init__(
self._gate = gate
# Recreate runner after shared_experts/gate are set so custom op dispatch
# goes through moe_forward_shared.
self.runner = self._init_runner()
# NOTE: must use self._shared_experts here, not self.shared_experts —
# FusedMoE.shared_experts is a property that reads self.runner.shared_experts,
# which at this point is still the stale runner built with shared_experts=None.
from vllm_ascend.ops.fused_moe.fused_moe import AscendMoERunner

is_legacy = vllm_version_is("0.19.0")
self.runner = AscendMoERunner(
self if is_legacy else self.layer_name,
self.moe_config,
self.router,
self._routed_input_transform,
self.gate,
self._shared_experts,
self.quant_method,
self.reduce_results,
self.vllm_config.parallel_config.enable_dbo,
)

@property
def is_internal_router(self) -> bool:
@@ -288,20 +308,16 @@ def forward(
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]:
if self._shared_experts is None:
fused_out = AscendFusedMoE310.forward(
self,
hidden_states=hidden_states,
router_logits=router_logits,
)
shared_out = None
return shared_out, fused_out
shared_out, fused_out = AscendFusedMoE310.forward(
result = AscendFusedMoE310.forward(
self,
hidden_states=hidden_states,
router_logits=router_logits,
)
return shared_out, fused_out
# When shared experts are absent, the parent returns only fused_out;
# otherwise it returns a (shared_out, fused_out) tuple.
if self._shared_experts is None:
return None, result
return result

def _forward_shared_experts(self, hidden_states: torch.Tensor):
if self._shared_experts is None:
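The forward() change above relies on the parent returning different shapes depending on whether shared experts exist. A small hedged sketch of that normalization, with toy values standing in for the real tensors:

```python
from typing import Any, Optional, Tuple, Union


def normalize(result: Union[Any, Tuple[Any, Any]], has_shared_experts: bool) -> Tuple[Optional[Any], Any]:
    # The parent forward returns only fused_out when shared experts are absent,
    # and a (shared_out, fused_out) tuple when they are present; callers of the
    # wrapper always expect the pair.
    if not has_shared_experts:
        return None, result
    return result


assert normalize("fused", has_shared_experts=False) == (None, "fused")
assert normalize(("shared", "fused"), has_shared_experts=True) == ("shared", "fused")
```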