Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 0 additions & 40 deletions tests/ut/sample/logits_processor/test_builtin.py

This file was deleted.

2 changes: 1 addition & 1 deletion vllm_ascend/distributed/mooncake_layerwise_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from vllm.v1.kv_cache_interface import KVCacheConfig

from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.distributed.mooncake_connector import GET_META_MSG
from vllm_ascend.distributed.mooncake_transfer_engine import global_te
from vllm_ascend.distributed.utils import (align_memory,
get_transfer_timeout_value,
Expand All @@ -44,7 +45,6 @@
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
from vllm.v1.request import Request

GET_META_MSG = b"get_meta_msg"
DONE_SENDING_MSG = b"done_sending_msg"


Expand Down
3 changes: 1 addition & 2 deletions vllm_ascend/ops/triton/mamba/causal_conv1d.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,7 @@
import torch.nn.functional as F
import triton
import triton.language as tl

PAD_SLOT_ID = -1
from vllm.attention.backends.utils import PAD_SLOT_ID


def causal_conv1d_ref(
Expand Down
1 change: 0 additions & 1 deletion vllm_ascend/patch/worker/patch_bert.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@

# aclgraph does not support shift operator for now
# TODO: revert me when aclgraph supports shift operator
TOKEN_TYPE_SHIFT = 30
TOKEN_TYPE_MULTIPLIER = 1 << 30
TOKEN_MASK = TOKEN_TYPE_MULTIPLIER - 1

Expand Down
50 changes: 0 additions & 50 deletions vllm_ascend/sample/logits_processor/__init__.py

This file was deleted.

52 changes: 0 additions & 52 deletions vllm_ascend/sample/logits_processor/builtin.py

This file was deleted.

8 changes: 2 additions & 6 deletions vllm_ascend/sample/rejection_sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
import torch
from vllm.triton_utils import HAS_TRITON, triton
from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.sample.rejection_sampler import (GREEDY_TEMPERATURE,
from vllm.v1.sample.rejection_sampler import (GREEDY_TEMPERATURE, MAX_SPEC_LEN,
PLACEHOLDER_TOKEN_ID,
generate_uniform_probs)

from vllm_ascend.ops.triton.reject_sample import (
Expand All @@ -13,11 +14,6 @@
sample_recovered_tokens_kernel)
from vllm_ascend.sample.sampler import apply_top_k_top_p

PLACEHOLDER_TOKEN_ID = -1
# Maximum number of speculative draft tokens allowed per request in a single
# step. This value is chosen to be large enough to handle typical use cases.
MAX_SPEC_LEN = 32


def apply_sampling_constraints(
logits: torch.Tensor, # [num_tokens, vocab_size]
Expand Down
3 changes: 1 addition & 2 deletions vllm_ascend/spec_decode/eagle_proposer.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.spec_decode.eagle import PADDING_SLOT_ID
from vllm.v1.spec_decode.eagle import EagleProposer as VllmEagleProposer
from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
Expand All @@ -40,8 +41,6 @@
from vllm_ascend.ops.triton.triton_utils import get_vectorcore_num
from vllm_ascend.utils import shared_expert_dp_enabled

PADDING_SLOT_ID = -1

# Currently we will fix block size to a small one since `num_reqs` can't be too large
_PREPARE_INPUTS_BLOCK_SIZE = 4

Expand Down
3 changes: 1 addition & 2 deletions vllm_ascend/spec_decode/mtp_proposer.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.spec_decode.eagle import PADDING_SLOT_ID

from vllm_ascend.ascend_forward_context import set_ascend_forward_context
from vllm_ascend.attention.attention_v1 import AscendAttentionState
Expand All @@ -18,8 +19,6 @@
from vllm_ascend.spec_decode.eagle_proposer import EagleProposer
from vllm_ascend.utils import ProfileExecuteDuration, lmhead_tp_enable

PADDING_SLOT_ID = -1


class MtpProposer(EagleProposer):

Expand Down
2 changes: 1 addition & 1 deletion vllm_ascend/worker/model_runner_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
LogprobsLists, LogprobsTensors, ModelRunnerOutput,
SamplerOutput,
make_empty_encoder_model_runner_output)
from vllm.v1.sample.logits_processor import build_logitsprocs
from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.sample.rejection_sampler import RejectionSampler
from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
Expand Down Expand Up @@ -99,7 +100,6 @@
from vllm_ascend.eplb.utils import model_register
from vllm_ascend.ops.rotary_embedding import set_cos_and_sin, update_cos_sin
from vllm_ascend.patch.worker.patch_module import patch_torch_npu_argsort
from vllm_ascend.sample.logits_processor import build_logitsprocs
from vllm_ascend.sample.sampler import AscendSampler
from vllm_ascend.spec_decode import get_spec_decode_method
from vllm_ascend.spec_decode.eagle_proposer import EagleProposer
Expand Down
Loading