vllm-project · MengqingCao · Jan 7, 2026 · Dec 29, 2025 · Jan 4, 2026 · Jan 5, 2026
@@ -32,6 +32,7 @@
 from vllm.v1.kv_cache_interface import KVCacheConfig
 
 from vllm_ascend.ascend_config import get_ascend_config
+from vllm_ascend.distributed.mooncake_connector import GET_META_MSG
 from vllm_ascend.distributed.mooncake_transfer_engine import global_te
 from vllm_ascend.distributed.utils import (align_memory,
                                            get_transfer_timeout_value,
@@ -44,7 +45,6 @@
     from vllm.v1.core.kv_cache_manager import KVCacheBlocks
     from vllm.v1.request import Request
 
-GET_META_MSG = b"get_meta_msg"
 DONE_SENDING_MSG = b"done_sending_msg"
 
 

@@ -13,8 +13,7 @@
 import torch.nn.functional as F
 import triton
 import triton.language as tl
-
-PAD_SLOT_ID = -1
+from vllm.attention.backends.utils import PAD_SLOT_ID
 
 
 def causal_conv1d_ref(

@@ -20,7 +20,6 @@
 
 # aclgraph does not support shift operator for now
 # TODO: revert me when aclgraph supports shift operator
-TOKEN_TYPE_SHIFT = 30
 TOKEN_TYPE_MULTIPLIER = 1 << 30
 TOKEN_MASK = TOKEN_TYPE_MULTIPLIER - 1
 

@@ -4,7 +4,8 @@
 import torch
 from vllm.triton_utils import HAS_TRITON, triton
 from vllm.v1.sample.metadata import SamplingMetadata
-from vllm.v1.sample.rejection_sampler import (GREEDY_TEMPERATURE,
+from vllm.v1.sample.rejection_sampler import (GREEDY_TEMPERATURE, MAX_SPEC_LEN,
+                                              PLACEHOLDER_TOKEN_ID,
                                               generate_uniform_probs)
 
 from vllm_ascend.ops.triton.reject_sample import (
@@ -13,11 +14,6 @@
     sample_recovered_tokens_kernel)
 from vllm_ascend.sample.sampler import apply_top_k_top_p
 
-PLACEHOLDER_TOKEN_ID = -1
-# Maximum number of speculative draft tokens allowed per request in a single
-# step. This value is chosen to be large enough to handle typical use cases.
-MAX_SPEC_LEN = 32
-
 
 def apply_sampling_constraints(
     logits: torch.Tensor,  # [num_tokens, vocab_size]

@@ -21,6 +21,7 @@
 from vllm.v1.attention.backends.utils import CommonAttentionMetadata
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.sample.metadata import SamplingMetadata
+from vllm.v1.spec_decode.eagle import PADDING_SLOT_ID
 from vllm.v1.spec_decode.eagle import EagleProposer as VllmEagleProposer
 from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
 from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
@@ -40,8 +41,6 @@
 from vllm_ascend.ops.triton.triton_utils import get_vectorcore_num
 from vllm_ascend.utils import shared_expert_dp_enabled
 
-PADDING_SLOT_ID = -1
-
 # Currently we will fix block size to a small one since `num_reqs` can't be too large
 _PREPARE_INPUTS_BLOCK_SIZE = 4
 

@@ -9,6 +9,7 @@
 from vllm.v1.attention.backends.utils import CommonAttentionMetadata
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.sample.metadata import SamplingMetadata
+from vllm.v1.spec_decode.eagle import PADDING_SLOT_ID
 
 from vllm_ascend.ascend_forward_context import set_ascend_forward_context
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
@@ -18,8 +19,6 @@
 from vllm_ascend.spec_decode.eagle_proposer import EagleProposer
 from vllm_ascend.utils import ProfileExecuteDuration, lmhead_tp_enable
 
-PADDING_SLOT_ID = -1
-
 
 class MtpProposer(EagleProposer):
 

@@ -65,6 +65,7 @@
                              LogprobsLists, LogprobsTensors, ModelRunnerOutput,
                              SamplerOutput,
                              make_empty_encoder_model_runner_output)
+from vllm.v1.sample.logits_processor import build_logitsprocs
 from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.sample.rejection_sampler import RejectionSampler
 from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
@@ -99,7 +100,6 @@
 from vllm_ascend.eplb.utils import model_register
 from vllm_ascend.ops.rotary_embedding import set_cos_and_sin, update_cos_sin
 from vllm_ascend.patch.worker.patch_module import patch_torch_npu_argsort
-from vllm_ascend.sample.logits_processor import build_logitsprocs
 from vllm_ascend.sample.sampler import AscendSampler
 from vllm_ascend.spec_decode import get_spec_decode_method
 from vllm_ascend.spec_decode.eagle_proposer import EagleProposer