10 changes: 0 additions & 10 deletions vllm/model_executor/models/bailing_moe.py
@@ -47,7 +47,6 @@
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -485,7 +484,6 @@ def __init__(
         else:
             self.lm_head = PPMissingLayer()
 
-        self.sampler = get_sampler()
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)

@@ -512,14 +510,6 @@ def compute_logits(
                                        sampling_metadata)
         return logits
 
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(
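Note: every file in this diff loses the same boilerplate, namely a model-owned sampler built with get_sampler() and, where present, a sample() method that only delegated to it. Below is a minimal sketch of the removed pattern, reconstructed from the deletions above; SomeModelForCausalLM is a placeholder name, and the real base classes and layers are elided.

from typing import Optional

import torch

# The imports each diff in this PR removes.
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from vllm.model_executor.sampling_metadata import SamplingMetadata


class SomeModelForCausalLM:  # placeholder; real bases and layers elided

    def __init__(self):
        # The same line appeared in every affected model: each one
        # constructed its own sampler even though nothing about it
        # was model specific.
        self.sampler = get_sampler()

    def sample(
        self,
        logits: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[SamplerOutput]:
        # Pure delegation with no per-model logic, which is why the
        # method could be deleted wholesale wherever it appeared.
        next_tokens = self.sampler(logits, sampling_metadata)
        return next_tokens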
2 changes: 0 additions & 2 deletions vllm/model_executor/models/granite_speech.py
@@ -36,7 +36,6 @@
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.sampler import get_sampler
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
@@ -549,7 +548,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str):
         self.config = config
         self.quant_config = quant_config
         self.cache_config = cache_config
-        self.sampler = get_sampler()
 
         # The language model is typically a Granite LLM
         self.language_model = init_vllm_registered_model(
10 changes: 0 additions & 10 deletions vllm/model_executor/models/hunyuan_v1_moe.py
@@ -49,7 +49,6 @@
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
@@ -661,7 +660,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                     config.vocab_size,
                                                     logit_scale)
-            self.sampler = get_sampler()
         else:
             self.lm_head = PPMissingLayer()
 
@@ -685,14 +683,6 @@ def compute_logits(
                                        sampling_metadata)
         return logits
 
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def make_empty_intermediate_tensors(
             self, batch_size: int, dtype: torch.dtype,
             device: torch.device) -> IntermediateTensors:
2 changes: 0 additions & 2 deletions vllm/model_executor/models/mimo.py
@@ -36,7 +36,6 @@
 from vllm.distributed import get_pp_group
 from vllm.logger import init_logger
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.sampler import get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, maybe_remap_kv_scale_name)
@@ -176,7 +175,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             self.lm_head = PPMissingLayer()
 
         self.logits_processor = LogitsProcessor(config.vocab_size)
-        self.sampler = get_sampler()
 
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
11 changes: 0 additions & 11 deletions vllm/model_executor/models/mimo_mtp.py
@@ -30,7 +30,6 @@
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -161,8 +160,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.lm_head = ParallelLMHead(self.config.vocab_size,
                                       self.config.hidden_size)
 
-        self.sampler = get_sampler()
-
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -187,14 +184,6 @@ def compute_logits(
         return self.model.compute_logits(hidden_states, self.lm_head,
                                          sampling_metadata, spec_step_idx)
 
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
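One detail specific to mimo_mtp.py: the MTP (multi-token prediction) draft model keeps its extra spec_step_idx argument on the logits path, and only the sampling boilerplate goes away. A hedged sketch of the surviving call, with names taken from the unchanged context lines above (mtp_model and its caller are stand-ins, and whether spec_step_idx has a default is not visible in this diff):

# Sketch only: mtp_model stands in for the MiMo MTP model instance.
# Logits computation still routes through the inner model together
# with the speculative step index, as the context lines show.
logits = mtp_model.compute_logits(hidden_states, sampling_metadata,
                                  spec_step_idx)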
10 changes: 0 additions & 10 deletions vllm/model_executor/models/phi4flash.py
@@ -23,7 +23,6 @@
     causal_conv1d_fn, causal_conv1d_update)
 from vllm.model_executor.layers.mamba.ops.mamba_ssm import (
     selective_scan_fn, selective_state_update)
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.models.interfaces import (HasInnerState, IsHybrid,
@@ -641,7 +640,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                 config.vocab_size,
                                                 logits_as_input=False)
-        self.sampler = get_sampler()
 
     def forward(
         self,
@@ -709,14 +707,6 @@ def compute_logits(
             prune_hidden_states=prune_hidden_states)
         return processed_logits
 
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def load_weights(
         self,
         weights: Iterable[tuple[str, torch.Tensor]],
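Taken together, the removals leave token sampling entirely outside the model classes: each model's decode-path contract now ends at compute_logits, and next-token selection happens in whatever component owns the shared sampler, which this diff does not show. A rough, hedged sketch of the resulting division of labor; model and sampler are assumed stand-ins, not objects introduced by this PR.

# Hedged sketch of the post-removal flow. model is any of the six
# models touched above; sampler is an assumed engine-owned sampler,
# not something this diff defines. Argument lists are abbreviated.
hidden_states = model(input_ids, positions, intermediate_tensors)
logits = model.compute_logits(hidden_states, sampling_metadata)
sampled = sampler(logits, sampling_metadata)  # no longer a model method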