From d584121f7c6ea0b180d3e53087d287b320b2f481 Mon Sep 17 00:00:00 2001 From: xhx1022 <1737006628@qq.com> Date: Wed, 12 Nov 2025 05:40:54 +0000 Subject: [PATCH 01/33] merge main Signed-off-by: xhx1022 <1737006628@qq.com> --- vllm/config/model.py | 2 + vllm/config/vllm.py | 1 + vllm/engine/arg_utils.py | 4 + vllm/entrypoints/llm.py | 3 + vllm/model_executor/layers/fused_moe/layer.py | 16 ++ .../fused_moe/routed_experts_capturer.py | 215 ++++++++++++++++++ vllm/outputs.py | 3 + vllm/v1/core/sched/scheduler.py | 35 ++- vllm/v1/engine/__init__.py | 4 +- vllm/v1/engine/output_processor.py | 9 +- vllm/v1/worker/gpu_model_runner.py | 28 +++ 11 files changed, 315 insertions(+), 5 deletions(-) create mode 100644 vllm/model_executor/layers/fused_moe/routed_experts_capturer.py diff --git a/vllm/config/model.py b/vllm/config/model.py index 249fb5668b4f..da3a4618c089 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -198,6 +198,8 @@ class ModelConfig: graph and always execute the model in eager mode. If False, we will use CUDA graph and eager execution in hybrid for maximal performance and flexibility.""" + enable_return_routed_experts: bool = False + """Whether to return routed experts.""" max_logprobs: int = 20 """Maximum number of log probabilities to return when `logprobs` is specified in `SamplingParams`. 
The default value comes the default for the diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 9eaa55f1fdbe..124ae6416866 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -1352,6 +1352,7 @@ def __str__(self): f"disable_custom_all_reduce={self.parallel_config.disable_custom_all_reduce}, " # noqa f"quantization={self.model_config.quantization}, " f"enforce_eager={self.model_config.enforce_eager}, " + f"enable_return_routed_experts={self.model_config.enable_return_routed_experts}, " f"kv_cache_dtype={self.cache_config.cache_dtype}, " f"device_config={self.device_config.device}, " f"structured_outputs_config={self.structured_outputs_config!r}, " diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 94608b13dfd4..98681c8e2413 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -354,6 +354,7 @@ class EngineArgs: """Arguments for vLLM engine.""" model: str = ModelConfig.model + enable_return_routed_experts: bool = ModelConfig.enable_return_routed_experts model_weights: str = ModelConfig.model_weights served_model_name: str | list[str] | None = ModelConfig.served_model_name tokenizer: str | None = ModelConfig.tokenizer @@ -453,6 +454,7 @@ class EngineArgs: quantization: QuantizationMethods | None = ModelConfig.quantization allow_deprecated_quantization: bool = ModelConfig.allow_deprecated_quantization enforce_eager: bool = ModelConfig.enforce_eager + enable_return_routed_experts: bool = ModelConfig.enable_return_routed_experts disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce limit_mm_per_prompt: dict[str, int | dict[str, int]] = get_field( MultiModalConfig, "limit_per_prompt" @@ -657,6 +659,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: **model_kwargs["allow_deprecated_quantization"], ) model_group.add_argument("--enforce-eager", **model_kwargs["enforce_eager"]) + model_group.add_argument("--enable-return-routed-experts", 
**model_kwargs["enable_return_routed_experts"]) model_group.add_argument("--max-logprobs", **model_kwargs["max_logprobs"]) model_group.add_argument("--logprobs-mode", **model_kwargs["logprobs_mode"]) model_group.add_argument( @@ -1239,6 +1242,7 @@ def create_model_config(self) -> ModelConfig: quantization=self.quantization, allow_deprecated_quantization=self.allow_deprecated_quantization, enforce_eager=self.enforce_eager, + enable_return_routed_experts=self.enable_return_routed_experts, max_logprobs=self.max_logprobs, logprobs_mode=self.logprobs_mode, disable_sliding_window=self.disable_sliding_window, diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 9c10e28c22de..a7aa9a569436 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -158,6 +158,7 @@ class LLM: enforce_eager: Whether to enforce eager execution. If True, we will disable CUDA graph and always execute the model in eager mode. If False, we will use CUDA graph and eager execution in hybrid. + enable_return_routed_experts: Whether to return routed experts. disable_custom_all_reduce: See [ParallelConfig][vllm.config.ParallelConfig]. 
hf_token: The token to use as HTTP bearer authorization for remote files @@ -209,6 +210,7 @@ def __init__( swap_space: float = 4, cpu_offload_gb: float = 0, enforce_eager: bool = False, + enable_return_routed_experts: bool = False, disable_custom_all_reduce: bool = False, hf_token: bool | str | None = None, hf_overrides: HfOverrides | None = None, @@ -317,6 +319,7 @@ def _make_config(value: Any, cls: type[_R]) -> _R: swap_space=swap_space, cpu_offload_gb=cpu_offload_gb, enforce_eager=enforce_eager, + enable_return_routed_experts=enable_return_routed_experts, disable_custom_all_reduce=disable_custom_all_reduce, hf_token=hf_token, hf_overrides=hf_overrides, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 60e8ef9f77fd..cc64b8a07b69 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -48,6 +48,9 @@ direct_register_custom_op, ) from vllm.v1.worker.ubatching import dbo_current_ubatch_id +from vllm.model_executor.layers.fused_moe.routed_experts_capturer import ( + RoutedExpertsCapturer +) if current_platform.is_cuda_alike(): from .fused_moe import eplb_map_to_physical_and_record @@ -435,6 +438,8 @@ def __init__( raise ValueError("Duplicate layer name: {}".format(prefix)) compilation_config.static_forward_context[prefix] = self self.layer_name = prefix + from vllm.model_executor.models.utils import extract_layer_index + self.layer_id = extract_layer_index(self.layer_name) self.enable_eplb = enable_eplb self.expert_load_view: torch.Tensor | None = None @@ -701,6 +706,10 @@ def maybe_init_modular_kernel(self) -> None: def shared_experts(self) -> torch.nn.Module | None: return None + @property + def get_layer_id(self): + return self.layer_id + @property def gate(self) -> torch.nn.Module | None: return None @@ -1650,6 +1659,13 @@ def valid_grouping() -> bool: assert topk_ids.dtype == indices_type or indices_type is None + capturer = 
RoutedExpertsCapturer.get_instance() + if capturer is not None: + capturer.capture( # noqa + layer_id=self.layer_id, + topk_ids=topk_ids, + ) + return topk_weights, topk_ids def must_reduce_shared_expert_outputs(self) -> bool: diff --git a/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py new file mode 100644 index 000000000000..551f2007071b --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py @@ -0,0 +1,215 @@ +import logging +from abc import ABC +import torch +from vllm.config import ModelConfig +from multiprocessing import shared_memory +import numpy as np +import fcntl +from unittest.mock import patch +logger = logging.getLogger(__name__) + +LOCK_FILE = "/tmp/vllm_routed_experts.lock" # Shared lock file path + +def lock_file(fp): + fcntl.flock(fp, fcntl.LOCK_EX) + +def unlock_file(fp): + fcntl.flock(fp, fcntl.LOCK_UN) + +_global_experts_capturer = None + +class RoutedExpertsCapturer(ABC): + @staticmethod + def create(enable: bool): + """Create a global singleton instance""" + global _global_experts_capturer + if _global_experts_capturer is not None: + raise RuntimeError("Experts capturer already created.") + + if enable: + _global_experts_capturer = _RoutedExpertsCapturerReal() + else: + _global_experts_capturer = _RoutedExpertsCapturerNoop() + return _global_experts_capturer + + @staticmethod + def get_instance(): + if _global_experts_capturer is None: + logger.info("Experts capturer not initialized.") + return _global_experts_capturer + + def init_buffer(self, max_num_batched_tokens: int, max_num_kv_tokens: int, model_config: ModelConfig): + raise NotImplementedError + + def capture(self, layer_id: int, topk_ids: torch.Tensor): + raise NotImplementedError + + def clear_buffer(self): + raise NotImplementedError + + def save_captured_experts(self, indices: np.ndarray): + raise NotImplementedError + + +class 
_RoutedExpertsCapturerReal(RoutedExpertsCapturer): + """Capturer for routed experts with host buffer""" + def __init__(self): + self._experts_capturer_device_buffer = None + + def init_buffer(self, max_num_batched_tokens: int, max_num_kv_tokens: int, model_config: ModelConfig, enable_shared_memory: bool): + if ( + model_config.enable_return_routed_experts + and self._experts_capturer_device_buffer is None + ): + self._experts_capturer_device_buffer = torch.zeros( + ( + max_num_batched_tokens, + model_config.hf_text_config.num_hidden_layers, + model_config.hf_text_config.num_experts_per_tok, + ), + dtype=torch.int32, + device="cuda", + ) + + if enable_shared_memory: + # Compute required shared memory size + shape = ( + max_num_kv_tokens, + model_config.hf_text_config.num_hidden_layers, + model_config.hf_text_config.num_experts_per_tok, + ) + nbytes = np.prod(shape) * np.dtype(np.int32).itemsize + + # 创建共享内存 + with open(LOCK_FILE, "wb") as lockfp: + self._shm = shared_memory.SharedMemory( + create=True, + size=nbytes, + name="vllm_routed_experts_buffer" # Fixed name for worker access + ) + + # 创建 numpy array 视图 + self._host_buffer_view = np.ndarray( + shape, dtype=np.int32, buffer=self._shm.buf + ) + self._host_buffer_view.fill(0) + + logger.debug( + f"Created shared memory buffer '{self._shm.name}' " + f"with shape {shape}" + ) + else: + self._shm = None + self._host_buffer_view = None + + def capture(self, layer_id: int, topk_ids: torch.Tensor): + if self._experts_capturer_device_buffer is None: + raise RuntimeError("Buffer not initialized.") + batch_size, num_routed_experts = topk_ids.shape + self._experts_capturer_device_buffer[:batch_size, layer_id, : ] = topk_ids + + def clear_buffer(self): + self._experts_capturer_device_buffer.zero_() + + + def save_captured_experts(self, indices: np.ndarray): + # Copy the entire batch from GPU to shared memory (via numpy view) + with open(LOCK_FILE, "wb+") as fp: + if self._host_buffer_view is not None: + num_tokens = 
len(indices) + data = self._experts_capturer_device_buffer[:num_tokens, :, :].cpu().numpy() + self._host_buffer_view[indices, :, :] = data + + def __del__(self): + """Clean up shared memory""" + if self._shm is not None: + self._shm.close() + self._shm.unlink() # Delete shared memory + + + +class _RoutedExpertsCapturerNoop(RoutedExpertsCapturer): + def init_buffer(self, max_num_batched_tokens: int, max_num_kv_tokens: int, model_config: ModelConfig, enable_shared_memory: bool): + pass + + def capture(self, layer_id: int, topk_ids: torch.Tensor): + pass + + def clear_buffer(self): + pass + + def save_captured_experts(self, indices: np.ndarray): + pass + + +_global_experts_reader = None + +class RoutedExpertsReader(ABC): + @staticmethod + def create(enable: bool): + """Create a global singleton instance""" + global _global_experts_reader + if _global_experts_reader is not None: + raise RuntimeError("Experts Reader already created.") + + if enable: + _global_experts_reader = _RoutedExpertsReaderReal() + else: + _global_experts_reader = _RoutedExpertsReaderNoop() + return _global_experts_reader + + @staticmethod + def get_instance(): + if _global_experts_reader is None: + logger.info("Experts reader not initialized.") + # raise RuntimeError("Experts reader not initialized.") + return _global_experts_reader + + def get_routed_experts(self,num_tokens: int): + raise NotImplementedError + + +class _RoutedExpertsReaderReal: + """Reader class in worker process""" + def __init__(self): + self._shm = None + + def attach_buffer(self, max_num_kv_tokens: int, model_config: ModelConfig): + if self._shm is None: + shape = ( + max_num_kv_tokens, + model_config.hf_text_config.num_hidden_layers, + model_config.hf_text_config.num_experts_per_tok, + ) + + # Attach to existing shared memory + with open(LOCK_FILE, "rb+") as fp: + with patch( + "multiprocessing.resource_tracker.register", + lambda *args, **kwargs: None, + ): + self._shm = shared_memory.SharedMemory( + 
name="vllm_routed_experts_buffer" + ) + + self._host_buffer_view = np.ndarray( + shape, dtype=np.int32, buffer=self._shm.buf + ) + + def get_routed_experts(self, indices: np.ndarray): + """Read data from shared memory, return routed experts for the corresponding request""" + with open(LOCK_FILE, "rb+") as fp: + if self._host_buffer_view is None: + raise RuntimeError("Buffer not attached.") + return self._host_buffer_view[indices, :, :].copy() + + def __del__(self): + """Only close, do not delete shared memory""" + if self._shm is not None: + self._shm.close() # Note: reader does not call unlink() + +class _RoutedExpertsReaderNoop: + def attach_buffer(self, max_num_kv_tokens: int, model_config: ModelConfig): + pass + def get_routed_experts(self, indices: np.ndarray): + return None \ No newline at end of file diff --git a/vllm/outputs.py b/vllm/outputs.py index 74e534ef0c07..f28b1c067a2f 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -7,6 +7,7 @@ from typing import Any, Generic import torch +import numpy as np from typing_extensions import TypeVar from vllm.logger import init_logger @@ -40,6 +41,7 @@ class CompletionOutput: index: int text: str token_ids: GenericSequence[int] + routed_experts: np.ndarray | None # [seq_len,layer_num,topk] cumulative_logprob: float | None logprobs: SampleLogprobs | None finish_reason: str | None = None @@ -54,6 +56,7 @@ def __repr__(self) -> str: f"CompletionOutput(index={self.index}, " f"text={self.text!r}, " f"token_ids={self.token_ids}, " + f"routed_experts={self.routed_experts}, " f"cumulative_logprob={self.cumulative_logprob}, " f"logprobs={self.logprobs}, " f"finish_reason={self.finish_reason}, " diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 9a2a589cfae4..7b3a90174ce9 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -1,5 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import torch 
+import numpy as np +import json +import os + import itertools import time from collections import defaultdict @@ -53,7 +58,7 @@ from vllm.v1.spec_decode.metrics import SpecDecodingStats from vllm.v1.structured_output import StructuredOutputManager from vllm.v1.utils import record_function_or_nullcontext - +from vllm.model_executor.layers.fused_moe.routed_experts_capturer import RoutedExpertsReader logger = init_logger(__name__) @@ -224,6 +229,10 @@ def __init__( if self.log_stats and vllm_config.observability_config.enable_mfu_metrics: self.perf_metrics = ModelMetrics(vllm_config) + self.max_num_kv_tokens = ((kv_cache_config.num_blocks // len(kv_cache_config.kv_cache_groups)) + 1)* self.block_size + self.routed_experts_reader = RoutedExpertsReader.create(enable=self.vllm_config.model_config.enable_return_routed_experts) + self.routed_experts_reader.attach_buffer(max_num_kv_tokens=self.max_num_kv_tokens, model_config=self.vllm_config.model_config) + def schedule(self) -> SchedulerOutput: # NOTE(woosuk) on the scheduling algorithm: # There's no "decoding phase" nor "prefill phase" in the scheduler. 
@@ -1162,7 +1171,30 @@ def update_from_output( request.status = RequestStatus.FINISHED_STOPPED stopped = True + routed_experts = None if stopped: + if self.vllm_config.model_config.enable_return_routed_experts: + assert len(self.kv_cache_config.kv_cache_groups) == 1 + + kv_blocks = self.kv_cache_manager.get_blocks(request.request_id) + block_ids = kv_blocks.get_block_ids()[0] + num_tokens = request.num_tokens-1 + + # 计算slot mapping + block_ids_array = np.array(block_ids, dtype=np.int32) + num_blocks = len(block_ids) + block_size = self.block_size + + # 生成block内偏移 + block_offsets = np.arange(0, block_size) + + # 计算slot mapping: slot = block_id * block_size + offset + slot_mapping = ( + block_offsets.reshape((1, block_size)) + + block_ids_array.reshape((num_blocks, 1)) * block_size + ).flatten()[:num_tokens] + + routed_experts = self.routed_experts_reader.get_routed_experts(slot_mapping) kv_transfer_params = self._free_request(request) if status_before_stop == RequestStatus.RUNNING: stopped_running_reqs.add(request) @@ -1209,6 +1241,7 @@ def update_from_output( kv_transfer_params=kv_transfer_params, trace_headers=request.trace_headers, num_cached_tokens=request.num_cached_tokens, + routed_experts=routed_experts, num_nans_in_logits=request.num_nans_in_logits, ) ) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 27d34f1c60da..9410c9a71ec6 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -8,7 +8,7 @@ import msgspec import torch - +import numpy as np from vllm.lora.request import LoRARequest from vllm.multimodal.inputs import MultiModalFeatureSpec from vllm.pooling_params import PoolingParams @@ -139,7 +139,7 @@ class EngineCoreOutput( trace_headers: Mapping[str, str] | None = None # The number of tokens with prefix cache hits. num_cached_tokens: int = 0 - + routed_experts: np.ndarray | None = None # The number of NaNs in logits. # A value greater than 0 indicates that the output is corrupted. 
num_nans_in_logits: int = 0 diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 7f762bcbb072..61ccbb6d3d1a 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -8,7 +8,7 @@ from typing import Any, cast import torch - +import numpy as np from vllm.lora.request import LoRARequest from vllm.outputs import ( CompletionOutput, @@ -213,6 +213,7 @@ def make_request_output( finish_reason: FinishReason | None, stop_reason: int | str | None, kv_transfer_params: dict[str, Any] | None = None, + routed_experts: np.ndarray | None = None, ) -> RequestOutput | PoolingRequestOutput | None: finished = finish_reason is not None final_only = self.output_kind == RequestOutputKind.FINAL_ONLY @@ -264,7 +265,7 @@ def make_request_output( external_req_id = self.parent_req.external_req_id return self._new_request_output( - external_req_id, outputs, finished, kv_transfer_params + external_req_id, outputs, finished, kv_transfer_params,routed_experts ) def _new_request_output( @@ -316,6 +317,7 @@ def _new_completion_output( token_ids: list[int], finish_reason: FinishReason | None, stop_reason: int | str | None, + routed_experts: np.ndarray | None = None, ) -> CompletionOutput: assert self.detokenizer is not None assert self.logprobs_processor is not None @@ -336,6 +338,7 @@ def _new_completion_output( index=self.request_index, text=text, token_ids=token_ids, + routed_experts=routed_experts, logprobs=logprobs, cumulative_logprob=self.logprobs_processor.cumulative_logprob, finish_reason=str(finish_reason) if finished else None, @@ -527,6 +530,7 @@ def process_outputs( finish_reason = engine_core_output.finish_reason stop_reason = engine_core_output.stop_reason kv_transfer_params = engine_core_output.kv_transfer_params + routed_experts = engine_core_output.routed_experts req_state.num_cached_tokens = engine_core_output.num_cached_tokens req_state.is_prefilling = False @@ -552,6 +556,7 @@ def process_outputs( 
finish_reason, stop_reason, kv_transfer_params, + routed_experts ): if req_state.queue is not None: # AsyncLLM: put into queue for handling by generate(). diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 476a89bb7b25..7dfce7c87b2d 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -173,6 +173,9 @@ bind_kv_cache, sanity_check_mm_encoder_outputs, ) +from vllm.model_executor.layers.fused_moe.routed_experts_capturer import ( + RoutedExpertsCapturer) +from vllm.distributed import get_tensor_model_parallel_rank if TYPE_CHECKING: from vllm.model_executor.model_loader.tensorizer import TensorizerConfig @@ -1625,6 +1628,10 @@ def _get_block_table_and_slot_mapping(kv_cache_gid: int): blk_table_tensor = blk_table.get_device_tensor(num_reqs_padded) slot_mapping = blk_table.slot_mapping.gpu[:num_tokens_padded] + if self.model_config.enable_return_routed_experts: + assert len(self.kv_cache_config.kv_cache_groups) == 1 + self.slot_mapping = slot_mapping.cpu() + # Fill unused with -1. Needed for reshape_and_cache in full cuda # graph mode. `blk_table_tensor` -1 to match mamba PAD_SLOT_ID slot_mapping[num_tokens:num_tokens_padded].fill_(-1) @@ -3112,6 +3119,8 @@ def execute_model( "after execute_model() returns None." 
) + RoutedExpertsCapturer.get_instance().clear_buffer() + if scheduler_output.preempted_req_ids and has_kv_transfer_group(): get_kv_transfer_group().handle_preemptions( scheduler_output.preempted_req_ids @@ -3485,6 +3494,9 @@ def propose_draft_token_ids(sampled_token_ids): self.eplb_step() with record_function_or_nullcontext("gpu_model_runner: ModelRunnerOutput"): + if self.model_config.enable_return_routed_experts and get_tensor_model_parallel_rank() == 0: + RoutedExpertsCapturer.get_instance().save_captured_experts(indices=self.slot_mapping) + output = ModelRunnerOutput( req_ids=req_ids_output_copy, req_id_to_index=req_id_to_index_output_copy, @@ -5646,6 +5658,22 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: kv_transfer_group.register_kv_caches(kv_caches) kv_transfer_group.set_host_xfer_buffer_ops(copy_kv_blocks) + self.init_routed_experts_capturer() + + def init_routed_experts_capturer(self): + logger.info(f"Initializing routed experts capturer, enable_return_routed_experts: {self.model_config.enable_return_routed_experts}") + routed_experts_capturer = RoutedExpertsCapturer.create( + self.model_config.enable_return_routed_experts + ) + block_size = self.cache_config.block_size + self.max_num_kv_tokens = ((self.kv_cache_config.num_blocks // len(self.kv_cache_config.kv_cache_groups)) + 1) * block_size + routed_experts_capturer.init_buffer( + max_num_batched_tokens=self.scheduler_config.max_num_batched_tokens, + max_num_kv_tokens=self.max_num_kv_tokens, + model_config=self.model_config, + enable_shared_memory= get_tensor_model_parallel_rank() == 0 + ) + def may_add_encoder_only_layers_to_kv_cache_config(self) -> None: """ Add encoder-only layers to the KV cache config. 
From bf247fd184add13dcecad12078b73710056247c6 Mon Sep 17 00:00:00 2001 From: xhx1022 <1737006628@qq.com> Date: Fri, 7 Nov 2025 09:33:39 +0000 Subject: [PATCH 02/33] small fix Signed-off-by: xhx1022 <1737006628@qq.com> Signed-off-by: Jialin Ouyang Signed-off-by: xhx1022 <1737006628@qq.com> Signed-off-by: princepride Signed-off-by: xhx1022 <1737006628@qq.com> Signed-off-by: xhx1022 <1737006628@qq.com> Signed-off-by: xhx1022 <1737006628@qq.com> Signed-off-by: xhx1022 <1737006628@qq.com> Signed-off-by: xhx1022 <1737006628@qq.com> Signed-off-by: Jialin Ouyang Signed-off-by: princepride Signed-off-by: xhx1022 <1737006628@qq.com> Signed-off-by: xhx1022 <1737006628@qq.com> Signed-off-by: xhx1022 <1737006628@qq.com> Signed-off-by: xhx1022 <1737006628@qq.com> Signed-off-by: xhx1022 <1737006628@qq.com> --- vllm/config/vllm.py | 2 +- vllm/engine/arg_utils.py | 5 +- vllm/model_executor/layers/fused_moe/layer.py | 2 + .../fused_moe/routed_experts_capturer.py | 314 +++++++++++------- vllm/outputs.py | 2 +- vllm/v1/core/sched/scheduler.py | 34 +- vllm/v1/engine/__init__.py | 3 +- vllm/v1/engine/output_processor.py | 7 +- vllm/v1/worker/gpu_model_runner.py | 12 +- 9 files changed, 244 insertions(+), 137 deletions(-) diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 124ae6416866..e08cb0cd78a4 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -1352,7 +1352,7 @@ def __str__(self): f"disable_custom_all_reduce={self.parallel_config.disable_custom_all_reduce}, " # noqa f"quantization={self.model_config.quantization}, " f"enforce_eager={self.model_config.enforce_eager}, " - f"enable_return_routed_experts={self.model_config.enable_return_routed_experts}, " + f"enable_return_routed_experts={self.model_config.enable_return_routed_experts}, " # noqa f"kv_cache_dtype={self.cache_config.cache_dtype}, " f"device_config={self.device_config.device}, " f"structured_outputs_config={self.structured_outputs_config!r}, " diff --git a/vllm/engine/arg_utils.py 
b/vllm/engine/arg_utils.py index 98681c8e2413..5d57293558d2 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -454,7 +454,6 @@ class EngineArgs: quantization: QuantizationMethods | None = ModelConfig.quantization allow_deprecated_quantization: bool = ModelConfig.allow_deprecated_quantization enforce_eager: bool = ModelConfig.enforce_eager - enable_return_routed_experts: bool = ModelConfig.enable_return_routed_experts disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce limit_mm_per_prompt: dict[str, int | dict[str, int]] = get_field( MultiModalConfig, "limit_per_prompt" @@ -659,7 +658,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: **model_kwargs["allow_deprecated_quantization"], ) model_group.add_argument("--enforce-eager", **model_kwargs["enforce_eager"]) - model_group.add_argument("--enable-return-routed-experts", **model_kwargs["enable_return_routed_experts"]) + model_group.add_argument( + "--enable-return-routed-experts", **model_kwargs["enable_return_routed_experts"] + ) model_group.add_argument("--max-logprobs", **model_kwargs["max_logprobs"]) model_group.add_argument("--logprobs-mode", **model_kwargs["logprobs_mode"]) model_group.add_argument( diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index cc64b8a07b69..fd7b45acf93c 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -438,6 +438,8 @@ def __init__( raise ValueError("Duplicate layer name: {}".format(prefix)) compilation_config.static_forward_context[prefix] = self self.layer_name = prefix + + # Delayed import to avoid circular dependency from vllm.model_executor.models.utils import extract_layer_index self.layer_id = extract_layer_index(self.layer_name) diff --git a/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py index 
551f2007071b..26261200ce55 100644 --- a/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py +++ b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py @@ -1,26 +1,38 @@ +import fcntl import logging -from abc import ABC +from abc import ABC, abstractmethod +from multiprocessing import shared_memory +from typing import Optional +from unittest.mock import patch + +import numpy as np import torch + from vllm.config import ModelConfig -from multiprocessing import shared_memory -import numpy as np -import fcntl -from unittest.mock import patch + logger = logging.getLogger(__name__) LOCK_FILE = "/tmp/vllm_routed_experts.lock" # Shared lock file path + def lock_file(fp): fcntl.flock(fp, fcntl.LOCK_EX) + def unlock_file(fp): fcntl.flock(fp, fcntl.LOCK_UN) -_global_experts_capturer = None + +# Global singleton instances (annotated) +_global_experts_capturer: Optional["RoutedExpertsCapturer"] = None +_global_experts_reader: Optional["RoutedExpertsReader"] = None + class RoutedExpertsCapturer(ABC): + """Abstract interface for capturer (host side).""" + @staticmethod - def create(enable: bool): + def create(enable: bool) -> "RoutedExpertsCapturer": """Create a global singleton instance""" global _global_experts_capturer if _global_experts_capturer is not None: @@ -33,30 +45,49 @@ def create(enable: bool): return _global_experts_capturer @staticmethod - def get_instance(): + def get_instance() -> Optional["RoutedExpertsCapturer"]: if _global_experts_capturer is None: logger.info("Experts capturer not initialized.") return _global_experts_capturer - def init_buffer(self, max_num_batched_tokens: int, max_num_kv_tokens: int, model_config: ModelConfig): + @abstractmethod + def init_buffer( + self, + max_num_batched_tokens: int, + max_num_kv_tokens: int, + model_config: ModelConfig, + enable_shared_memory: bool, + ) -> None: raise NotImplementedError - def capture(self, layer_id: int, topk_ids: torch.Tensor): + @abstractmethod + def capture(self, layer_id: int, 
topk_ids: torch.Tensor) -> None: raise NotImplementedError - def clear_buffer(self): + @abstractmethod + def clear_buffer(self) -> None: raise NotImplementedError - def save_captured_experts(self, indices: np.ndarray): + @abstractmethod + def save_captured_experts(self, indices: np.ndarray) -> None: raise NotImplementedError class _RoutedExpertsCapturerReal(RoutedExpertsCapturer): """Capturer for routed experts with host buffer""" - def __init__(self): - self._experts_capturer_device_buffer = None - def init_buffer(self, max_num_batched_tokens: int, max_num_kv_tokens: int, model_config: ModelConfig, enable_shared_memory: bool): + def __init__(self) -> None: + self._experts_capturer_device_buffer: torch.Tensor | None = None + self._shm: shared_memory.SharedMemory | None = None + self._host_buffer_view: np.ndarray | None = None + + def init_buffer( + self, + max_num_batched_tokens: int, + max_num_kv_tokens: int, + model_config: ModelConfig, + enable_shared_memory: bool, + ) -> None: if ( model_config.enable_return_routed_experts and self._experts_capturer_device_buffer is None @@ -72,81 +103,111 @@ def init_buffer(self, max_num_batched_tokens: int, max_num_kv_tokens: int, model ) if enable_shared_memory: - # Compute required shared memory size - shape = ( - max_num_kv_tokens, - model_config.hf_text_config.num_hidden_layers, - model_config.hf_text_config.num_experts_per_tok, - ) - nbytes = np.prod(shape) * np.dtype(np.int32).itemsize - - # 创建共享内存 - with open(LOCK_FILE, "wb") as lockfp: - self._shm = shared_memory.SharedMemory( - create=True, - size=nbytes, - name="vllm_routed_experts_buffer" # Fixed name for worker access - ) - - # 创建 numpy array 视图 - self._host_buffer_view = np.ndarray( - shape, dtype=np.int32, buffer=self._shm.buf - ) - self._host_buffer_view.fill(0) - - logger.debug( - f"Created shared memory buffer '{self._shm.name}' " - f"with shape {shape}" + # Compute required shared memory size + shape = ( + max_num_kv_tokens, + 
model_config.hf_text_config.num_hidden_layers, + model_config.hf_text_config.num_experts_per_tok, + ) + nbytes = int(np.prod(shape)) * np.dtype(np.int32).itemsize + + # 创建共享内存 + with open(LOCK_FILE, "wb") as fp: + lock_file(fp) + try: + # If already exists, SharedMemory(create=True) would raise. + # We assume capturer creates it first. + self._shm = shared_memory.SharedMemory( + create=True, + size=nbytes, + name="vllm_routed_experts_buffer", + ) + + # 创建 numpy array 视图 + self._host_buffer_view = np.ndarray( + shape, dtype=np.int32, buffer=self._shm.buf + ) + # 初始化为 0 + self._host_buffer_view.fill(0) + finally: + unlock_file(fp) + + # parameterized logging (avoid f-strings in logging) + logger.debug( + "Created shared memory buffer '%s' with shape %s", + self._shm.name if self._shm is not None else "None", + shape, ) else: self._shm = None self._host_buffer_view = None - def capture(self, layer_id: int, topk_ids: torch.Tensor): + def capture(self, layer_id: int, topk_ids: torch.Tensor) -> None: if self._experts_capturer_device_buffer is None: raise RuntimeError("Buffer not initialized.") batch_size, num_routed_experts = topk_ids.shape - self._experts_capturer_device_buffer[:batch_size, layer_id, : ] = topk_ids - - def clear_buffer(self): - self._experts_capturer_device_buffer.zero_() + # copy into device buffer (ensure shapes are compatible) + self._experts_capturer_device_buffer[:batch_size, layer_id, :] = topk_ids + def clear_buffer(self) -> None: + if self._experts_capturer_device_buffer is not None: + self._experts_capturer_device_buffer.zero_() - def save_captured_experts(self, indices: np.ndarray): - # Copy the entire batch from GPU to shared memory (via numpy view) + def save_captured_experts(self, indices: np.ndarray) -> None: + # Copy the entire batch from GPU to shared memory (via numpy view) with open(LOCK_FILE, "wb+") as fp: - if self._host_buffer_view is not None: - num_tokens = len(indices) - data = self._experts_capturer_device_buffer[:num_tokens, :, 
:].cpu().numpy() - self._host_buffer_view[indices, :, :] = data - - def __del__(self): - """Clean up shared memory""" - if self._shm is not None: - self._shm.close() - self._shm.unlink() # Delete shared memory - + lock_file(fp) + try: + if self._host_buffer_view is not None: + num_tokens = len(indices) + # Ensure device buffer exists + if self._experts_capturer_device_buffer is None: + raise RuntimeError("Device buffer not initialized.") + data = self._experts_capturer_device_buffer[ + :num_tokens, :, : + ].cpu().numpy() + # indices should be valid for host buffer + self._host_buffer_view[indices, :, :] = data + finally: + unlock_file(fp) + + def __del__(self) -> None: + """Clean up shared memory""" + try: + if self._shm is not None: + self._shm.close() + # Only creator should unlink + self._shm.unlink() + except Exception: + # Avoid raising in destructor + logger.debug("Exception during __del__ cleanup for capturer", exc_info=True) class _RoutedExpertsCapturerNoop(RoutedExpertsCapturer): - def init_buffer(self, max_num_batched_tokens: int, max_num_kv_tokens: int, model_config: ModelConfig, enable_shared_memory: bool): - pass + def init_buffer( + self, + max_num_batched_tokens: int, + max_num_kv_tokens: int, + model_config: ModelConfig, + enable_shared_memory: bool, + ) -> None: + return None - def capture(self, layer_id: int, topk_ids: torch.Tensor): - pass + def capture(self, layer_id: int, topk_ids: torch.Tensor) -> None: + return None - def clear_buffer(self): - pass - - def save_captured_experts(self, indices: np.ndarray): - pass + def clear_buffer(self) -> None: + return None + def save_captured_experts(self, indices: np.ndarray) -> None: + return None -_global_experts_reader = None class RoutedExpertsReader(ABC): + """Abstract interface for reader (worker side).""" + @staticmethod - def create(enable: bool): + def create(enable: bool) -> "RoutedExpertsReader": """Create a global singleton instance""" global _global_experts_reader if _global_experts_reader is 
not None: @@ -159,57 +220,82 @@ def create(enable: bool): return _global_experts_reader @staticmethod - def get_instance(): + def get_instance() -> Optional["RoutedExpertsReader"]: if _global_experts_reader is None: logger.info("Experts reader not initialized.") - # raise RuntimeError("Experts reader not initialized.") return _global_experts_reader - def get_routed_experts(self,num_tokens: int): + @abstractmethod + def attach_buffer(self, max_num_kv_tokens: int, model_config: ModelConfig) -> None: + raise NotImplementedError + + @abstractmethod + def get_routed_experts(self, indices: np.ndarray) -> np.ndarray | None: raise NotImplementedError -class _RoutedExpertsReaderReal: - """Reader class in worker process""" - def __init__(self): - self._shm = None - - def attach_buffer(self, max_num_kv_tokens: int, model_config: ModelConfig): - if self._shm is None: - shape = ( - max_num_kv_tokens, - model_config.hf_text_config.num_hidden_layers, - model_config.hf_text_config.num_experts_per_tok, - ) - - # Attach to existing shared memory +class _RoutedExpertsReaderReal(RoutedExpertsReader): + """Reader class in worker process""" + + def __init__(self) -> None: + self._shm: shared_memory.SharedMemory | None = None + self._host_buffer_view: np.ndarray | None = None + + def attach_buffer(self, max_num_kv_tokens: int, model_config: ModelConfig) -> None: + if self._shm is None: + shape = ( + max_num_kv_tokens, + model_config.hf_text_config.num_hidden_layers, + model_config.hf_text_config.num_experts_per_tok, + ) + + # Attach to existing shared memory with open(LOCK_FILE, "rb+") as fp: - with patch( - "multiprocessing.resource_tracker.register", - lambda *args, **kwargs: None, - ): - self._shm = shared_memory.SharedMemory( - name="vllm_routed_experts_buffer" - ) - - self._host_buffer_view = np.ndarray( - shape, dtype=np.int32, buffer=self._shm.buf - ) - - def get_routed_experts(self, indices: np.ndarray): - """Read data from shared memory, return routed experts for the 
corresponding request""" + lock_file(fp) + try: + # avoid resource_tracker registering the shared memory + with patch( + "multiprocessing.resource_tracker.register", + lambda *args, **kwargs: None, + ): + # This will raise if the shared memory doesn't exist + self._shm = shared_memory.SharedMemory( + name="vllm_routed_experts_buffer" + ) + + self._host_buffer_view = np.ndarray( + shape, dtype=np.int32, buffer=self._shm.buf + ) + finally: + unlock_file(fp) + + def get_routed_experts(self, indices: np.ndarray) -> np.ndarray | None: + """ + Read routed expert data from shared memory for the given request. + """ + with open(LOCK_FILE, "rb+") as fp: - if self._host_buffer_view is None: - raise RuntimeError("Buffer not attached.") - return self._host_buffer_view[indices, :, :].copy() - - def __del__(self): - """Only close, do not delete shared memory""" - if self._shm is not None: - self._shm.close() # Note: reader does not call unlink() - -class _RoutedExpertsReaderNoop: - def attach_buffer(self, max_num_kv_tokens: int, model_config: ModelConfig): - pass - def get_routed_experts(self, indices: np.ndarray): - return None \ No newline at end of file + lock_file(fp) + try: + if self._host_buffer_view is None: + raise RuntimeError("Buffer not attached.") + # Return a copy to avoid referencing shared memory buffer directly + return self._host_buffer_view[indices, :, :].copy() + finally: + unlock_file(fp) + + def __del__(self) -> None: + """Only close, do not delete shared memory""" + try: + if self._shm is not None: + self._shm.close() # Note: reader does not call unlink() + except Exception: + logger.debug("Exception during __del__ cleanup for reader", exc_info=True) + + +class _RoutedExpertsReaderNoop(RoutedExpertsReader): + def attach_buffer(self, max_num_kv_tokens: int, model_config: ModelConfig) -> None: + return None + + def get_routed_experts(self, indices: np.ndarray) -> np.ndarray | None: + return None diff --git a/vllm/outputs.py b/vllm/outputs.py index 
f28b1c067a2f..d4e5d86b3a06 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -6,8 +6,8 @@ from dataclasses import dataclass from typing import Any, Generic -import torch import numpy as np +import torch from typing_extensions import TypeVar from vllm.logger import init_logger diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 7b3a90174ce9..92099818498d 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -1,16 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import torch -import numpy as np -import json -import os - import itertools import time from collections import defaultdict from collections.abc import Iterable from typing import Any +import numpy as np from vllm import envs from vllm.compilation.cuda_graph import CUDAGraphStat from vllm.config import VllmConfig @@ -29,6 +25,9 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.routed_experts_capturer import ( + RoutedExpertsReader, +) from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.v1.core.encoder_cache_manager import ( EncoderCacheManager, @@ -224,14 +223,21 @@ def __init__( ) self.use_pp = self.parallel_config.pipeline_parallel_size > 1 self.use_v2_model_runner = envs.VLLM_USE_V2_MODEL_RUNNER - self.perf_metrics: ModelMetrics | None = None if self.log_stats and vllm_config.observability_config.enable_mfu_metrics: self.perf_metrics = ModelMetrics(vllm_config) - self.max_num_kv_tokens = ((kv_cache_config.num_blocks // len(kv_cache_config.kv_cache_groups)) + 1)* self.block_size - self.routed_experts_reader = RoutedExpertsReader.create(enable=self.vllm_config.model_config.enable_return_routed_experts) - 
self.routed_experts_reader.attach_buffer(max_num_kv_tokens=self.max_num_kv_tokens, model_config=self.vllm_config.model_config) + self.max_num_kv_tokens = ( + (kv_cache_config.num_blocks // len(kv_cache_config.kv_cache_groups) + 1) + * self.block_size + ) + self.routed_experts_reader = RoutedExpertsReader.create( + enable=self.vllm_config.model_config.enable_return_routed_experts + ) + self.routed_experts_reader.attach_buffer( + max_num_kv_tokens=self.max_num_kv_tokens, + model_config=self.vllm_config.model_config, + ) def schedule(self) -> SchedulerOutput: # NOTE(woosuk) on the scheduling algorithm: @@ -1180,21 +1186,23 @@ def update_from_output( block_ids = kv_blocks.get_block_ids()[0] num_tokens = request.num_tokens-1 - # 计算slot mapping + # compute slot mapping block_ids_array = np.array(block_ids, dtype=np.int32) num_blocks = len(block_ids) block_size = self.block_size - # 生成block内偏移 + # generate block offsets block_offsets = np.arange(0, block_size) - # 计算slot mapping: slot = block_id * block_size + offset + # compute slot mapping: slot = block_id * block_size + offset slot_mapping = ( block_offsets.reshape((1, block_size)) + block_ids_array.reshape((num_blocks, 1)) * block_size ).flatten()[:num_tokens] - routed_experts = self.routed_experts_reader.get_routed_experts(slot_mapping) + routed_experts = self.routed_experts_reader.get_routed_experts( + indices=slot_mapping + ) kv_transfer_params = self._free_request(request) if status_before_stop == RequestStatus.RUNNING: stopped_running_reqs.add(request) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 9410c9a71ec6..0ffb97206c66 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -7,8 +7,9 @@ from typing import Any import msgspec -import torch import numpy as np +import torch + from vllm.lora.request import LoRARequest from vllm.multimodal.inputs import MultiModalFeatureSpec from vllm.pooling_params import PoolingParams diff --git 
a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 61ccbb6d3d1a..f0ad0ae94da2 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -7,9 +7,10 @@ from dataclasses import dataclass from typing import Any, cast -import torch import numpy as np from vllm.lora.request import LoRARequest +import torch + from vllm.outputs import ( CompletionOutput, PoolingOutput, @@ -254,7 +255,9 @@ def make_request_output( finished, ) - output = self._new_completion_output(new_token_ids, finish_reason, stop_reason) + output = self._new_completion_output( + new_token_ids, finish_reason, stop_reason, routed_experts + ) if self.parent_req is None: outputs = [output] diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 7dfce7c87b2d..464bd89097b1 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1630,7 +1630,7 @@ def _get_block_table_and_slot_mapping(kv_cache_gid: int): if self.model_config.enable_return_routed_experts: assert len(self.kv_cache_config.kv_cache_groups) == 1 - self.slot_mapping = slot_mapping.cpu() + self.slot_mapping = slot_mapping.cpu().numpy() # Fill unused with -1. Needed for reshape_and_cache in full cuda # graph mode. 
`blk_table_tensor` -1 to match mamba PAD_SLOT_ID @@ -3494,7 +3494,10 @@ def propose_draft_token_ids(sampled_token_ids): self.eplb_step() with record_function_or_nullcontext("gpu_model_runner: ModelRunnerOutput"): - if self.model_config.enable_return_routed_experts and get_tensor_model_parallel_rank() == 0: + if ( + self.model_config.enable_return_routed_experts + and get_tensor_model_parallel_rank() == 0 + ): RoutedExpertsCapturer.get_instance().save_captured_experts(indices=self.slot_mapping) output = ModelRunnerOutput( @@ -5661,7 +5664,10 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: self.init_routed_experts_capturer() def init_routed_experts_capturer(self): - logger.info(f"Initializing routed experts capturer, enable_return_routed_experts: {self.model_config.enable_return_routed_experts}") + logger.info( + "Initializing routed experts capturer, enable_return_routed_experts: %s", + self.model_config.enable_return_routed_experts, + ) routed_experts_capturer = RoutedExpertsCapturer.create( self.model_config.enable_return_routed_experts ) From 411ea4331b96bf2fb8ad7560486ba910e508d804 Mon Sep 17 00:00:00 2001 From: xhx1022 <1737006628@qq.com> Date: Fri, 7 Nov 2025 12:46:30 +0000 Subject: [PATCH 03/33] small fixs Signed-off-by: xhx1022 <1737006628@qq.com> Signed-off-by: xhx1022 <1737006628@qq.com> Signed-off-by: xhx1022 <1737006628@qq.com> Signed-off-by: xhx1022 <1737006628@qq.com> Signed-off-by: xhx1022 <1737006628@qq.com> --- vllm/config/vllm.py | 2 +- vllm/engine/arg_utils.py | 3 +- vllm/model_executor/layers/fused_moe/layer.py | 1 + .../fused_moe/routed_experts_capturer.py | 10 ++++-- vllm/outputs.py | 2 +- vllm/v1/core/sched/scheduler.py | 34 +++++++++---------- vllm/v1/engine/output_processor.py | 2 +- vllm/v1/worker/gpu_model_runner.py | 11 ++++-- 8 files changed, 38 insertions(+), 27 deletions(-) diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index e08cb0cd78a4..63bfd056bf31 100644 --- a/vllm/config/vllm.py +++ 
b/vllm/config/vllm.py @@ -1352,7 +1352,7 @@ def __str__(self): f"disable_custom_all_reduce={self.parallel_config.disable_custom_all_reduce}, " # noqa f"quantization={self.model_config.quantization}, " f"enforce_eager={self.model_config.enforce_eager}, " - f"enable_return_routed_experts={self.model_config.enable_return_routed_experts}, " # noqa + f"enable_return_routed_experts={self.model_config.enable_return_routed_experts}, " # noqa f"kv_cache_dtype={self.cache_config.cache_dtype}, " f"device_config={self.device_config.device}, " f"structured_outputs_config={self.structured_outputs_config!r}, " diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 5d57293558d2..7631cd61d1a2 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -659,7 +659,8 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: ) model_group.add_argument("--enforce-eager", **model_kwargs["enforce_eager"]) model_group.add_argument( - "--enable-return-routed-experts", **model_kwargs["enable_return_routed_experts"] + "--enable-return-routed-experts", + **model_kwargs["enable_return_routed_experts"], ) model_group.add_argument("--max-logprobs", **model_kwargs["max_logprobs"]) model_group.add_argument("--logprobs-mode", **model_kwargs["logprobs_mode"]) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index fd7b45acf93c..0b9540898df6 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -441,6 +441,7 @@ def __init__( # Delayed import to avoid circular dependency from vllm.model_executor.models.utils import extract_layer_index + self.layer_id = extract_layer_index(self.layer_name) self.enable_eplb = enable_eplb diff --git a/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py index 26261200ce55..b0b864ae42da 100644 --- 
a/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py +++ b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import fcntl import logging from abc import ABC, abstractmethod @@ -163,9 +165,11 @@ def save_captured_experts(self, indices: np.ndarray) -> None: # Ensure device buffer exists if self._experts_capturer_device_buffer is None: raise RuntimeError("Device buffer not initialized.") - data = self._experts_capturer_device_buffer[ - :num_tokens, :, : - ].cpu().numpy() + data = ( + self._experts_capturer_device_buffer[:num_tokens, :, :] + .cpu() + .numpy() + ) # indices should be valid for host buffer self._host_buffer_view[indices, :, :] = data finally: diff --git a/vllm/outputs.py b/vllm/outputs.py index d4e5d86b3a06..26ec97081038 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -41,7 +41,7 @@ class CompletionOutput: index: int text: str token_ids: GenericSequence[int] - routed_experts: np.ndarray | None # [seq_len,layer_num,topk] + routed_experts: np.ndarray | None # [seq_len,layer_num,topk] cumulative_logprob: float | None logprobs: SampleLogprobs | None finish_reason: str | None = None diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 92099818498d..6fd62ac585b6 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -228,9 +228,9 @@ def __init__( self.perf_metrics = ModelMetrics(vllm_config) self.max_num_kv_tokens = ( - (kv_cache_config.num_blocks // len(kv_cache_config.kv_cache_groups) + 1) - * self.block_size - ) + kv_cache_config.num_blocks // len(kv_cache_config.kv_cache_groups) + 1 + ) * self.block_size + self.routed_experts_reader = RoutedExpertsReader.create( enable=self.vllm_config.model_config.enable_return_routed_experts ) @@ -1182,23 +1182,23 @@ def update_from_output( if self.vllm_config.model_config.enable_return_routed_experts: 
assert len(self.kv_cache_config.kv_cache_groups) == 1 - kv_blocks = self.kv_cache_manager.get_blocks(request.request_id) - block_ids = kv_blocks.get_block_ids()[0] - num_tokens = request.num_tokens-1 + kv_blocks = self.kv_cache_manager.get_blocks(request.request_id) + block_ids = kv_blocks.get_block_ids()[0] + num_tokens = request.num_tokens - 1 - # compute slot mapping - block_ids_array = np.array(block_ids, dtype=np.int32) - num_blocks = len(block_ids) - block_size = self.block_size + # compute slot mapping + block_ids_array = np.array(block_ids, dtype=np.int32) + num_blocks = len(block_ids) + block_size = self.block_size - # generate block offsets - block_offsets = np.arange(0, block_size) + # generate block offsets + block_offsets = np.arange(0, block_size) - # compute slot mapping: slot = block_id * block_size + offset - slot_mapping = ( - block_offsets.reshape((1, block_size)) - + block_ids_array.reshape((num_blocks, 1)) * block_size - ).flatten()[:num_tokens] + # compute slot mapping: slot = block_id * block_size + offset + slot_mapping = ( + block_offsets.reshape((1, block_size)) + + block_ids_array.reshape((num_blocks, 1)) * block_size + ).flatten()[:num_tokens] routed_experts = self.routed_experts_reader.get_routed_experts( indices=slot_mapping diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index f0ad0ae94da2..80bffa756e40 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -559,7 +559,7 @@ def process_outputs( finish_reason, stop_reason, kv_transfer_params, - routed_experts + routed_experts, ): if req_state.queue is not None: # AsyncLLM: put into queue for handling by generate(). 
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 464bd89097b1..d3fe8fcc0eae 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3498,7 +3498,9 @@ def propose_draft_token_ids(sampled_token_ids): self.model_config.enable_return_routed_experts and get_tensor_model_parallel_rank() == 0 ): - RoutedExpertsCapturer.get_instance().save_captured_experts(indices=self.slot_mapping) + RoutedExpertsCapturer.get_instance().save_captured_experts( + indices=self.slot_mapping + ) output = ModelRunnerOutput( req_ids=req_ids_output_copy, @@ -5672,12 +5674,15 @@ def init_routed_experts_capturer(self): self.model_config.enable_return_routed_experts ) block_size = self.cache_config.block_size - self.max_num_kv_tokens = ((self.kv_cache_config.num_blocks // len(self.kv_cache_config.kv_cache_groups)) + 1) * block_size + self.max_num_kv_tokens = ( + self.kv_cache_config.num_blocks // len(self.kv_cache_config.kv_cache_groups) + + 1 + ) * block_size routed_experts_capturer.init_buffer( max_num_batched_tokens=self.scheduler_config.max_num_batched_tokens, max_num_kv_tokens=self.max_num_kv_tokens, model_config=self.model_config, - enable_shared_memory= get_tensor_model_parallel_rank() == 0 + enable_shared_memory=get_tensor_model_parallel_rank() == 0, ) def may_add_encoder_only_layers_to_kv_cache_config(self) -> None: From fe28549a6114f9b332069d47c75b667355f70a94 Mon Sep 17 00:00:00 2001 From: xhx1022 <1737006628@qq.com> Date: Wed, 12 Nov 2025 06:57:15 +0000 Subject: [PATCH 04/33] merge main Signed-off-by: xhx1022 <1737006628@qq.com> --- vllm/model_executor/layers/fused_moe/layer.py | 6 +++--- vllm/v1/core/sched/scheduler.py | 2 +- vllm/v1/worker/gpu_model_runner.py | 12 +++++++----- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 0b9540898df6..b7f13fbca128 100644 --- 
a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -35,6 +35,9 @@ from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( init_aiter_topK_meta_data, ) +from vllm.model_executor.layers.fused_moe.routed_experts_capturer import ( + RoutedExpertsCapturer +) from vllm.model_executor.layers.fused_moe.routing_simulator import RoutingSimulator from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, @@ -48,9 +51,6 @@ direct_register_custom_op, ) from vllm.v1.worker.ubatching import dbo_current_ubatch_id -from vllm.model_executor.layers.fused_moe.routed_experts_capturer import ( - RoutedExpertsCapturer -) if current_platform.is_cuda_alike(): from .fused_moe import eplb_map_to_physical_and_record diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 6fd62ac585b6..2dbd8f39d418 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -57,7 +57,7 @@ from vllm.v1.spec_decode.metrics import SpecDecodingStats from vllm.v1.structured_output import StructuredOutputManager from vllm.v1.utils import record_function_or_nullcontext -from vllm.model_executor.layers.fused_moe.routed_experts_capturer import RoutedExpertsReader + logger = init_logger(__name__) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index d3fe8fcc0eae..02b9c9986bfa 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -31,6 +31,7 @@ get_layers_from_vllm_config, update_config, ) +from vllm.distributed import get_tensor_model_parallel_rank from vllm.distributed.ec_transfer import get_ec_transfer, has_ec_transfer from vllm.distributed.eplb.eplb_state import EplbState from vllm.distributed.kv_transfer import get_kv_transfer_group, has_kv_transfer_group @@ -54,6 +55,10 @@ MRotaryEmbedding, XDRotaryEmbedding, ) +from vllm.model_executor.layers.fused_moe.routed_experts_capturer import ( + 
RoutedExpertsCapturer +) +from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader from vllm.model_executor.models.interfaces import ( MultiModalEmbeddings, @@ -173,9 +178,6 @@ bind_kv_cache, sanity_check_mm_encoder_outputs, ) -from vllm.model_executor.layers.fused_moe.routed_experts_capturer import ( - RoutedExpertsCapturer) -from vllm.distributed import get_tensor_model_parallel_rank if TYPE_CHECKING: from vllm.model_executor.model_loader.tensorizer import TensorizerConfig @@ -3499,8 +3501,8 @@ def propose_draft_token_ids(sampled_token_ids): and get_tensor_model_parallel_rank() == 0 ): RoutedExpertsCapturer.get_instance().save_captured_experts( - indices=self.slot_mapping - ) + indices=self.slot_mapping + ) output = ModelRunnerOutput( req_ids=req_ids_output_copy, From 5bf3ef7428f82afb8d46e6cfc07baf24d265349a Mon Sep 17 00:00:00 2001 From: xhx1022 <1737006628@qq.com> Date: Wed, 12 Nov 2025 08:42:24 +0000 Subject: [PATCH 05/33] small fix Signed-off-by: xhx1022 <1737006628@qq.com> --- vllm/model_executor/layers/fused_moe/layer.py | 2 +- vllm/v1/worker/gpu_model_runner.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index b7f13fbca128..72924ee276b6 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -36,7 +36,7 @@ init_aiter_topK_meta_data, ) from vllm.model_executor.layers.fused_moe.routed_experts_capturer import ( - RoutedExpertsCapturer + RoutedExpertsCapturer, ) from vllm.model_executor.layers.fused_moe.routing_simulator import RoutingSimulator from vllm.model_executor.layers.quantization.base_config import ( diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 02b9c9986bfa..1bfc4c8e9d4d 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ 
b/vllm/v1/worker/gpu_model_runner.py @@ -56,7 +56,7 @@ XDRotaryEmbedding, ) from vllm.model_executor.layers.fused_moe.routed_experts_capturer import ( - RoutedExpertsCapturer + RoutedExpertsCapturer, ) from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader From 2119bad04011c0d8ab0afa38af8dffde4ad73667 Mon Sep 17 00:00:00 2001 From: xhx1022 <1737006628@qq.com> Date: Mon, 17 Nov 2025 08:50:58 +0000 Subject: [PATCH 06/33] update Signed-off-by: xhx1022 <1737006628@qq.com> --- .../fused_moe/routed_experts_capturer.py | 29 +++++++++++-------- vllm/v1/core/sched/scheduler.py | 1 + vllm/v1/worker/gpu_model_runner.py | 1 + 3 files changed, 19 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py index b0b864ae42da..70b3b33b84bd 100644 --- a/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py +++ b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py @@ -14,7 +14,8 @@ logger = logging.getLogger(__name__) -LOCK_FILE = "/tmp/vllm_routed_experts.lock" # Shared lock file path +LOCK_FILE_PREFIX = "/tmp/vllm_routed_experts" # Shared lock file path +BUFFER_PREFIX = "vllm_routed_experts_buffer" def lock_file(fp): @@ -48,8 +49,6 @@ def create(enable: bool) -> "RoutedExpertsCapturer": @staticmethod def get_instance() -> Optional["RoutedExpertsCapturer"]: - if _global_experts_capturer is None: - logger.info("Experts capturer not initialized.") return _global_experts_capturer @abstractmethod @@ -58,6 +57,7 @@ def init_buffer( max_num_batched_tokens: int, max_num_kv_tokens: int, model_config: ModelConfig, + instance_id: str, enable_shared_memory: bool, ) -> None: raise NotImplementedError @@ -88,6 +88,7 @@ def init_buffer( max_num_batched_tokens: int, max_num_kv_tokens: int, model_config: ModelConfig, + instance_id: str, enable_shared_memory: bool, ) 
-> None: if ( @@ -112,9 +113,10 @@ def init_buffer( model_config.hf_text_config.num_experts_per_tok, ) nbytes = int(np.prod(shape)) * np.dtype(np.int32).itemsize + self.lock_file = f"{LOCK_FILE_PREFIX}.{instance_id}.lock" # 创建共享内存 - with open(LOCK_FILE, "wb") as fp: + with open(self.lock_file, "wb") as fp: lock_file(fp) try: # If already exists, SharedMemory(create=True) would raise. @@ -122,7 +124,7 @@ def init_buffer( self._shm = shared_memory.SharedMemory( create=True, size=nbytes, - name="vllm_routed_experts_buffer", + name=f"{BUFFER_PREFIX}_{instance_id}", ) # 创建 numpy array 视图 @@ -157,7 +159,7 @@ def clear_buffer(self) -> None: def save_captured_experts(self, indices: np.ndarray) -> None: # Copy the entire batch from GPU to shared memory (via numpy view) - with open(LOCK_FILE, "wb+") as fp: + with open(self.lock_file, "wb+") as fp: lock_file(fp) try: if self._host_buffer_view is not None: @@ -193,6 +195,7 @@ def init_buffer( max_num_batched_tokens: int, max_num_kv_tokens: int, model_config: ModelConfig, + instance_id: str, enable_shared_memory: bool, ) -> None: return None @@ -230,7 +233,7 @@ def get_instance() -> Optional["RoutedExpertsReader"]: return _global_experts_reader @abstractmethod - def attach_buffer(self, max_num_kv_tokens: int, model_config: ModelConfig) -> None: + def attach_buffer(self, max_num_kv_tokens: int, model_config: ModelConfig, instance_id: str) -> None: raise NotImplementedError @abstractmethod @@ -245,7 +248,7 @@ def __init__(self) -> None: self._shm: shared_memory.SharedMemory | None = None self._host_buffer_view: np.ndarray | None = None - def attach_buffer(self, max_num_kv_tokens: int, model_config: ModelConfig) -> None: + def attach_buffer(self, max_num_kv_tokens: int, model_config: ModelConfig, instance_id: str) -> None: if self._shm is None: shape = ( max_num_kv_tokens, @@ -253,8 +256,10 @@ def attach_buffer(self, max_num_kv_tokens: int, model_config: ModelConfig) -> No model_config.hf_text_config.num_experts_per_tok, ) + 
self.lock_file = f"{LOCK_FILE_PREFIX}.{instance_id}.lock" + # Attach to existing shared memory - with open(LOCK_FILE, "rb+") as fp: + with open(self.lock_file, "rb+") as fp: lock_file(fp) try: # avoid resource_tracker registering the shared memory @@ -264,7 +269,7 @@ def attach_buffer(self, max_num_kv_tokens: int, model_config: ModelConfig) -> No ): # This will raise if the shared memory doesn't exist self._shm = shared_memory.SharedMemory( - name="vllm_routed_experts_buffer" + name=f"{BUFFER_PREFIX}_{instance_id}" ) self._host_buffer_view = np.ndarray( @@ -278,7 +283,7 @@ def get_routed_experts(self, indices: np.ndarray) -> np.ndarray | None: Read routed expert data from shared memory for the given request. """ - with open(LOCK_FILE, "rb+") as fp: + with open(self.lock_file, "rb+") as fp: lock_file(fp) try: if self._host_buffer_view is None: @@ -298,7 +303,7 @@ def __del__(self) -> None: class _RoutedExpertsReaderNoop(RoutedExpertsReader): - def attach_buffer(self, max_num_kv_tokens: int, model_config: ModelConfig) -> None: + def attach_buffer(self, max_num_kv_tokens: int, model_config: ModelConfig, instance_id: str) -> None: return None def get_routed_experts(self, indices: np.ndarray) -> np.ndarray | None: diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 2dbd8f39d418..7111357f6420 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -237,6 +237,7 @@ def __init__( self.routed_experts_reader.attach_buffer( max_num_kv_tokens=self.max_num_kv_tokens, model_config=self.vllm_config.model_config, + instance_id=self.vllm_config.instance_id, ) def schedule(self) -> SchedulerOutput: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 1bfc4c8e9d4d..f000585e14c1 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -5684,6 +5684,7 @@ def init_routed_experts_capturer(self): 
max_num_batched_tokens=self.scheduler_config.max_num_batched_tokens, max_num_kv_tokens=self.max_num_kv_tokens, model_config=self.model_config, + instance_id=self.vllm_config.instance_id, enable_shared_memory=get_tensor_model_parallel_rank() == 0, ) From 9567fe4d7422197eae503cb91a5665856370a924 Mon Sep 17 00:00:00 2001 From: xhx1022 <1737006628@qq.com> Date: Tue, 18 Nov 2025 09:28:10 +0000 Subject: [PATCH 07/33] update Signed-off-by: xhx1022 <1737006628@qq.com> --- .../layers/fused_moe/routed_experts_capturer.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py index 70b3b33b84bd..afdf7afd1ccd 100644 --- a/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py +++ b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py @@ -113,7 +113,7 @@ def init_buffer( model_config.hf_text_config.num_experts_per_tok, ) nbytes = int(np.prod(shape)) * np.dtype(np.int32).itemsize - self.lock_file = f"{LOCK_FILE_PREFIX}.{instance_id}.lock" + self.lock_file = f"{LOCK_FILE_PREFIX}_{instance_id}.lock" # 创建共享内存 with open(self.lock_file, "wb") as fp: @@ -233,7 +233,9 @@ def get_instance() -> Optional["RoutedExpertsReader"]: return _global_experts_reader @abstractmethod - def attach_buffer(self, max_num_kv_tokens: int, model_config: ModelConfig, instance_id: str) -> None: + def attach_buffer( + self, max_num_kv_tokens: int, model_config: ModelConfig, instance_id: str + ) -> None: raise NotImplementedError @abstractmethod @@ -248,7 +250,9 @@ def __init__(self) -> None: self._shm: shared_memory.SharedMemory | None = None self._host_buffer_view: np.ndarray | None = None - def attach_buffer(self, max_num_kv_tokens: int, model_config: ModelConfig, instance_id: str) -> None: + def attach_buffer( + self, max_num_kv_tokens: int, model_config: ModelConfig, instance_id: str + ) -> None: if self._shm is None: shape = ( 
max_num_kv_tokens, @@ -256,7 +260,7 @@ def attach_buffer(self, max_num_kv_tokens: int, model_config: ModelConfig, insta model_config.hf_text_config.num_experts_per_tok, ) - self.lock_file = f"{LOCK_FILE_PREFIX}.{instance_id}.lock" + self.lock_file = f"{LOCK_FILE_PREFIX}_{instance_id}.lock" # Attach to existing shared memory with open(self.lock_file, "rb+") as fp: @@ -303,7 +307,9 @@ def __del__(self) -> None: class _RoutedExpertsReaderNoop(RoutedExpertsReader): - def attach_buffer(self, max_num_kv_tokens: int, model_config: ModelConfig, instance_id: str) -> None: + def attach_buffer( + self, max_num_kv_tokens: int, model_config: ModelConfig, instance_id: str + ) -> None: return None def get_routed_experts(self, indices: np.ndarray) -> np.ndarray | None: From 58e39b3d2980929640222ed0bd2af26a1f403347 Mon Sep 17 00:00:00 2001 From: xhx1022 <1737006628@qq.com> Date: Tue, 25 Nov 2025 19:55:21 +0800 Subject: [PATCH 08/33] fixed verl sync mode Signed-off-by: xhx1022 <1737006628@qq.com> --- .../fused_moe/routed_experts_capturer.py | 42 +++++++++++-------- vllm/v1/core/sched/scheduler.py | 3 +- vllm/v1/worker/gpu_model_runner.py | 3 +- 3 files changed, 28 insertions(+), 20 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py index afdf7afd1ccd..1f64689f6bf2 100644 --- a/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py +++ b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py @@ -112,29 +112,35 @@ def init_buffer( model_config.hf_text_config.num_hidden_layers, model_config.hf_text_config.num_experts_per_tok, ) - nbytes = int(np.prod(shape)) * np.dtype(np.int32).itemsize + self.dest_size = int(np.prod(shape)) * np.dtype(np.int32).itemsize self.lock_file = f"{LOCK_FILE_PREFIX}_{instance_id}.lock" + self.shm_name = f"{BUFFER_PREFIX}_{instance_id}" - # 创建共享内存 with open(self.lock_file, "wb") as fp: lock_file(fp) try: - # If already exists, 
SharedMemory(create=True) would raise. - # We assume capturer creates it first. - self._shm = shared_memory.SharedMemory( - create=True, - size=nbytes, - name=f"{BUFFER_PREFIX}_{instance_id}", - ) - - # 创建 numpy array 视图 - self._host_buffer_view = np.ndarray( - shape, dtype=np.int32, buffer=self._shm.buf - ) - # 初始化为 0 - self._host_buffer_view.fill(0) - finally: - unlock_file(fp) + shm = shared_memory.SharedMemory(name=self.shm_name, create=True, size=self.dest_size) + except: + shm = shared_memory.SharedMemory(name=self.shm_name, create=False, size=self.dest_size) + + if shm.size != self.dest_size: + logger.warning(f"size not same, unlink shm {self.shm_name} and create again") + shm.close() + shm.unlink() + try: + shm = shared_memory.SharedMemory(name=self.shm_name, create=True, size=self.dest_size) + logger.info(f"create shm {self.shm_name}") + except: + shm = shared_memory.SharedMemory(name=self.shm_name, create=False, size=self.dest_size) + logger.info(f"link shm {self.shm_name}") + + self._shm = shm + self._host_buffer_view = np.ndarray( + shape, dtype=np.int32, buffer=self._shm.buf + ) + # init 0 + self._host_buffer_view.fill(0) + unlock_file(fp) # parameterized logging (avoid f-strings in logging) logger.debug( diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 7111357f6420..ae969ca77b21 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -234,10 +234,11 @@ def __init__( self.routed_experts_reader = RoutedExpertsReader.create( enable=self.vllm_config.model_config.enable_return_routed_experts ) + self.instance_id = f"rank_{vllm_config.parallel_config.rank // vllm_config.parallel_config.world_size}" self.routed_experts_reader.attach_buffer( max_num_kv_tokens=self.max_num_kv_tokens, model_config=self.vllm_config.model_config, - instance_id=self.vllm_config.instance_id, + instance_id=self.instance_id, ) def schedule(self) -> SchedulerOutput: diff --git a/vllm/v1/worker/gpu_model_runner.py 
b/vllm/v1/worker/gpu_model_runner.py index f000585e14c1..606b75c21a0c 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -5680,11 +5680,12 @@ def init_routed_experts_capturer(self): self.kv_cache_config.num_blocks // len(self.kv_cache_config.kv_cache_groups) + 1 ) * block_size + self.instance_id = f"rank_{self.vllm_config.parallel_config.rank // self.vllm_config.parallel_config.world_size}" routed_experts_capturer.init_buffer( max_num_batched_tokens=self.scheduler_config.max_num_batched_tokens, max_num_kv_tokens=self.max_num_kv_tokens, model_config=self.model_config, - instance_id=self.vllm_config.instance_id, + instance_id=self.instance_id, enable_shared_memory=get_tensor_model_parallel_rank() == 0, ) From 173220551e16aa31ddebbcba32154fbd314f8e62 Mon Sep 17 00:00:00 2001 From: xhx1022 <1737006628@qq.com> Date: Mon, 15 Dec 2025 07:21:28 +0000 Subject: [PATCH 09/33] update Signed-off-by: xhx1022 <1737006628@qq.com> --- .../fused_moe/routed_experts_capturer.py | 37 ++++++++++++------- vllm/v1/core/sched/scheduler.py | 5 ++- vllm/v1/worker/gpu_model_runner.py | 11 +++--- 3 files changed, 34 insertions(+), 19 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py index 1f64689f6bf2..dacf11053151 100644 --- a/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py +++ b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py @@ -101,7 +101,7 @@ def init_buffer( model_config.hf_text_config.num_hidden_layers, model_config.hf_text_config.num_experts_per_tok, ), - dtype=torch.int32, + dtype=torch.int16, device="cuda", ) @@ -112,31 +112,42 @@ def init_buffer( model_config.hf_text_config.num_hidden_layers, model_config.hf_text_config.num_experts_per_tok, ) - self.dest_size = int(np.prod(shape)) * np.dtype(np.int32).itemsize + self.dest_size = int(np.prod(shape)) * np.dtype(np.int16).itemsize self.lock_file = 
f"{LOCK_FILE_PREFIX}_{instance_id}.lock" self.shm_name = f"{BUFFER_PREFIX}_{instance_id}" with open(self.lock_file, "wb") as fp: lock_file(fp) try: - shm = shared_memory.SharedMemory(name=self.shm_name, create=True, size=self.dest_size) - except: - shm = shared_memory.SharedMemory(name=self.shm_name, create=False, size=self.dest_size) + shm = shared_memory.SharedMemory( + name=self.shm_name, create=True, size=self.dest_size + ) + except FileExistsError: + shm = shared_memory.SharedMemory( + name=self.shm_name, create=False, size=self.dest_size + ) if shm.size != self.dest_size: - logger.warning(f"size not same, unlink shm {self.shm_name} and create again") + logger.warning( + "Shared memory %s size mismatch; recreate", + self.shm_name, + ) shm.close() shm.unlink() try: - shm = shared_memory.SharedMemory(name=self.shm_name, create=True, size=self.dest_size) - logger.info(f"create shm {self.shm_name}") - except: - shm = shared_memory.SharedMemory(name=self.shm_name, create=False, size=self.dest_size) - logger.info(f"link shm {self.shm_name}") + shm = shared_memory.SharedMemory( + name=self.shm_name, create=True, size=self.dest_size + ) + logger.info("Create shared memory %s", self.shm_name) + except FileExistsError: + shm = shared_memory.SharedMemory( + name=self.shm_name, create=False, size=self.dest_size + ) + logger.info("Link shared memory %s", self.shm_name) self._shm = shm self._host_buffer_view = np.ndarray( - shape, dtype=np.int32, buffer=self._shm.buf + shape, dtype=np.int16, buffer=self._shm.buf ) # init 0 self._host_buffer_view.fill(0) @@ -283,7 +294,7 @@ def attach_buffer( ) self._host_buffer_view = np.ndarray( - shape, dtype=np.int32, buffer=self._shm.buf + shape, dtype=np.int16, buffer=self._shm.buf ) finally: unlock_file(fp) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index ae969ca77b21..0b9f1022c4c2 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -7,6 +7,7 @@ from typing import 
Any import numpy as np + from vllm import envs from vllm.compilation.cuda_graph import CUDAGraphStat from vllm.config import VllmConfig @@ -234,7 +235,9 @@ def __init__( self.routed_experts_reader = RoutedExpertsReader.create( enable=self.vllm_config.model_config.enable_return_routed_experts ) - self.instance_id = f"rank_{vllm_config.parallel_config.rank // vllm_config.parallel_config.world_size}" + rank = vllm_config.parallel_config.rank + world_size = vllm_config.parallel_config.world_size + self.instance_id = f"rank_{rank // world_size}" self.routed_experts_reader.attach_buffer( max_num_kv_tokens=self.max_num_kv_tokens, model_config=self.vllm_config.model_config, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 606b75c21a0c..4428b354d9f0 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -51,14 +51,13 @@ from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping, LoRAMappingType from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase +from vllm.model_executor.layers.fused_moe.routed_experts_capturer import ( + RoutedExpertsCapturer, +) from vllm.model_executor.layers.rotary_embedding import ( MRotaryEmbedding, XDRotaryEmbedding, ) -from vllm.model_executor.layers.fused_moe.routed_experts_capturer import ( - RoutedExpertsCapturer, -) -from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader from vllm.model_executor.models.interfaces import ( MultiModalEmbeddings, @@ -5680,7 +5679,9 @@ def init_routed_experts_capturer(self): self.kv_cache_config.num_blocks // len(self.kv_cache_config.kv_cache_groups) + 1 ) * block_size - self.instance_id = f"rank_{self.vllm_config.parallel_config.rank // self.vllm_config.parallel_config.world_size}" + rank = self.vllm_config.parallel_config.rank + world_size = self.vllm_config.parallel_config.world_size + 
self.instance_id = f"rank_{rank // world_size}" routed_experts_capturer.init_buffer( max_num_batched_tokens=self.scheduler_config.max_num_batched_tokens, max_num_kv_tokens=self.max_num_kv_tokens, From 27170a022db93821e1865530f25f8ca613faaaba Mon Sep 17 00:00:00 2001 From: xhx1022 <1737006628@qq.com> Date: Mon, 15 Dec 2025 09:49:30 +0000 Subject: [PATCH 10/33] update Signed-off-by: xhx1022 <1737006628@qq.com> --- .../gptq_marlin/kernel_bf16_kfe2m1f.cu | 69 ++++++++ .../gptq_marlin/kernel_bf16_kfe4m3fn.cu | 69 ++++++++ .../gptq_marlin/kernel_bf16_ku4.cu | 129 ++++++++++++++ .../gptq_marlin/kernel_bf16_ku4b8.cu | 159 ++++++++++++++++++ .../gptq_marlin/kernel_bf16_ku8b128.cu | 159 ++++++++++++++++++ .../gptq_marlin/kernel_fp16_kfe2m1f.cu | 39 +++++ .../gptq_marlin/kernel_fp16_kfe4m3fn.cu | 69 ++++++++ .../gptq_marlin/kernel_fp16_ku4.cu | 159 ++++++++++++++++++ .../gptq_marlin/kernel_fp16_ku4b8.cu | 159 ++++++++++++++++++ .../gptq_marlin/kernel_fp16_ku8b128.cu | 159 ++++++++++++++++++ 10 files changed, 1170 insertions(+) create mode 100644 csrc/quantization/gptq_marlin/kernel_bf16_kfe2m1f.cu create mode 100644 csrc/quantization/gptq_marlin/kernel_bf16_kfe4m3fn.cu create mode 100644 csrc/quantization/gptq_marlin/kernel_bf16_ku4.cu create mode 100644 csrc/quantization/gptq_marlin/kernel_bf16_ku4b8.cu create mode 100644 csrc/quantization/gptq_marlin/kernel_bf16_ku8b128.cu create mode 100644 csrc/quantization/gptq_marlin/kernel_fp16_kfe2m1f.cu create mode 100644 csrc/quantization/gptq_marlin/kernel_fp16_kfe4m3fn.cu create mode 100644 csrc/quantization/gptq_marlin/kernel_fp16_ku4.cu create mode 100644 csrc/quantization/gptq_marlin/kernel_fp16_ku4b8.cu create mode 100644 csrc/quantization/gptq_marlin/kernel_fp16_ku8b128.cu diff --git a/csrc/quantization/gptq_marlin/kernel_bf16_kfe2m1f.cu b/csrc/quantization/gptq_marlin/kernel_bf16_kfe2m1f.cu new file mode 100644 index 000000000000..cdd8472c4b48 --- /dev/null +++ b/csrc/quantization/gptq_marlin/kernel_bf16_kfe2m1f.cu @@ -0,0 
+1,69 @@ +// auto generated by generate.py +// clang-format off + +#include "kernel.h" +#include "marlin_template.h" + +namespace MARLIN_NAMESPACE_NAME { + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +} diff --git 
a/csrc/quantization/gptq_marlin/kernel_bf16_kfe4m3fn.cu b/csrc/quantization/gptq_marlin/kernel_bf16_kfe4m3fn.cu new file mode 100644 index 000000000000..8128dbb570bf --- /dev/null +++ b/csrc/quantization/gptq_marlin/kernel_bf16_kfe4m3fn.cu @@ -0,0 +1,69 @@ +// auto generated by generate.py +// clang-format off + +#include "kernel.h" +#include "marlin_template.h" + +namespace MARLIN_NAMESPACE_NAME { + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( 
MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +} diff --git a/csrc/quantization/gptq_marlin/kernel_bf16_ku4.cu b/csrc/quantization/gptq_marlin/kernel_bf16_ku4.cu new file mode 100644 index 000000000000..87ca117660ee --- /dev/null +++ b/csrc/quantization/gptq_marlin/kernel_bf16_ku4.cu @@ -0,0 +1,129 @@ +// auto generated by generate.py +// clang-format off + +#include "kernel.h" +#include "marlin_template.h" + +namespace MARLIN_NAMESPACE_NAME { + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template 
__global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void 
Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +} diff --git a/csrc/quantization/gptq_marlin/kernel_bf16_ku4b8.cu b/csrc/quantization/gptq_marlin/kernel_bf16_ku4b8.cu new file mode 100644 index 000000000000..9c0e8dacb578 --- /dev/null +++ b/csrc/quantization/gptq_marlin/kernel_bf16_ku4b8.cu @@ -0,0 +1,159 @@ +// auto generated by generate.py +// clang-format off + +#include "kernel.h" +#include "marlin_template.h" + +namespace MARLIN_NAMESPACE_NAME { + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS 
); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template 
__global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +} diff --git a/csrc/quantization/gptq_marlin/kernel_bf16_ku8b128.cu b/csrc/quantization/gptq_marlin/kernel_bf16_ku8b128.cu new file mode 100644 index 000000000000..ac13dd97a3ed --- /dev/null +++ b/csrc/quantization/gptq_marlin/kernel_bf16_ku8b128.cu @@ -0,0 +1,159 @@ +// auto generated by generate.py +// clang-format off + +#include "kernel.h" +#include "marlin_template.h" + +namespace MARLIN_NAMESPACE_NAME { + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( 
MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( 
MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( 
MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +} diff --git a/csrc/quantization/gptq_marlin/kernel_fp16_kfe2m1f.cu b/csrc/quantization/gptq_marlin/kernel_fp16_kfe2m1f.cu new file mode 100644 index 000000000000..aba349be6736 --- /dev/null +++ b/csrc/quantization/gptq_marlin/kernel_fp16_kfe2m1f.cu @@ -0,0 +1,39 @@ +// auto generated by generate.py +// clang-format off + +#include "kernel.h" +#include "marlin_template.h" + +namespace MARLIN_NAMESPACE_NAME { + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +} diff --git a/csrc/quantization/gptq_marlin/kernel_fp16_kfe4m3fn.cu b/csrc/quantization/gptq_marlin/kernel_fp16_kfe4m3fn.cu new file mode 100644 index 000000000000..25934db9d5de --- /dev/null +++ b/csrc/quantization/gptq_marlin/kernel_fp16_kfe4m3fn.cu @@ -0,0 +1,69 @@ +// auto generated by generate.py +// clang-format off + +#include "kernel.h" +#include "marlin_template.h" + +namespace MARLIN_NAMESPACE_NAME { + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + 
+template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +} diff --git a/csrc/quantization/gptq_marlin/kernel_fp16_ku4.cu b/csrc/quantization/gptq_marlin/kernel_fp16_ku4.cu new file mode 100644 index 000000000000..3d81ae7674f9 --- /dev/null +++ b/csrc/quantization/gptq_marlin/kernel_fp16_ku4.cu @@ -0,0 +1,159 @@ +// auto generated by generate.py +// clang-format off + +#include "kernel.h" +#include "marlin_template.h" + +namespace 
MARLIN_NAMESPACE_NAME { + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( 
MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( 
MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +} diff --git a/csrc/quantization/gptq_marlin/kernel_fp16_ku4b8.cu b/csrc/quantization/gptq_marlin/kernel_fp16_ku4b8.cu new file mode 100644 index 000000000000..87b9b4a4c36c --- /dev/null +++ b/csrc/quantization/gptq_marlin/kernel_fp16_ku4b8.cu @@ -0,0 +1,159 @@ +// auto generated by generate.py +// clang-format off + +#include "kernel.h" +#include "marlin_template.h" + +namespace MARLIN_NAMESPACE_NAME { + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + 
+template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template 
__global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +} diff --git a/csrc/quantization/gptq_marlin/kernel_fp16_ku8b128.cu b/csrc/quantization/gptq_marlin/kernel_fp16_ku8b128.cu new file mode 100644 index 000000000000..7f1e3fab3ed7 --- /dev/null +++ b/csrc/quantization/gptq_marlin/kernel_fp16_ku8b128.cu @@ -0,0 +1,159 @@ +// auto generated by generate.py +// clang-format off + +#include "kernel.h" +#include "marlin_template.h" + +namespace MARLIN_NAMESPACE_NAME { + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( 
MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( 
MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( 
MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); + +} From 4cb1aa82ed2a3ff02cbd63e37f937699270273a9 Mon Sep 17 00:00:00 2001 From: xhx1022 <1737006628@qq.com> Date: Mon, 15 Dec 2025 09:51:03 +0000 Subject: [PATCH 11/33] Revert "update" This reverts commit 33cbc736fb7cc10d34a8130e09561e68dfc2316d. Signed-off-by: xhx1022 <1737006628@qq.com> --- .../gptq_marlin/kernel_bf16_kfe2m1f.cu | 69 -------- .../gptq_marlin/kernel_bf16_kfe4m3fn.cu | 69 -------- .../gptq_marlin/kernel_bf16_ku4.cu | 129 -------------- .../gptq_marlin/kernel_bf16_ku4b8.cu | 159 ------------------ .../gptq_marlin/kernel_bf16_ku8b128.cu | 159 ------------------ .../gptq_marlin/kernel_fp16_kfe2m1f.cu | 39 ----- .../gptq_marlin/kernel_fp16_kfe4m3fn.cu | 69 -------- .../gptq_marlin/kernel_fp16_ku4.cu | 159 ------------------ .../gptq_marlin/kernel_fp16_ku4b8.cu | 159 ------------------ .../gptq_marlin/kernel_fp16_ku8b128.cu | 159 ------------------ 10 files changed, 1170 deletions(-) delete mode 100644 csrc/quantization/gptq_marlin/kernel_bf16_kfe2m1f.cu delete mode 100644 csrc/quantization/gptq_marlin/kernel_bf16_kfe4m3fn.cu delete mode 100644 csrc/quantization/gptq_marlin/kernel_bf16_ku4.cu delete mode 100644 csrc/quantization/gptq_marlin/kernel_bf16_ku4b8.cu delete mode 100644 csrc/quantization/gptq_marlin/kernel_bf16_ku8b128.cu delete mode 100644 csrc/quantization/gptq_marlin/kernel_fp16_kfe2m1f.cu delete mode 100644 csrc/quantization/gptq_marlin/kernel_fp16_kfe4m3fn.cu delete mode 100644 csrc/quantization/gptq_marlin/kernel_fp16_ku4.cu delete mode 100644 csrc/quantization/gptq_marlin/kernel_fp16_ku4b8.cu delete 
mode 100644 csrc/quantization/gptq_marlin/kernel_fp16_ku8b128.cu diff --git a/csrc/quantization/gptq_marlin/kernel_bf16_kfe2m1f.cu b/csrc/quantization/gptq_marlin/kernel_bf16_kfe2m1f.cu deleted file mode 100644 index cdd8472c4b48..000000000000 --- a/csrc/quantization/gptq_marlin/kernel_bf16_kfe2m1f.cu +++ /dev/null @@ -1,69 +0,0 @@ -// auto generated by generate.py -// clang-format off - -#include "kernel.h" -#include "marlin_template.h" - -namespace MARLIN_NAMESPACE_NAME { - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template 
__global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -} diff --git a/csrc/quantization/gptq_marlin/kernel_bf16_kfe4m3fn.cu b/csrc/quantization/gptq_marlin/kernel_bf16_kfe4m3fn.cu deleted file mode 100644 index 8128dbb570bf..000000000000 --- a/csrc/quantization/gptq_marlin/kernel_bf16_kfe4m3fn.cu +++ /dev/null @@ -1,69 +0,0 @@ -// auto generated by generate.py -// clang-format off - -#include "kernel.h" -#include "marlin_template.h" - -namespace MARLIN_NAMESPACE_NAME { - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void 
Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -} diff --git a/csrc/quantization/gptq_marlin/kernel_bf16_ku4.cu b/csrc/quantization/gptq_marlin/kernel_bf16_ku4.cu deleted file mode 100644 index 87ca117660ee..000000000000 --- a/csrc/quantization/gptq_marlin/kernel_bf16_ku4.cu +++ /dev/null @@ -1,129 +0,0 @@ -// auto generated by generate.py -// clang-format off - -#include "kernel.h" -#include "marlin_template.h" - -namespace MARLIN_NAMESPACE_NAME { - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); 
- -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template 
__global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -} diff --git a/csrc/quantization/gptq_marlin/kernel_bf16_ku4b8.cu b/csrc/quantization/gptq_marlin/kernel_bf16_ku4b8.cu deleted file mode 100644 index 9c0e8dacb578..000000000000 --- a/csrc/quantization/gptq_marlin/kernel_bf16_ku4b8.cu +++ /dev/null @@ -1,159 +0,0 @@ -// auto generated by generate.py -// clang-format off - -#include "kernel.h" -#include "marlin_template.h" - -namespace MARLIN_NAMESPACE_NAME { - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( 
MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( 
MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -} diff --git a/csrc/quantization/gptq_marlin/kernel_bf16_ku8b128.cu b/csrc/quantization/gptq_marlin/kernel_bf16_ku8b128.cu deleted file mode 100644 index ac13dd97a3ed..000000000000 --- a/csrc/quantization/gptq_marlin/kernel_bf16_ku8b128.cu +++ /dev/null @@ -1,159 +0,0 @@ -// auto generated by generate.py -// clang-format off - -#include "kernel.h" -#include "marlin_template.h" - -namespace MARLIN_NAMESPACE_NAME { - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( 
MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( 
MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( 
MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -} diff --git a/csrc/quantization/gptq_marlin/kernel_fp16_kfe2m1f.cu b/csrc/quantization/gptq_marlin/kernel_fp16_kfe2m1f.cu deleted file mode 100644 index aba349be6736..000000000000 --- a/csrc/quantization/gptq_marlin/kernel_fp16_kfe2m1f.cu +++ /dev/null @@ -1,39 +0,0 @@ -// auto generated by generate.py -// clang-format off - -#include "kernel.h" -#include "marlin_template.h" - -namespace MARLIN_NAMESPACE_NAME { - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -} diff --git a/csrc/quantization/gptq_marlin/kernel_fp16_kfe4m3fn.cu b/csrc/quantization/gptq_marlin/kernel_fp16_kfe4m3fn.cu deleted file mode 100644 index 25934db9d5de..000000000000 --- 
a/csrc/quantization/gptq_marlin/kernel_fp16_kfe4m3fn.cu +++ /dev/null @@ -1,69 +0,0 @@ -// auto generated by generate.py -// clang-format off - -#include "kernel.h" -#include "marlin_template.h" - -namespace MARLIN_NAMESPACE_NAME { - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void 
Marlin( MARLIN_KERNEL_PARAMS ); - -} diff --git a/csrc/quantization/gptq_marlin/kernel_fp16_ku4.cu b/csrc/quantization/gptq_marlin/kernel_fp16_ku4.cu deleted file mode 100644 index 3d81ae7674f9..000000000000 --- a/csrc/quantization/gptq_marlin/kernel_fp16_ku4.cu +++ /dev/null @@ -1,159 +0,0 @@ -// auto generated by generate.py -// clang-format off - -#include "kernel.h" -#include "marlin_template.h" - -namespace MARLIN_NAMESPACE_NAME { - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); 
- -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template 
__global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -} diff --git a/csrc/quantization/gptq_marlin/kernel_fp16_ku4b8.cu b/csrc/quantization/gptq_marlin/kernel_fp16_ku4b8.cu deleted file mode 100644 index 87b9b4a4c36c..000000000000 --- a/csrc/quantization/gptq_marlin/kernel_fp16_ku4b8.cu +++ /dev/null @@ -1,159 +0,0 @@ -// auto generated by generate.py -// clang-format off - -#include "kernel.h" -#include "marlin_template.h" - -namespace MARLIN_NAMESPACE_NAME { - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( 
MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( 
MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -} diff --git a/csrc/quantization/gptq_marlin/kernel_fp16_ku8b128.cu 
b/csrc/quantization/gptq_marlin/kernel_fp16_ku8b128.cu deleted file mode 100644 index 7f1e3fab3ed7..000000000000 --- a/csrc/quantization/gptq_marlin/kernel_fp16_ku8b128.cu +++ /dev/null @@ -1,159 +0,0 @@ -// auto generated by generate.py -// clang-format off - -#include "kernel.h" -#include "marlin_template.h" - -namespace MARLIN_NAMESPACE_NAME { - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void 
Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( 
MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -} From 2cb3259e83f9d38be06687118a25489d965ddd02 Mon Sep 17 00:00:00 2001 From: xhx1022 <1737006628@qq.com> Date: Tue, 16 Dec 2025 07:37:15 +0000 Subject: [PATCH 12/33] fixed bugs Signed-off-by: xhx1022 <1737006628@qq.com> --- vllm/outputs.py | 2 +- vllm/v1/core/sched/scheduler.py | 11 ++++++++--- vllm/v1/worker/gpu_model_runner.py | 11 ++++++++--- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/vllm/outputs.py b/vllm/outputs.py index 26ec97081038..cf23745c447d 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -41,9 +41,9 @@ class CompletionOutput: index: int text: str token_ids: GenericSequence[int] - routed_experts: np.ndarray | None # [seq_len,layer_num,topk] cumulative_logprob: float | None logprobs: SampleLogprobs | None + routed_experts: np.ndarray | None = None # [seq_len,layer_num,topk] finish_reason: str | None = None stop_reason: int | str | None = None lora_request: LoRARequest | None = None diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 0b9f1022c4c2..b46fc39b568b 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ 
-235,9 +235,14 @@ def __init__( self.routed_experts_reader = RoutedExpertsReader.create( enable=self.vllm_config.model_config.enable_return_routed_experts ) - rank = vllm_config.parallel_config.rank - world_size = vllm_config.parallel_config.world_size - self.instance_id = f"rank_{rank // world_size}" + + if ":" in self.vllm_config.instance_id: # for async mode in verl + self.instance_id = self.vllm_config.instance_id.rsplit(":", 1)[-1] + else: # sync mode in verl + rank = self.vllm_config.parallel_config.rank + world_size = self.vllm_config.parallel_config.world_size + self.instance_id = f"rank_{rank // world_size}" + self.routed_experts_reader.attach_buffer( max_num_kv_tokens=self.max_num_kv_tokens, model_config=self.vllm_config.model_config, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 4428b354d9f0..8135ce6e6650 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -5679,9 +5679,14 @@ def init_routed_experts_capturer(self): self.kv_cache_config.num_blocks // len(self.kv_cache_config.kv_cache_groups) + 1 ) * block_size - rank = self.vllm_config.parallel_config.rank - world_size = self.vllm_config.parallel_config.world_size - self.instance_id = f"rank_{rank // world_size}" + + if ":" in self.vllm_config.instance_id: # for async mode in verl + self.instance_id = self.vllm_config.instance_id.rsplit(":", 1)[-1] + else: # sync mode in verl + rank = self.vllm_config.parallel_config.rank + world_size = self.vllm_config.parallel_config.world_size + self.instance_id = f"rank_{rank // world_size}" + routed_experts_capturer.init_buffer( max_num_batched_tokens=self.scheduler_config.max_num_batched_tokens, max_num_kv_tokens=self.max_num_kv_tokens, From 8df9d70aaf88c89749306864cb2b0b5b42b380b4 Mon Sep 17 00:00:00 2001 From: xhx1022 <1737006628@qq.com> Date: Tue, 16 Dec 2025 08:17:11 +0000 Subject: [PATCH 13/33] hybraid atten Signed-off-by: xhx1022 <1737006628@qq.com> --- 
vllm/v1/core/sched/scheduler.py | 2 -- vllm/v1/worker/gpu_model_runner.py | 3 +-- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index b46fc39b568b..7bb395fe5bd0 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -1190,8 +1190,6 @@ def update_from_output( routed_experts = None if stopped: if self.vllm_config.model_config.enable_return_routed_experts: - assert len(self.kv_cache_config.kv_cache_groups) == 1 - kv_blocks = self.kv_cache_manager.get_blocks(request.request_id) block_ids = kv_blocks.get_block_ids()[0] num_tokens = request.num_tokens - 1 diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 8135ce6e6650..2a0660cfe03f 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1629,8 +1629,7 @@ def _get_block_table_and_slot_mapping(kv_cache_gid: int): blk_table_tensor = blk_table.get_device_tensor(num_reqs_padded) slot_mapping = blk_table.slot_mapping.gpu[:num_tokens_padded] - if self.model_config.enable_return_routed_experts: - assert len(self.kv_cache_config.kv_cache_groups) == 1 + if self.model_config.enable_return_routed_experts and kv_cache_gid == 0: self.slot_mapping = slot_mapping.cpu().numpy() # Fill unused with -1. Needed for reshape_and_cache in full cuda From bfa1b2bb3d9d34b543e538070ba3fed19e3e2bd0 Mon Sep 17 00:00:00 2001 From: xhx1022 <1737006628@qq.com> Date: Tue, 16 Dec 2025 08:28:06 +0000 Subject: [PATCH 14/33] format Signed-off-by: xhx1022 <1737006628@qq.com> --- vllm/v1/worker/gpu_model_runner.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 2a0660cfe03f..63940e700aaa 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3119,7 +3119,11 @@ def execute_model( "after execute_model() returns None." 
) - RoutedExpertsCapturer.get_instance().clear_buffer() + capturer = RoutedExpertsCapturer.get_instance() + if capturer is not None: + capturer.clear_buffer() # noqa + else: + logger.error("RoutedExpertsCapturer not initialized.") if scheduler_output.preempted_req_ids and has_kv_transfer_group(): get_kv_transfer_group().handle_preemptions( @@ -3498,9 +3502,11 @@ def propose_draft_token_ids(sampled_token_ids): self.model_config.enable_return_routed_experts and get_tensor_model_parallel_rank() == 0 ): - RoutedExpertsCapturer.get_instance().save_captured_experts( - indices=self.slot_mapping - ) + capturer = RoutedExpertsCapturer.get_instance() + if capturer is not None: + capturer.save_captured_experts(indices=self.slot_mapping) # noqa + else: + logger.error("RoutedExpertsCapturer not initialized.") output = ModelRunnerOutput( req_ids=req_ids_output_copy, From b4f321e3a7311d1ae0e4918bc1bbe4c2646d1f52 Mon Sep 17 00:00:00 2001 From: xhx1022 <1737006628@qq.com> Date: Tue, 16 Dec 2025 11:32:01 +0000 Subject: [PATCH 15/33] Fix NCCL incompatibility by switching routed experts dtype to int32 Signed-off-by: xhx1022 <1737006628@qq.com> --- .../layers/fused_moe/routed_experts_capturer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py index dacf11053151..07f987da3db4 100644 --- a/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py +++ b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py @@ -101,7 +101,7 @@ def init_buffer( model_config.hf_text_config.num_hidden_layers, model_config.hf_text_config.num_experts_per_tok, ), - dtype=torch.int16, + dtype=torch.int32, device="cuda", ) @@ -112,7 +112,7 @@ def init_buffer( model_config.hf_text_config.num_hidden_layers, model_config.hf_text_config.num_experts_per_tok, ) - self.dest_size = int(np.prod(shape)) * np.dtype(np.int16).itemsize + self.dest_size = 
int(np.prod(shape)) * np.dtype(np.int32).itemsize self.lock_file = f"{LOCK_FILE_PREFIX}_{instance_id}.lock" self.shm_name = f"{BUFFER_PREFIX}_{instance_id}" @@ -147,7 +147,7 @@ def init_buffer( self._shm = shm self._host_buffer_view = np.ndarray( - shape, dtype=np.int16, buffer=self._shm.buf + shape, dtype=np.int32, buffer=self._shm.buf ) # init 0 self._host_buffer_view.fill(0) @@ -294,7 +294,7 @@ def attach_buffer( ) self._host_buffer_view = np.ndarray( - shape, dtype=np.int16, buffer=self._shm.buf + shape, dtype=np.int32, buffer=self._shm.buf ) finally: unlock_file(fp) From 1cad5bcc70c99a92dcf51cffdb00c9fd88e1b412 Mon Sep 17 00:00:00 2001 From: xhx1022 <1737006628@qq.com> Date: Sun, 21 Dec 2025 13:00:41 +0000 Subject: [PATCH 16/33] format Signed-off-by: xhx1022 <1737006628@qq.com> --- vllm/v1/engine/output_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 80bffa756e40..f077c7a47649 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -8,9 +8,9 @@ from typing import Any, cast import numpy as np -from vllm.lora.request import LoRARequest import torch +from vllm.lora.request import LoRARequest from vllm.outputs import ( CompletionOutput, PoolingOutput, From 322d7ceac634decdc45b71adb7c8e049b1769e81 Mon Sep 17 00:00:00 2001 From: xhx1022 <1737006628@qq.com> Date: Fri, 26 Dec 2025 03:32:58 +0000 Subject: [PATCH 17/33] add ack Signed-off-by: xhx1022 <1737006628@qq.com> --- .../model_executor/layers/fused_moe/routed_experts_capturer.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py index 07f987da3db4..04990551e5f1 100644 --- a/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py +++ b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py @@ -1,5 +1,8 @@ # 
SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Adapted from +# https://github.com/zhuyijie88/sglang/commit/85fdb8086a76bf42880d06cd6e34ee67e4517a39/python/sglang/srt/layers/moe/routed_experts_capturer.py + import fcntl import logging from abc import ABC, abstractmethod From 80bcce90a72afd4cb623050833688eb374dbf207 Mon Sep 17 00:00:00 2001 From: xhx1022 <1737006628@qq.com> Date: Fri, 26 Dec 2025 03:40:27 +0000 Subject: [PATCH 18/33] update ack Signed-off-by: xhx1022 <1737006628@qq.com> --- vllm/model_executor/layers/fused_moe/routed_experts_capturer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py index 04990551e5f1..8e7b8e578275 100644 --- a/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py +++ b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from -# https://github.com/zhuyijie88/sglang/commit/85fdb8086a76bf42880d06cd6e34ee67e4517a39/python/sglang/srt/layers/moe/routed_experts_capturer.py +# https://github.com/sgl-project/sglang/blob/bed301a5acaa9577c9aa706468bdf242f6a43051/python/sglang/srt/layers/moe/routed_experts_capturer.py import fcntl import logging From 7406227bc0b0604e575e250f7e4805d19a71f705 Mon Sep 17 00:00:00 2001 From: xhx1022 <1737006628@qq.com> Date: Fri, 26 Dec 2025 07:40:18 +0000 Subject: [PATCH 19/33] Compatible with MTP Signed-off-by: xhx1022 <1737006628@qq.com> --- vllm/model_executor/layers/fused_moe/routed_experts_capturer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py index 8e7b8e578275..bdb3c13bb974 100644 --- 
a/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py +++ b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py @@ -170,6 +170,8 @@ def capture(self, layer_id: int, topk_ids: torch.Tensor) -> None: if self._experts_capturer_device_buffer is None: raise RuntimeError("Buffer not initialized.") batch_size, num_routed_experts = topk_ids.shape + if layer_id >= self._experts_capturer_device_buffer.shape[1]: + return # copy into device buffer (ensure shapes are compatible) self._experts_capturer_device_buffer[:batch_size, layer_id, :] = topk_ids From fd46e9a8f5ad6e96f2a0ac02f7a85c6c8a938347 Mon Sep 17 00:00:00 2001 From: xhx1022 <1737006628@qq.com> Date: Tue, 30 Dec 2025 19:28:22 +0800 Subject: [PATCH 20/33] fixed some bug Signed-off-by: xhx1022 <1737006628@qq.com> --- vllm/v1/core/sched/scheduler.py | 2 +- vllm/v1/worker/gpu_model_runner.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 7bb395fe5bd0..8018d8c6dcb7 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -237,7 +237,7 @@ def __init__( ) if ":" in self.vllm_config.instance_id: # for async mode in verl - self.instance_id = self.vllm_config.instance_id.rsplit(":", 1)[-1] + self.instance_id = self.vllm_config.instance_id else: # sync mode in verl rank = self.vllm_config.parallel_config.rank world_size = self.vllm_config.parallel_config.world_size diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 63940e700aaa..1795441256ac 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -5686,7 +5686,7 @@ def init_routed_experts_capturer(self): ) * block_size if ":" in self.vllm_config.instance_id: # for async mode in verl - self.instance_id = self.vllm_config.instance_id.rsplit(":", 1)[-1] + self.instance_id = self.vllm_config.instance_id else: # sync mode in verl rank = 
self.vllm_config.parallel_config.rank world_size = self.vllm_config.parallel_config.world_size From bb059c421e146cc2e423521dd6f6fd0f1793366d Mon Sep 17 00:00:00 2001 From: xhx1022 <1737006628@qq.com> Date: Tue, 6 Jan 2026 14:47:15 +0800 Subject: [PATCH 21/33] update code Signed-off-by: xhx1022 <1737006628@qq.com> --- vllm/v1/core/sched/scheduler.py | 7 +------ vllm/v1/worker/gpu_model_runner.py | 7 +------ 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 8018d8c6dcb7..c37d70cab06c 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -236,12 +236,7 @@ def __init__( enable=self.vllm_config.model_config.enable_return_routed_experts ) - if ":" in self.vllm_config.instance_id: # for async mode in verl - self.instance_id = self.vllm_config.instance_id - else: # sync mode in verl - rank = self.vllm_config.parallel_config.rank - world_size = self.vllm_config.parallel_config.world_size - self.instance_id = f"rank_{rank // world_size}" + self.instance_id = self.vllm_config.instance_id self.routed_experts_reader.attach_buffer( max_num_kv_tokens=self.max_num_kv_tokens, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 1795441256ac..669308772787 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -5685,12 +5685,7 @@ def init_routed_experts_capturer(self): + 1 ) * block_size - if ":" in self.vllm_config.instance_id: # for async mode in verl - self.instance_id = self.vllm_config.instance_id - else: # sync mode in verl - rank = self.vllm_config.parallel_config.rank - world_size = self.vllm_config.parallel_config.world_size - self.instance_id = f"rank_{rank // world_size}" + self.instance_id = self.vllm_config.instance_id routed_experts_capturer.init_buffer( max_num_batched_tokens=self.scheduler_config.max_num_batched_tokens, From 45da3adb5a45c48ae85971a5526cb74377071690 Mon Sep 17 
00:00:00 2001 From: xhx1022 <1737006628@qq.com> Date: Wed, 7 Jan 2026 10:11:10 +0800 Subject: [PATCH 22/33] update code Signed-off-by: xhx1022 <1737006628@qq.com> --- vllm/v1/engine/output_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index f077c7a47649..b395ea21e28a 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -268,7 +268,7 @@ def make_request_output( external_req_id = self.parent_req.external_req_id return self._new_request_output( - external_req_id, outputs, finished, kv_transfer_params,routed_experts + external_req_id, outputs, finished, kv_transfer_params, routed_experts ) def _new_request_output( From e4ee3ecfa8500c276be3ba301a1b428acb16ee75 Mon Sep 17 00:00:00 2001 From: xhx1022 <1737006628@qq.com> Date: Wed, 7 Jan 2026 10:52:38 +0800 Subject: [PATCH 23/33] update code Signed-off-by: xhx1022 <1737006628@qq.com> --- vllm/v1/engine/output_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index b395ea21e28a..f461e56fff07 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -268,7 +268,7 @@ def make_request_output( external_req_id = self.parent_req.external_req_id return self._new_request_output( - external_req_id, outputs, finished, kv_transfer_params, routed_experts + external_req_id, outputs, finished, kv_transfer_params ) def _new_request_output( From d675090bfbaffe1a3dc974cdadcc098a2e6d21a5 Mon Sep 17 00:00:00 2001 From: Hongxin Xu <70438206+xhx1022@users.noreply.github.com> Date: Fri, 9 Jan 2026 09:55:45 +0800 Subject: [PATCH 24/33] Update vllm/model_executor/layers/fused_moe/layer.py Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com> Signed-off-by: Hongxin Xu <70438206+xhx1022@users.noreply.github.com> --- vllm/model_executor/layers/fused_moe/layer.py 
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 72924ee276b6..894787498993 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -710,7 +710,7 @@ def shared_experts(self) -> torch.nn.Module | None: return None @property - def get_layer_id(self): + def layer_id(self): return self.layer_id @property From fd3ed0570f4426e25ba05163477799aa6880faa7 Mon Sep 17 00:00:00 2001 From: Hongxin Xu <70438206+xhx1022@users.noreply.github.com> Date: Fri, 9 Jan 2026 09:57:13 +0800 Subject: [PATCH 25/33] Update vllm/model_executor/layers/fused_moe/routed_experts_capturer.py Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com> Signed-off-by: Hongxin Xu <70438206+xhx1022@users.noreply.github.com> --- .../model_executor/layers/fused_moe/routed_experts_capturer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py index bdb3c13bb974..2f037d41c76a 100644 --- a/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py +++ b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py @@ -17,7 +17,8 @@ logger = logging.getLogger(__name__) -LOCK_FILE_PREFIX = "/tmp/vllm_routed_experts" # Shared lock file path +tmp_dir = tempfile.gettempdir() +LOCK_FILE_PREFIX = os.path.join(tmp_dir, "vllm_routed_experts") # Shared lock file path BUFFER_PREFIX = "vllm_routed_experts_buffer" From d3b0d20218496f1501fb52ea081c5c92def2566a Mon Sep 17 00:00:00 2001 From: xhx1022 <1737006628@qq.com> Date: Fri, 9 Jan 2026 10:26:46 +0800 Subject: [PATCH 26/33] update code Signed-off-by: xhx1022 <1737006628@qq.com> --- vllm/model_executor/layers/fused_moe/layer.py | 10 ++++------ .../layers/fused_moe/routed_experts_capturer.py | 2 ++ 2 files changed, 6 insertions(+), 6 
deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 894787498993..bfe5ea6171bd 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -439,11 +439,6 @@ def __init__( compilation_config.static_forward_context[prefix] = self self.layer_name = prefix - # Delayed import to avoid circular dependency - from vllm.model_executor.models.utils import extract_layer_index - - self.layer_id = extract_layer_index(self.layer_name) - self.enable_eplb = enable_eplb self.expert_load_view: torch.Tensor | None = None self.logical_to_physical_map: torch.Tensor | None = None @@ -711,7 +706,10 @@ def shared_experts(self) -> torch.nn.Module | None: @property def layer_id(self): - return self.layer_id + # Delayed import to avoid circular dependency + from vllm.model_executor.models.utils import extract_layer_index + + return extract_layer_index(self.layer_name) @property def gate(self) -> torch.nn.Module | None: diff --git a/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py index 2f037d41c76a..d916caa572af 100644 --- a/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py +++ b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py @@ -5,6 +5,8 @@ import fcntl import logging +import os +import tempfile from abc import ABC, abstractmethod from multiprocessing import shared_memory from typing import Optional From b5054647e4e7dcce435ab1f7da562f686599ca20 Mon Sep 17 00:00:00 2001 From: xhx1022 <1737006628@qq.com> Date: Sat, 10 Jan 2026 18:05:32 +0800 Subject: [PATCH 27/33] fixed slot_mapping bug Signed-off-by: xhx1022 <1737006628@qq.com> --- vllm/v1/worker/gpu_model_runner.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 669308772787..1f26a1650838 100644 --- 
a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1629,9 +1629,6 @@ def _get_block_table_and_slot_mapping(kv_cache_gid: int): blk_table_tensor = blk_table.get_device_tensor(num_reqs_padded) slot_mapping = blk_table.slot_mapping.gpu[:num_tokens_padded] - if self.model_config.enable_return_routed_experts and kv_cache_gid == 0: - self.slot_mapping = slot_mapping.cpu().numpy() - # Fill unused with -1. Needed for reshape_and_cache in full cuda # graph mode. `blk_table_tensor` -1 to match mamba PAD_SLOT_ID slot_mapping[num_tokens:num_tokens_padded].fill_(-1) @@ -1640,6 +1637,8 @@ def _get_block_table_and_slot_mapping(kv_cache_gid: int): return blk_table_tensor, slot_mapping block_table_gid_0, slot_mapping_gid_0 = _get_block_table_and_slot_mapping(0) + if self.model_config.enable_return_routed_experts: + self.slot_mapping = slot_mapping_gid_0[:num_tokens].cpu().numpy() cm_base = CommonAttentionMetadata( query_start_loc=self.query_start_loc.gpu[: num_reqs_padded + 1], query_start_loc_cpu=self.query_start_loc.cpu[: num_reqs_padded + 1], From e52228c1c417236817a814c6eaffe1c94778ba19 Mon Sep 17 00:00:00 2001 From: arlenxu Date: Mon, 12 Jan 2026 14:17:14 +0800 Subject: [PATCH 28/33] fixed bug Signed-off-by: arlenxu --- vllm/config/vllm.py | 6 ++++ vllm/model_executor/layers/fused_moe/layer.py | 13 ++++---- vllm/v1/core/sched/scheduler.py | 33 +++++++++++-------- vllm/v1/worker/gpu_model_runner.py | 18 +++++----- 4 files changed, 42 insertions(+), 28 deletions(-) diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 63bfd056bf31..e5d3ece553a2 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -630,6 +630,12 @@ def __post_init__(self): scope="local", ) self.scheduler_config.async_scheduling = False + elif self.model_config.enable_return_routed_experts: + logger.warning( + "Async scheduling will be disabled because it is not supported " + "with enable_return_routed_experts=True." 
+ ) + self.scheduler_config.async_scheduling = False else: self.scheduler_config.async_scheduling = True diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index bfe5ea6171bd..a47c58fb489d 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1660,12 +1660,13 @@ def valid_grouping() -> bool: assert topk_ids.dtype == indices_type or indices_type is None - capturer = RoutedExpertsCapturer.get_instance() - if capturer is not None: - capturer.capture( # noqa - layer_id=self.layer_id, - topk_ids=topk_ids, - ) + if self.model_config.enable_return_routed_experts: + capturer = RoutedExpertsCapturer.get_instance() + if capturer is not None: + capturer.capture( # noqa + layer_id=self.layer_id, + topk_ids=topk_ids, + ) return topk_weights, topk_ids diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index c37d70cab06c..259b8c4d1f1c 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -228,21 +228,28 @@ def __init__( if self.log_stats and vllm_config.observability_config.enable_mfu_metrics: self.perf_metrics = ModelMetrics(vllm_config) - self.max_num_kv_tokens = ( - kv_cache_config.num_blocks // len(kv_cache_config.kv_cache_groups) + 1 - ) * self.block_size - - self.routed_experts_reader = RoutedExpertsReader.create( - enable=self.vllm_config.model_config.enable_return_routed_experts - ) + if self.model_config.enable_return_routed_experts: + assert self.dcp_world_size == 1 and self.pcp_world_size == 1, ( + "enable_return_routed_experts does not support context parallelism " + "(dcp_world_size > 1 or pcp_world_size > 1)" + ) - self.instance_id = self.vllm_config.instance_id + self.routed_experts_reader = RoutedExpertsReader.create( + enable=self.vllm_config.model_config.enable_return_routed_experts + ) - self.routed_experts_reader.attach_buffer( - max_num_kv_tokens=self.max_num_kv_tokens, - 
model_config=self.vllm_config.model_config, - instance_id=self.instance_id, - ) + assert len(kv_cache_config.kv_cache_groups) > 0, ( + "enable_return_routed_experts requires at least one kv cache group" + ) + self.max_num_kv_tokens = ( + kv_cache_config.num_blocks // len(kv_cache_config.kv_cache_groups) + 1 + ) * self.block_size + + self.routed_experts_reader.attach_buffer( + max_num_kv_tokens=self.max_num_kv_tokens, + model_config=self.vllm_config.model_config, + instance_id=self.vllm_config.instance_id, + ) def schedule(self) -> SchedulerOutput: # NOTE(woosuk) on the scheduling algorithm: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 1f26a1650838..3f4103871c05 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3118,11 +3118,12 @@ def execute_model( "after execute_model() returns None." ) - capturer = RoutedExpertsCapturer.get_instance() - if capturer is not None: - capturer.clear_buffer() # noqa - else: - logger.error("RoutedExpertsCapturer not initialized.") + if self.vllm_config.model_config.enable_return_routed_experts: + capturer = RoutedExpertsCapturer.get_instance() + if capturer is not None: + capturer.clear_buffer() # noqa + else: + logger.error("RoutedExpertsCapturer not initialized.") if scheduler_output.preempted_req_ids and has_kv_transfer_group(): get_kv_transfer_group().handle_preemptions( @@ -5668,7 +5669,8 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: kv_transfer_group.register_kv_caches(kv_caches) kv_transfer_group.set_host_xfer_buffer_ops(copy_kv_blocks) - self.init_routed_experts_capturer() + if self.model_config.enable_return_routed_experts: + self.init_routed_experts_capturer() def init_routed_experts_capturer(self): logger.info( @@ -5684,13 +5686,11 @@ def init_routed_experts_capturer(self): + 1 ) * block_size - self.instance_id = self.vllm_config.instance_id - routed_experts_capturer.init_buffer( 
max_num_batched_tokens=self.scheduler_config.max_num_batched_tokens, max_num_kv_tokens=self.max_num_kv_tokens, model_config=self.model_config, - instance_id=self.instance_id, + instance_id=self.vllm_config.instance_id, enable_shared_memory=get_tensor_model_parallel_rank() == 0, ) From 5dadff120e9677da9c181f2c5579e4f265b9bbfe Mon Sep 17 00:00:00 2001 From: xhx1022 <1737006628@qq.com> Date: Mon, 12 Jan 2026 14:31:32 +0800 Subject: [PATCH 29/33] fixed bug Signed-off-by: xhx1022 <1737006628@qq.com> --- vllm/v1/core/sched/scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 259b8c4d1f1c..563efe854f1f 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -228,7 +228,7 @@ def __init__( if self.log_stats and vllm_config.observability_config.enable_mfu_metrics: self.perf_metrics = ModelMetrics(vllm_config) - if self.model_config.enable_return_routed_experts: + if self.vllm_config.model_config.enable_return_routed_experts: assert self.dcp_world_size == 1 and self.pcp_world_size == 1, ( "enable_return_routed_experts does not support context parallelism " "(dcp_world_size > 1 or pcp_world_size > 1)" From 0f5e95f85899d8df54baa77ebfde352422c81466 Mon Sep 17 00:00:00 2001 From: xhx1022 <1737006628@qq.com> Date: Mon, 12 Jan 2026 16:28:22 +0800 Subject: [PATCH 30/33] Refactor Signed-off-by: xhx1022 <1737006628@qq.com> --- vllm/model_executor/layers/fused_moe/layer.py | 2 +- .../fused_moe/routed_experts_capturer.py | 484 +++++++++--------- vllm/v1/core/sched/scheduler.py | 4 +- vllm/v1/worker/gpu_model_runner.py | 4 +- 4 files changed, 237 insertions(+), 257 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index a47c58fb489d..fbce5953f43b 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1660,7 +1660,7 @@ def 
valid_grouping() -> bool: assert topk_ids.dtype == indices_type or indices_type is None - if self.model_config.enable_return_routed_experts: + if self.vllm_config.model_config.enable_return_routed_experts: capturer = RoutedExpertsCapturer.get_instance() if capturer is not None: capturer.capture( # noqa diff --git a/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py index d916caa572af..6c65ba408254 100644 --- a/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py +++ b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py @@ -3,61 +3,110 @@ # Adapted from # https://github.com/sgl-project/sglang/blob/bed301a5acaa9577c9aa706468bdf242f6a43051/python/sglang/srt/layers/moe/routed_experts_capturer.py +from __future__ import annotations + import fcntl import logging import os import tempfile -from abc import ABC, abstractmethod +from collections.abc import Generator +from contextlib import contextmanager from multiprocessing import shared_memory -from typing import Optional +from typing import TYPE_CHECKING from unittest.mock import patch import numpy as np import torch -from vllm.config import ModelConfig +if TYPE_CHECKING: + from vllm.config import ModelConfig logger = logging.getLogger(__name__) -tmp_dir = tempfile.gettempdir() -LOCK_FILE_PREFIX = os.path.join(tmp_dir, "vllm_routed_experts") # Shared lock file path -BUFFER_PREFIX = "vllm_routed_experts_buffer" +# Constants +_TMP_DIR = tempfile.gettempdir() +_LOCK_FILE_PREFIX = os.path.join(_TMP_DIR, "vllm_routed_experts") +_BUFFER_PREFIX = "vllm_routed_experts_buffer" +# Global singleton instances +_global_experts_capturer: RoutedExpertsCapturer | None = None +_global_experts_reader: RoutedExpertsReader | None = None -def lock_file(fp): - fcntl.flock(fp, fcntl.LOCK_EX) +@contextmanager +def _file_lock(lock_file: str, mode: str = "wb+") -> Generator[None, None, None]: + """Context manager for file-based locking.""" + with 
open(lock_file, mode) as fp: + fcntl.flock(fp, fcntl.LOCK_EX) + try: + yield + finally: + fcntl.flock(fp, fcntl.LOCK_UN) -def unlock_file(fp): - fcntl.flock(fp, fcntl.LOCK_UN) +def _create_or_attach_shared_memory( + name: str, size: int, lock_file: str +) -> shared_memory.SharedMemory: + """Create or attach to shared memory with proper locking.""" + # Ensure lock file exists before acquiring lock + with open(lock_file, "wb"): + pass -# Global singleton instances (annotated) -_global_experts_capturer: Optional["RoutedExpertsCapturer"] = None -_global_experts_reader: Optional["RoutedExpertsReader"] = None + with _file_lock(lock_file): + try: + shm = shared_memory.SharedMemory(name=name, create=True, size=size) + except FileExistsError: + shm = shared_memory.SharedMemory(name=name, create=False, size=size) + + if shm.size != size: + logger.warning( + "Shared memory %s size mismatch; recreating", + name, + ) + shm.close() + shm.unlink() + try: + shm = shared_memory.SharedMemory(name=name, create=True, size=size) + logger.info("Created shared memory %s", name) + except FileExistsError: + shm = shared_memory.SharedMemory(name=name, create=False, size=size) + logger.info("Linked to existing shared memory %s", name) + return shm -class RoutedExpertsCapturer(ABC): - """Abstract interface for capturer (host side).""" - @staticmethod - def create(enable: bool) -> "RoutedExpertsCapturer": - """Create a global singleton instance""" +class RoutedExpertsCapturer: + """ + Capturer for routed experts with device and optional shared memory buffer. + + This class captures expert routing decisions during model forward passes + and optionally stores them in shared memory for cross-process access. 
+ """ + + _instance: RoutedExpertsCapturer | None = None + + def __init__(self) -> None: + self._device_buffer: torch.Tensor | None = None + self._shm: shared_memory.SharedMemory | None = None + self._host_buffer_view: np.ndarray | None = None + self._lock_file: str | None = None + self._shm_name: str | None = None + + @classmethod + def create(cls) -> RoutedExpertsCapturer: + """Create a global singleton instance.""" global _global_experts_capturer if _global_experts_capturer is not None: raise RuntimeError("Experts capturer already created.") - if enable: - _global_experts_capturer = _RoutedExpertsCapturerReal() - else: - _global_experts_capturer = _RoutedExpertsCapturerNoop() + _global_experts_capturer = cls() return _global_experts_capturer @staticmethod - def get_instance() -> Optional["RoutedExpertsCapturer"]: + def get_instance() -> RoutedExpertsCapturer | None: + """Get the global singleton instance.""" return _global_experts_capturer - @abstractmethod def init_buffer( self, max_num_batched_tokens: int, @@ -66,276 +115,211 @@ def init_buffer( instance_id: str, enable_shared_memory: bool, ) -> None: - raise NotImplementedError + """ + Initialize the device buffer and optionally shared memory buffer. + + Args: + max_num_batched_tokens: Maximum number of tokens in a batch. + max_num_kv_tokens: Maximum number of KV tokens for shared memory. + model_config: Model configuration containing layer and expert info. + instance_id: Unique identifier for the shared memory buffer. + enable_shared_memory: Whether to enable shared memory for IPC. 
+ """ - @abstractmethod - def capture(self, layer_id: int, topk_ids: torch.Tensor) -> None: - raise NotImplementedError + if self._device_buffer is not None: + raise RuntimeError("Device buffer has already been initialized") - @abstractmethod - def clear_buffer(self) -> None: - raise NotImplementedError + hf_config = model_config.hf_text_config + num_layers = hf_config.num_hidden_layers + num_experts_per_tok = hf_config.num_experts_per_tok - @abstractmethod - def save_captured_experts(self, indices: np.ndarray) -> None: - raise NotImplementedError + # Initialize device buffer + self._device_buffer = torch.zeros( + (max_num_batched_tokens, num_layers, num_experts_per_tok), + dtype=torch.int32, + device="cuda", + ) + if not enable_shared_memory: + return -class _RoutedExpertsCapturerReal(RoutedExpertsCapturer): - """Capturer for routed experts with host buffer""" + # Initialize shared memory + shape = (max_num_kv_tokens, num_layers, num_experts_per_tok) + buffer_size = int(np.prod(shape)) * np.dtype(np.int32).itemsize - def __init__(self) -> None: - self._experts_capturer_device_buffer: torch.Tensor | None = None - self._shm: shared_memory.SharedMemory | None = None - self._host_buffer_view: np.ndarray | None = None + self._lock_file = f"{_LOCK_FILE_PREFIX}_{instance_id}.lock" + self._shm_name = f"{_BUFFER_PREFIX}_{instance_id}" - def init_buffer( - self, - max_num_batched_tokens: int, - max_num_kv_tokens: int, - model_config: ModelConfig, - instance_id: str, - enable_shared_memory: bool, - ) -> None: - if ( - model_config.enable_return_routed_experts - and self._experts_capturer_device_buffer is None - ): - self._experts_capturer_device_buffer = torch.zeros( - ( - max_num_batched_tokens, - model_config.hf_text_config.num_hidden_layers, - model_config.hf_text_config.num_experts_per_tok, - ), - dtype=torch.int32, - device="cuda", - ) + self._shm = _create_or_attach_shared_memory( + self._shm_name, buffer_size, self._lock_file + ) + self._host_buffer_view = 
np.ndarray(shape, dtype=np.int32, buffer=self._shm.buf) + self._host_buffer_view.fill(0) - if enable_shared_memory: - # Compute required shared memory size - shape = ( - max_num_kv_tokens, - model_config.hf_text_config.num_hidden_layers, - model_config.hf_text_config.num_experts_per_tok, - ) - self.dest_size = int(np.prod(shape)) * np.dtype(np.int32).itemsize - self.lock_file = f"{LOCK_FILE_PREFIX}_{instance_id}.lock" - self.shm_name = f"{BUFFER_PREFIX}_{instance_id}" - - with open(self.lock_file, "wb") as fp: - lock_file(fp) - try: - shm = shared_memory.SharedMemory( - name=self.shm_name, create=True, size=self.dest_size - ) - except FileExistsError: - shm = shared_memory.SharedMemory( - name=self.shm_name, create=False, size=self.dest_size - ) - - if shm.size != self.dest_size: - logger.warning( - "Shared memory %s size mismatch; recreate", - self.shm_name, - ) - shm.close() - shm.unlink() - try: - shm = shared_memory.SharedMemory( - name=self.shm_name, create=True, size=self.dest_size - ) - logger.info("Create shared memory %s", self.shm_name) - except FileExistsError: - shm = shared_memory.SharedMemory( - name=self.shm_name, create=False, size=self.dest_size - ) - logger.info("Link shared memory %s", self.shm_name) - - self._shm = shm - self._host_buffer_view = np.ndarray( - shape, dtype=np.int32, buffer=self._shm.buf - ) - # init 0 - self._host_buffer_view.fill(0) - unlock_file(fp) - - # parameterized logging (avoid f-strings in logging) - logger.debug( - "Created shared memory buffer '%s' with shape %s", - self._shm.name if self._shm is not None else "None", - shape, - ) - else: - self._shm = None - self._host_buffer_view = None + logger.debug( + "Created shared memory buffer '%s' with shape %s", + self._shm.name, + shape, + ) def capture(self, layer_id: int, topk_ids: torch.Tensor) -> None: - if self._experts_capturer_device_buffer is None: - raise RuntimeError("Buffer not initialized.") - batch_size, num_routed_experts = topk_ids.shape - if layer_id >= 
self._experts_capturer_device_buffer.shape[1]: + """ + Capture expert routing decisions for a specific layer. + + Args: + layer_id: The layer index. + topk_ids: Tensor of shape (batch_size, num_routed_experts). + """ + if self._device_buffer is None: + raise RuntimeError("Buffer not initialized. Call init_buffer() first.") + + if layer_id >= self._device_buffer.shape[1]: return - # copy into device buffer (ensure shapes are compatible) - self._experts_capturer_device_buffer[:batch_size, layer_id, :] = topk_ids + + batch_size = topk_ids.shape[0] + self._device_buffer[:batch_size, layer_id, :] = topk_ids def clear_buffer(self) -> None: - if self._experts_capturer_device_buffer is not None: - self._experts_capturer_device_buffer.zero_() + """Clear the device buffer.""" + if self._device_buffer is not None: + self._device_buffer.zero_() def save_captured_experts(self, indices: np.ndarray) -> None: - # Copy the entire batch from GPU to shared memory (via numpy view) - with open(self.lock_file, "wb+") as fp: - lock_file(fp) - try: - if self._host_buffer_view is not None: - num_tokens = len(indices) - # Ensure device buffer exists - if self._experts_capturer_device_buffer is None: - raise RuntimeError("Device buffer not initialized.") - data = ( - self._experts_capturer_device_buffer[:num_tokens, :, :] - .cpu() - .numpy() - ) - # indices should be valid for host buffer - self._host_buffer_view[indices, :, :] = data - finally: - unlock_file(fp) + """ + Save captured experts from device buffer to shared memory. - def __del__(self) -> None: - """Clean up shared memory""" - try: - if self._shm is not None: + Args: + indices: Array of indices indicating where to store the data. 
+ """ + if self._lock_file is None: + raise RuntimeError("Shared memory not initialized.") + if self._host_buffer_view is None: + return + if self._device_buffer is None: + raise RuntimeError("Device buffer not initialized.") + + num_tokens = len(indices) + data = self._device_buffer[:num_tokens, :, :].cpu().numpy() + + with _file_lock(self._lock_file): + self._host_buffer_view[indices, :, :] = data + + def cleanup(self) -> None: + """Explicitly clean up shared memory resources.""" + if self._shm is not None: + try: self._shm.close() - # Only creator should unlink self._shm.unlink() - except Exception: - # Avoid raising in destructor - logger.debug("Exception during __del__ cleanup for capturer", exc_info=True) - + except Exception: + logger.debug("Exception during cleanup for capturer", exc_info=True) + finally: + self._shm = None -class _RoutedExpertsCapturerNoop(RoutedExpertsCapturer): - def init_buffer( - self, - max_num_batched_tokens: int, - max_num_kv_tokens: int, - model_config: ModelConfig, - instance_id: str, - enable_shared_memory: bool, - ) -> None: - return None + def __del__(self) -> None: + """Clean up shared memory on destruction.""" + self.cleanup() - def capture(self, layer_id: int, topk_ids: torch.Tensor) -> None: - return None - def clear_buffer(self) -> None: - return None +class RoutedExpertsReader: + """ + Reader for routed experts from shared memory. - def save_captured_experts(self, indices: np.ndarray) -> None: - return None + This class attaches to shared memory created by RoutedExpertsCapturer + and reads expert routing decisions. 
+ """ + _instance: RoutedExpertsReader | None = None -class RoutedExpertsReader(ABC): - """Abstract interface for reader (worker side).""" + def __init__(self) -> None: + self._shm: shared_memory.SharedMemory | None = None + self._host_buffer_view: np.ndarray | None = None + self._lock_file: str | None = None - @staticmethod - def create(enable: bool) -> "RoutedExpertsReader": - """Create a global singleton instance""" + @classmethod + def create(cls) -> RoutedExpertsReader: + """Create a global singleton instance.""" global _global_experts_reader if _global_experts_reader is not None: - raise RuntimeError("Experts Reader already created.") + raise RuntimeError("Experts reader already created.") - if enable: - _global_experts_reader = _RoutedExpertsReaderReal() - else: - _global_experts_reader = _RoutedExpertsReaderNoop() + _global_experts_reader = cls() return _global_experts_reader @staticmethod - def get_instance() -> Optional["RoutedExpertsReader"]: + def get_instance() -> RoutedExpertsReader | None: + """Get the global singleton instance.""" if _global_experts_reader is None: logger.info("Experts reader not initialized.") return _global_experts_reader - @abstractmethod def attach_buffer( - self, max_num_kv_tokens: int, model_config: ModelConfig, instance_id: str + self, + max_num_kv_tokens: int, + model_config: ModelConfig, + instance_id: str, ) -> None: - raise NotImplementedError - - @abstractmethod - def get_routed_experts(self, indices: np.ndarray) -> np.ndarray | None: - raise NotImplementedError - - -class _RoutedExpertsReaderReal(RoutedExpertsReader): - """Reader class in worker process""" - - def __init__(self) -> None: - self._shm: shared_memory.SharedMemory | None = None - self._host_buffer_view: np.ndarray | None = None + """ + Attach to an existing shared memory buffer. 
- def attach_buffer( - self, max_num_kv_tokens: int, model_config: ModelConfig, instance_id: str - ) -> None: - if self._shm is None: - shape = ( - max_num_kv_tokens, - model_config.hf_text_config.num_hidden_layers, - model_config.hf_text_config.num_experts_per_tok, + Args: + max_num_kv_tokens: Maximum number of KV tokens. + model_config: Model configuration. + instance_id: Unique identifier for the shared memory buffer. + """ + if self._shm is not None: + logger.warning("Already attached to shared memory buffer.") + return # Already attached + + hf_config = model_config.hf_text_config + shape = ( + max_num_kv_tokens, + hf_config.num_hidden_layers, + hf_config.num_experts_per_tok, + ) + + self._lock_file = f"{_LOCK_FILE_PREFIX}_{instance_id}.lock" + shm_name = f"{_BUFFER_PREFIX}_{instance_id}" + + with _file_lock(self._lock_file, mode="rb+"): + # Avoid resource_tracker registering the shared memory + with patch( + "multiprocessing.resource_tracker.register", + lambda *args, **kwargs: None, + ): + self._shm = shared_memory.SharedMemory(name=shm_name) + + self._host_buffer_view = np.ndarray( + shape, dtype=np.int32, buffer=self._shm.buf ) - self.lock_file = f"{LOCK_FILE_PREFIX}_{instance_id}.lock" - - # Attach to existing shared memory - with open(self.lock_file, "rb+") as fp: - lock_file(fp) - try: - # avoid resource_tracker registering the shared memory - with patch( - "multiprocessing.resource_tracker.register", - lambda *args, **kwargs: None, - ): - # This will raise if the shared memory doesn't exist - self._shm = shared_memory.SharedMemory( - name=f"{BUFFER_PREFIX}_{instance_id}" - ) - - self._host_buffer_view = np.ndarray( - shape, dtype=np.int32, buffer=self._shm.buf - ) - finally: - unlock_file(fp) - - def get_routed_experts(self, indices: np.ndarray) -> np.ndarray | None: + def get_routed_experts(self, indices: np.ndarray) -> np.ndarray: """ - Read routed expert data from shared memory for the given request. + Read routed expert data from shared memory. 
+ + Args: + indices: Array of indices to read. + + Returns: + Copy of the expert routing data for the given indices. """ + if self._host_buffer_view is None: + raise RuntimeError("Buffer not attached. Call attach_buffer() first.") + if self._lock_file is None: + raise RuntimeError("Lock file not initialized.") + + with _file_lock(self._lock_file, mode="rb+"): + return self._host_buffer_view[indices, :, :].copy() - with open(self.lock_file, "rb+") as fp: - lock_file(fp) + def cleanup(self) -> None: + """Explicitly clean up resources (close without unlink).""" + if self._shm is not None: try: - if self._host_buffer_view is None: - raise RuntimeError("Buffer not attached.") - # Return a copy to avoid referencing shared memory buffer directly - return self._host_buffer_view[indices, :, :].copy() + self._shm.close() + except Exception: + logger.debug("Exception during cleanup for reader", exc_info=True) finally: - unlock_file(fp) + self._shm = None def __del__(self) -> None: - """Only close, do not delete shared memory""" - try: - if self._shm is not None: - self._shm.close() # Note: reader does not call unlink() - except Exception: - logger.debug("Exception during __del__ cleanup for reader", exc_info=True) - - -class _RoutedExpertsReaderNoop(RoutedExpertsReader): - def attach_buffer( - self, max_num_kv_tokens: int, model_config: ModelConfig, instance_id: str - ) -> None: - return None - - def get_routed_experts(self, indices: np.ndarray) -> np.ndarray | None: - return None + """Close shared memory on destruction (do not unlink).""" + self.cleanup() diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 563efe854f1f..bdd6d2a3cfa4 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -234,9 +234,7 @@ def __init__( "(dcp_world_size > 1 or pcp_world_size > 1)" ) - self.routed_experts_reader = RoutedExpertsReader.create( - enable=self.vllm_config.model_config.enable_return_routed_experts - ) + 
self.routed_experts_reader = RoutedExpertsReader.create() assert len(kv_cache_config.kv_cache_groups) > 0, ( "enable_return_routed_experts requires at least one kv cache group" diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 3f4103871c05..7569f2d6afe4 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -5677,9 +5677,7 @@ def init_routed_experts_capturer(self): "Initializing routed experts capturer, enable_return_routed_experts: %s", self.model_config.enable_return_routed_experts, ) - routed_experts_capturer = RoutedExpertsCapturer.create( - self.model_config.enable_return_routed_experts - ) + routed_experts_capturer = RoutedExpertsCapturer.create() block_size = self.cache_config.block_size self.max_num_kv_tokens = ( self.kv_cache_config.num_blocks // len(self.kv_cache_config.kv_cache_groups) From bdf799bb93d472c13a5c734f9c4535504294bef2 Mon Sep 17 00:00:00 2001 From: xhx1022 <1737006628@qq.com> Date: Mon, 12 Jan 2026 16:49:00 +0800 Subject: [PATCH 31/33] Refactor Signed-off-by: xhx1022 <1737006628@qq.com> --- .../layers/fused_moe/routed_experts_capturer.py | 11 +++++------ vllm/v1/worker/gpu_model_runner.py | 7 +------ 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py index 6c65ba408254..0fd788ea571e 100644 --- a/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py +++ b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py @@ -12,14 +12,13 @@ from collections.abc import Generator from contextlib import contextmanager from multiprocessing import shared_memory -from typing import TYPE_CHECKING from unittest.mock import patch import numpy as np import torch -if TYPE_CHECKING: - from vllm.config import ModelConfig +from vllm.config import ModelConfig +from vllm.distributed import get_tensor_model_parallel_rank logger = 
logging.getLogger(__name__) @@ -113,7 +112,6 @@ def init_buffer( max_num_kv_tokens: int, model_config: ModelConfig, instance_id: str, - enable_shared_memory: bool, ) -> None: """ Initialize the device buffer and optionally shared memory buffer. @@ -123,7 +121,6 @@ def init_buffer( max_num_kv_tokens: Maximum number of KV tokens for shared memory. model_config: Model configuration containing layer and expert info. instance_id: Unique identifier for the shared memory buffer. - enable_shared_memory: Whether to enable shared memory for IPC. """ if self._device_buffer is not None: @@ -140,7 +137,7 @@ def init_buffer( device="cuda", ) - if not enable_shared_memory: + if get_tensor_model_parallel_rank() != 0: return # Initialize shared memory @@ -191,6 +188,8 @@ def save_captured_experts(self, indices: np.ndarray) -> None: Args: indices: Array of indices indicating where to store the data. """ + if get_tensor_model_parallel_rank() != 0: + return if self._lock_file is None: raise RuntimeError("Shared memory not initialized.") if self._host_buffer_view is None: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 7569f2d6afe4..e08463c40dc6 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -31,7 +31,6 @@ get_layers_from_vllm_config, update_config, ) -from vllm.distributed import get_tensor_model_parallel_rank from vllm.distributed.ec_transfer import get_ec_transfer, has_ec_transfer from vllm.distributed.eplb.eplb_state import EplbState from vllm.distributed.kv_transfer import get_kv_transfer_group, has_kv_transfer_group @@ -3498,10 +3497,7 @@ def propose_draft_token_ids(sampled_token_ids): self.eplb_step() with record_function_or_nullcontext("gpu_model_runner: ModelRunnerOutput"): - if ( - self.model_config.enable_return_routed_experts - and get_tensor_model_parallel_rank() == 0 - ): + if self.model_config.enable_return_routed_experts: capturer = RoutedExpertsCapturer.get_instance() if capturer is 
not None: capturer.save_captured_experts(indices=self.slot_mapping) # noqa @@ -5689,7 +5685,6 @@ def init_routed_experts_capturer(self): max_num_kv_tokens=self.max_num_kv_tokens, model_config=self.model_config, instance_id=self.vllm_config.instance_id, - enable_shared_memory=get_tensor_model_parallel_rank() == 0, ) def may_add_encoder_only_layers_to_kv_cache_config(self) -> None: From bc172803b3544e6862b79b3e98c20bf622cc76bc Mon Sep 17 00:00:00 2001 From: xhx1022 <1737006628@qq.com> Date: Mon, 12 Jan 2026 17:03:30 +0800 Subject: [PATCH 32/33] async_sched Signed-off-by: xhx1022 <1737006628@qq.com> --- vllm/config/vllm.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index e5d3ece553a2..63bfd056bf31 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -630,12 +630,6 @@ def __post_init__(self): scope="local", ) self.scheduler_config.async_scheduling = False - elif self.model_config.enable_return_routed_experts: - logger.warning( - "Async scheduling will be disabled because it is not supported " - "with enable_return_routed_experts=True." 
-            )
-            self.scheduler_config.async_scheduling = False
         else:
             self.scheduler_config.async_scheduling = True
 

From aeb469eea2a2b110af7d5d0e53bb7a290abba5a5 Mon Sep 17 00:00:00 2001
From: xhx1022 <1737006628@qq.com>
Date: Mon, 12 Jan 2026 19:18:36 +0800
Subject: [PATCH 33/33] update code

Signed-off-by: xhx1022 <1737006628@qq.com>
---
 vllm/model_executor/layers/fused_moe/layer.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index fbce5953f43b..3b3a789f6d6e 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -1660,9 +1660,13 @@ def valid_grouping() -> bool:
 
         assert topk_ids.dtype == indices_type or indices_type is None
 
-        if self.vllm_config.model_config.enable_return_routed_experts:
+        if (
+            self.vllm_config.model_config is not None
+            and self.vllm_config.model_config.enable_return_routed_experts
+        ):
+            # In dummy runs, the capturer is not initialized.
             capturer = RoutedExpertsCapturer.get_instance()
-            if capturer is not None:
+            if capturer is not None:  # may be None during dummy runs
                 capturer.capture(  # noqa
                     layer_id=self.layer_id,
                     topk_ids=topk_ids,