Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions benchmarks/kernels/benchmark_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,10 @@
RoutingMethodType,
_get_config_dtype_str,
)
from vllm.model_executor.layers.fused_moe.fused_moe import *
from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
from vllm.model_executor.layers.fused_moe.experts.triton_deep_gemm_moe import (
TritonOrDeepGemmExperts,
)
from vllm.model_executor.layers.fused_moe.fused_moe import *
from vllm.transformers_utils.config import get_config
from vllm.triton_utils import triton
from vllm.utils.argparse_utils import FlexibleArgumentParser
Expand Down
4 changes: 2 additions & 2 deletions docs/design/moe_kernel_features.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ To be used with a particular `FusedMoEPrepareAndFinalizeModular` subclass, MoE k
| Kernel | Input act. format | Quant. types | Quant. format | Activation function | Apply Weight On Input | Modular | Source |
| ------ | ----------------- | ------------ | ------------- | ------------------- | --------------------- | ------- | ------ |
| triton | standard | all<sup>1</sup> | G,A,T | silu, gelu,</br>swigluoai,</br>silu_no_mul,</br>gelu_no_mul | Y | Y | [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts],</br>[`TritonExperts`][vllm.model_executor.layers.fused_moe.experts.triton_moe.TritonExperts] |
| triton (batched) | batched | all<sup>1</sup> | G,A,T | silu, gelu | <sup>6</sup> | Y | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedTritonExperts] |
| triton (batched) | batched | all<sup>1</sup> | G,A,T | silu, gelu | <sup>6</sup> | Y | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.experts.fused_batched_moe.BatchedTritonExperts] |
| deep gemm | standard,</br>batched | fp8 | G(128),A,T | silu, gelu | <sup>6</sup> | Y | </br>[`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe.DeepGemmExperts],</br>[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.experts.batched_deep_gemm_moe.BatchedDeepGemmExperts] |
| cutlass_fp4 | standard,</br>batched | nvfp4 | A,T | silu | Y | Y | [`CutlassExpertsFp4`][vllm.model_executor.layers.fused_moe.experts.cutlass_moe.CutlassExpertsFp4] |
| cutlass_fp8 | standard,</br>batched | fp8 | A,T | silu, gelu | Y | Y | [`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.experts.cutlass_moe.CutlassExpertsFp8],</br>[`CutlassBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.experts.cutlass_moe.CutlassBatchedExpertsFp8] |
Expand All @@ -91,7 +91,7 @@ To be used with a particular `FusedMoEPrepareAndFinalizeModular` subclass, MoE k
| trtllm | standard | mxfp4,</br>nvfp4 | G(16),G(32) | <sup>5</sup> | N | Y | [`TrtLlmMxfp4ExpertsMonolithic`][vllm.model_executor.layers.fused_moe.experts.trtllm_mxfp4_moe.TrtLlmMxfp4ExpertsMonolithic],</br>[`TrtLlmMxfp4ExpertsModular`][vllm.model_executor.layers.fused_moe.experts.trtllm_mxfp4_moe.TrtLlmMxfp4ExpertsModular],</br>[`TrtLlmNvFp4ExpertsMonolithic`][vllm.model_executor.layers.fused_moe.experts.trtllm_nvfp4_moe.TrtLlmNvFp4ExpertsMonolithic],</br>[`TrtLlmNvFp4ExpertsModular`][vllm.model_executor.layers.fused_moe.experts.trtllm_nvfp4_moe.TrtLlmNvFp4ExpertsModular] |
| rocm aiter moe | standard | mxfp4,</br>fp8 | G(32),G(128),A,T | silu, gelu,</br>swigluoai | Y | N | `rocm_aiter_fused_experts`,</br>`AiterExperts` |
| cpu_fused_moe | standard | N/A | N/A | silu | N | N | [`CPUFusedMOE`][vllm.model_executor.layers.fused_moe.cpu_fused_moe.CPUFusedMOE] |
| naive batched<sup>4</sup> | batched | int8,</br>fp8 | G,A,T | silu, gelu | <sup>6</sup> | Y | [`NaiveBatchedExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.NaiveBatchedExperts] |
| naive batched<sup>4</sup> | batched | int8,</br>fp8 | G,A,T | silu, gelu | <sup>6</sup> | Y | [`NaiveBatchedExperts`][vllm.model_executor.layers.fused_moe.experts.fused_batched_moe.NaiveBatchedExperts] |

!!! info "Table key"
1. All types: mxfp4, nvfp4, int4, int8, fp8
Expand Down
8 changes: 4 additions & 4 deletions tests/kernels/moe/modular_kernel_tools/mk_objects.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,16 @@
BatchedDeepGemmExperts,
)
from vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe import DeepGemmExperts
from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
from vllm.model_executor.layers.fused_moe.experts.fused_batched_moe import (
BatchedTritonExperts,
NaiveBatchedExperts,
)
from vllm.model_executor.layers.fused_moe.experts.triton_deep_gemm_moe import (
TritonOrDeepGemmExperts,
)
from vllm.model_executor.layers.fused_moe.prepare_finalize import (
MoEPrepareAndFinalizeNoDPEPModular,
)
from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
TritonOrDeepGemmExperts,
)
from vllm.model_executor.layers.quantization.utils.nvfp4_utils import (
cutlass_fp4_supported,
)
Expand Down
2 changes: 1 addition & 1 deletion tests/kernels/moe/test_batched_deepgemm.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from vllm.model_executor.layers.fused_moe.experts.batched_deep_gemm_moe import (
BatchedDeepGemmExperts,
)
from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
from vllm.model_executor.layers.fused_moe.experts.fused_batched_moe import (
BatchedTritonExperts,
)
from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEKernel
Expand Down
2 changes: 1 addition & 1 deletion tests/kernels/moe/test_batched_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from tests.kernels.utils import torch_experts
from vllm.config import VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe import fused_topk
from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
from vllm.model_executor.layers.fused_moe.experts.fused_batched_moe import (
invoke_moe_batched_triton_kernel,
)
from vllm.platforms import current_platform
Expand Down
2 changes: 1 addition & 1 deletion tests/kernels/moe/test_block_fp8.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
from vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe import (
_valid_deep_gemm_shape,
)
from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
from vllm.model_executor.layers.fused_moe.experts.triton_deep_gemm_moe import (
TritonOrDeepGemmExperts,
)
from vllm.platforms import current_platform
Expand Down
4 changes: 3 additions & 1 deletion tests/kernels/moe/test_deepep_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@
from vllm.model_executor.layers.fused_moe.config import (
FusedMoEQuantConfig,
)
from vllm.model_executor.layers.fused_moe.fused_batched_moe import BatchedTritonExperts
from vllm.model_executor.layers.fused_moe.experts.fused_batched_moe import (
BatchedTritonExperts,
)
from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEKernel
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
per_token_group_quant_fp8,
Expand Down
4 changes: 2 additions & 2 deletions tests/kernels/moe/test_deepgemm.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,10 @@
FusedMoEQuantDesc,
fp8_w8a8_moe_quant_config,
)
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts
from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
from vllm.model_executor.layers.fused_moe.experts.triton_deep_gemm_moe import (
TritonOrDeepGemmExperts,
)
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
per_token_group_quant_fp8,
)
Expand Down
8 changes: 4 additions & 4 deletions tests/kernels/moe/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,13 @@
FusedMoEQuantConfig,
RoutingMethodType,
)
from vllm.model_executor.layers.fused_moe.experts.triton_moe import (
TritonExperts,
)
from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
from vllm.model_executor.layers.fused_moe.experts.fused_batched_moe import (
BatchedTritonExperts,
NaiveBatchedExperts,
)
from vllm.model_executor.layers.fused_moe.experts.triton_moe import (
TritonExperts,
)
from vllm.model_executor.layers.fused_moe.fused_moe import (
fused_experts,
)
Expand Down
2 changes: 1 addition & 1 deletion vllm/lora/layers/fused_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@
from vllm.distributed.utils import divide
from vllm.lora.layers.base import BaseLayerWithLoRA
from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.fused_moe.experts.lora_context import MoELoRAContext
from vllm.model_executor.layers.fused_moe.fused_moe_modular_method import (
FusedMoEModularMethod,
)
from vllm.model_executor.layers.fused_moe.lora_context import MoELoRAContext
from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEKernel
from vllm.model_executor.layers.fused_moe.prepare_finalize import (
MoEPrepareAndFinalizeNoDPEPModular,
Expand Down
12 changes: 6 additions & 6 deletions vllm/model_executor/layers/fused_moe/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,15 @@ def get_config() -> dict[str, Any] | None:
from vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe import (
DeepGemmExperts,
)
from vllm.model_executor.layers.fused_moe.experts.fused_batched_moe import (
BatchedTritonExperts,
)
from vllm.model_executor.layers.fused_moe.experts.rocm_aiter_moe import (
AiterExperts,
)
from vllm.model_executor.layers.fused_moe.experts.triton_deep_gemm_moe import (
TritonOrDeepGemmExperts,
)
from vllm.model_executor.layers.fused_moe.experts.triton_moe import (
TritonExperts,
TritonWNA16Experts,
Expand All @@ -97,9 +103,6 @@ def get_config() -> dict[str, Any] | None:
XPUExpertsFp8,
XPUExpertsMXFp4,
)
from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
BatchedTritonExperts,
)
from vllm.model_executor.layers.fused_moe.fused_moe import (
fused_experts,
get_config_file_name,
Expand All @@ -110,9 +113,6 @@ def get_config() -> dict[str, Any] | None:
from vllm.model_executor.layers.fused_moe.router.grouped_topk_router import (
GroupedTopk,
)
from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
TritonOrDeepGemmExperts,
)

__all__ += [
"AiterExperts",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@
FusedMoEQuantConfig,
RoutingMethodType,
)
from vllm.model_executor.layers.fused_moe.lora_experts_mixin import LoRAExpertsMixin
from vllm.model_executor.layers.fused_moe.experts.lora_experts_mixin import (
LoRAExpertsMixin,
)
from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
TopKWeightAndReduceNoOP,
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import torch

from vllm.model_executor.layers.fused_moe.lora_context import MoELoRAContext
from vllm.model_executor.layers.fused_moe.experts.lora_context import MoELoRAContext


class LoRAExpertsMixin:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@
FusedMoEParallelConfig,
FusedMoEQuantConfig,
)
from vllm.model_executor.layers.fused_moe.lora_experts_mixin import LoRAExpertsMixin
from vllm.model_executor.layers.fused_moe.experts.lora_experts_mixin import (
LoRAExpertsMixin,
)
from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
batched_moe_align_block_size,
moe_align_block_size,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
FusedMoEQuantConfig,
)
from vllm.model_executor.layers.fused_moe.experts.cutlass_moe import CutlassExpertsFp8
from vllm.model_executor.layers.fused_moe.experts.fallback import FallbackExperts
from vllm.model_executor.layers.fused_moe.experts.triton_moe import TritonExperts
from vllm.model_executor.layers.fused_moe.fallback import FallbackExperts
from vllm.platforms import current_platform


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
_valid_deep_gemm,
_valid_deep_gemm_shape,
)
from vllm.model_executor.layers.fused_moe.experts.fallback import FallbackExperts
from vllm.model_executor.layers.fused_moe.experts.triton_moe import TritonExperts
from vllm.model_executor.layers.fused_moe.fallback import FallbackExperts
from vllm.utils.deep_gemm import (
is_deep_gemm_e8m0_used,
)
Expand Down
6 changes: 3 additions & 3 deletions vllm/model_executor/layers/fused_moe/experts/triton_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,15 @@
FusedMoEParallelConfig,
FusedMoEQuantConfig,
)
from vllm.model_executor.layers.fused_moe.experts.lora_experts_mixin import (
LoRAExpertsMixin,
)
from vllm.model_executor.layers.fused_moe.fused_moe import (
_prepare_expert_assignment,
invoke_fused_moe_triton_kernel,
invoke_fused_moe_wna16_triton_kernel,
try_get_optimal_moe_config,
)
from vllm.model_executor.layers.fused_moe.lora_experts_mixin import (
LoRAExpertsMixin,
)
from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
moe_align_block_size,
)
Expand Down
6 changes: 3 additions & 3 deletions vllm/model_executor/layers/fused_moe/oracle/fp8.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ def backend_to_kernel_cls(
return [FlashInferExperts]

elif backend == Fp8MoeBackend.DEEPGEMM:
from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
from vllm.model_executor.layers.fused_moe.experts.triton_deep_gemm_moe import (
TritonOrDeepGemmExperts,
)

Expand Down Expand Up @@ -158,7 +158,7 @@ def backend_to_kernel_cls(
return [TritonExperts]

elif backend == Fp8MoeBackend.BATCHED_TRITON:
from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
from vllm.model_executor.layers.fused_moe.experts.fused_batched_moe import (
BatchedTritonExperts,
)

Expand All @@ -172,7 +172,7 @@ def backend_to_kernel_cls(
return [AiterExperts]

elif backend == Fp8MoeBackend.VLLM_CUTLASS:
from vllm.model_executor.layers.fused_moe.triton_cutlass_moe import (
from vllm.model_executor.layers.fused_moe.experts.triton_cutlass_moe import (
TritonOrCutlassExperts,
)

Expand Down
2 changes: 1 addition & 1 deletion vllm/model_executor/layers/fused_moe/oracle/mxfp4.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ def backend_to_kernel_cls(
return [UnfusedOAITritonExperts]

elif backend == Mxfp4MoeBackend.HUMMING:
from vllm.model_executor.layers.fused_moe.fused_humming_moe import (
from vllm.model_executor.layers.fused_moe.experts.fused_humming_moe import (
BatchedHummingGroupedExperts,
HummingGroupedExperts,
HummingIndexedExperts,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ def backend_to_kernel_cls(
return TritonExperts

elif backend == UnquantizedMoeBackend.BATCHED_TRITON:
from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
from vllm.model_executor.layers.fused_moe.experts.fused_batched_moe import (
BatchedTritonExperts,
)

Expand Down
4 changes: 2 additions & 2 deletions vllm/model_executor/layers/quantization/humming.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@
)
from humming.utils.weight import quantize_weight

from vllm.model_executor.layers.fused_moe.fused_humming_moe import (
from vllm.model_executor.layers.fused_moe.experts.fused_humming_moe import (
BatchedHummingGroupedExperts,
HummingGroupedExperts,
HummingIndexedExperts,
Expand Down Expand Up @@ -182,7 +182,7 @@ def compressed_tensors_get_config(config: dict[str, Any], key: str):


class HummingConfig(QuantizationConfig):
packed_modules_mapping = {}
packed_modules_mapping: dict[str, list[str]] = {}

def __init__(self, full_config: dict[str, Any] | None = None):
assert_humming_available()
Expand Down
4 changes: 2 additions & 2 deletions vllm/model_executor/warmup/deep_gemm_warmup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@
from vllm.distributed.parallel_state import get_dp_group, is_global_first_rank
from vllm.model_executor.layers.fused_moe.deep_gemm_utils import compute_aligned_M
from vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe import DeepGemmExperts
from vllm.model_executor.layers.fused_moe.layer import FusedMoE
from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
from vllm.model_executor.layers.fused_moe.experts.triton_deep_gemm_moe import (
TritonOrDeepGemmExperts,
)
from vllm.model_executor.layers.fused_moe.layer import FusedMoE
from vllm.model_executor.layers.linear import LinearBase
from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod
from vllm.model_executor.layers.quantization.online.mxfp8 import Mxfp8OnlineLinearMethod
Expand Down
Loading