diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index 65bc38c6c755..4463a23772ee 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -27,10 +27,10 @@
RoutingMethodType,
_get_config_dtype_str,
)
-from vllm.model_executor.layers.fused_moe.fused_moe import *
-from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
+from vllm.model_executor.layers.fused_moe.experts.triton_deep_gemm_moe import (
TritonOrDeepGemmExperts,
)
+from vllm.model_executor.layers.fused_moe.fused_moe import *
from vllm.transformers_utils.config import get_config
from vllm.triton_utils import triton
from vllm.utils.argparse_utils import FlexibleArgumentParser
diff --git a/docs/design/moe_kernel_features.md b/docs/design/moe_kernel_features.md
index 2bbadd8ff813..1a11c6685a45 100644
--- a/docs/design/moe_kernel_features.md
+++ b/docs/design/moe_kernel_features.md
@@ -81,7 +81,7 @@ To be used with a particular `FusedMoEPrepareAndFinalizeModular` subclass, MoE k
| Kernel | Input act. format | Quant. types | Quant. format | Activation function | Apply Weight On Input | Modular | Source |
| ------ | ----------------- | ------------ | ------------- | ------------------- | --------------------- | ------- | ------ |
| triton | standard | all1 | G,A,T | silu, gelu,swigluoai,silu_no_mul,gelu_no_mul | Y | Y | [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts],[`TritonExperts`][vllm.model_executor.layers.fused_moe.experts.triton_moe.TritonExperts] |
-| triton (batched) | batched | all1 | G,A,T | silu, gelu | 6 | Y | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedTritonExperts] |
+| triton (batched) | batched | all1 | G,A,T | silu, gelu | 6 | Y | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.experts.fused_batched_moe.BatchedTritonExperts] |
| deep gemm | standard,batched | fp8 | G(128),A,T | silu, gelu | 6 | Y | [`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe.DeepGemmExperts],[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.experts.batched_deep_gemm_moe.BatchedDeepGemmExperts] |
| cutlass_fp4 | standard,batched | nvfp4 | A,T | silu | Y | Y | [`CutlassExpertsFp4`][vllm.model_executor.layers.fused_moe.experts.cutlass_moe.CutlassExpertsFp4] |
| cutlass_fp8 | standard,batched | fp8 | A,T | silu, gelu | Y | Y | [`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.experts.cutlass_moe.CutlassExpertsFp8],[`CutlasBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.experts.cutlass_moe.CutlassBatchedExpertsFp8] |
@@ -91,7 +91,7 @@ To be used with a particular `FusedMoEPrepareAndFinalizeModular` subclass, MoE k
| trtllm | standard | mxfp4,nvfp4 | G(16),G(32) | 5 | N | Y | [`TrtLlmMxfp4ExpertsMonolithic`][vllm.model_executor.layers.fused_moe.experts.trtllm_mxfp4_moe.TrtLlmMxfp4ExpertsMonolithic],[`TrtLlmMxfp4ExpertsModular`][vllm.model_executor.layers.fused_moe.experts.trtllm_mxfp4_moe.TrtLlmMxfp4ExpertsModular],[`TrtLlmNvFp4ExpertsMonolithic`][vllm.model_executor.layers.fused_moe.experts.trtllm_nvfp4_moe.TrtLlmNvFp4ExpertsMonolithic],[`TrtLlmNvfp4ExpertsModular`][vllm.model_executor.layers.fused_moe.experts.trtllm_nvfp4_moe.TrtLlmNvFp4ExpertsModular] |
| rocm aiter moe | standard | mxfp4,fp8 | G(32),G(128),A,T | silu, gelu,swigluoai | Y | N | `rocm_aiter_fused_experts`,`AiterExperts` |
| cpu_fused_moe | standard | N/A | N/A | silu | N | N | [`CPUFusedMOE`][vllm.model_executor.layers.fused_moe.cpu_fused_moe.CPUFusedMOE] |
-| naive batched4 | batched | int8,fp8 | G,A,T | silu, gelu | 6 | Y | [`NaiveBatchedExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.NaiveBatchedExperts] |
+| naive batched4 | batched | int8,fp8 | G,A,T | silu, gelu | 6 | Y | [`NaiveBatchedExperts`][vllm.model_executor.layers.fused_moe.experts.fused_batched_moe.NaiveBatchedExperts] |
!!! info "Table key"
1. All types: mxfp4, nvfp4, int4, int8, fp8
diff --git a/tests/kernels/moe/modular_kernel_tools/mk_objects.py b/tests/kernels/moe/modular_kernel_tools/mk_objects.py
index 111975cf6bae..5d3572b7caa2 100644
--- a/tests/kernels/moe/modular_kernel_tools/mk_objects.py
+++ b/tests/kernels/moe/modular_kernel_tools/mk_objects.py
@@ -15,16 +15,16 @@
BatchedDeepGemmExperts,
)
from vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe import DeepGemmExperts
-from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
+from vllm.model_executor.layers.fused_moe.experts.fused_batched_moe import (
BatchedTritonExperts,
NaiveBatchedExperts,
)
+from vllm.model_executor.layers.fused_moe.experts.triton_deep_gemm_moe import (
+ TritonOrDeepGemmExperts,
+)
from vllm.model_executor.layers.fused_moe.prepare_finalize import (
MoEPrepareAndFinalizeNoDPEPModular,
)
-from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
- TritonOrDeepGemmExperts,
-)
from vllm.model_executor.layers.quantization.utils.nvfp4_utils import (
cutlass_fp4_supported,
)
diff --git a/tests/kernels/moe/test_batched_deepgemm.py b/tests/kernels/moe/test_batched_deepgemm.py
index 4c8b2d87d61f..ab5672f83203 100644
--- a/tests/kernels/moe/test_batched_deepgemm.py
+++ b/tests/kernels/moe/test_batched_deepgemm.py
@@ -9,7 +9,7 @@
from vllm.model_executor.layers.fused_moe.experts.batched_deep_gemm_moe import (
BatchedDeepGemmExperts,
)
-from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
+from vllm.model_executor.layers.fused_moe.experts.fused_batched_moe import (
BatchedTritonExperts,
)
from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEKernel
diff --git a/tests/kernels/moe/test_batched_moe.py b/tests/kernels/moe/test_batched_moe.py
index d78e1947fac0..b9fe8ceafcdd 100644
--- a/tests/kernels/moe/test_batched_moe.py
+++ b/tests/kernels/moe/test_batched_moe.py
@@ -16,7 +16,7 @@
from tests.kernels.utils import torch_experts
from vllm.config import VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe import fused_topk
-from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
+from vllm.model_executor.layers.fused_moe.experts.fused_batched_moe import (
invoke_moe_batched_triton_kernel,
)
from vllm.platforms import current_platform
diff --git a/tests/kernels/moe/test_block_fp8.py b/tests/kernels/moe/test_block_fp8.py
index 0181fc252476..d11573bd31d4 100644
--- a/tests/kernels/moe/test_block_fp8.py
+++ b/tests/kernels/moe/test_block_fp8.py
@@ -31,7 +31,7 @@
from vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe import (
_valid_deep_gemm_shape,
)
-from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
+from vllm.model_executor.layers.fused_moe.experts.triton_deep_gemm_moe import (
TritonOrDeepGemmExperts,
)
from vllm.platforms import current_platform
diff --git a/tests/kernels/moe/test_deepep_moe.py b/tests/kernels/moe/test_deepep_moe.py
index d04c3c99ce77..fd6dda384c11 100644
--- a/tests/kernels/moe/test_deepep_moe.py
+++ b/tests/kernels/moe/test_deepep_moe.py
@@ -19,7 +19,9 @@
from vllm.model_executor.layers.fused_moe.config import (
FusedMoEQuantConfig,
)
-from vllm.model_executor.layers.fused_moe.fused_batched_moe import BatchedTritonExperts
+from vllm.model_executor.layers.fused_moe.experts.fused_batched_moe import (
+ BatchedTritonExperts,
+)
from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEKernel
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
per_token_group_quant_fp8,
diff --git a/tests/kernels/moe/test_deepgemm.py b/tests/kernels/moe/test_deepgemm.py
index fd05759ac3df..9095bbc2c76c 100644
--- a/tests/kernels/moe/test_deepgemm.py
+++ b/tests/kernels/moe/test_deepgemm.py
@@ -25,10 +25,10 @@
FusedMoEQuantDesc,
fp8_w8a8_moe_quant_config,
)
-from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts
-from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
+from vllm.model_executor.layers.fused_moe.experts.triton_deep_gemm_moe import (
TritonOrDeepGemmExperts,
)
+from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
per_token_group_quant_fp8,
)
diff --git a/tests/kernels/moe/utils.py b/tests/kernels/moe/utils.py
index de3a456fb449..acb2c21b3896 100644
--- a/tests/kernels/moe/utils.py
+++ b/tests/kernels/moe/utils.py
@@ -17,13 +17,13 @@
FusedMoEQuantConfig,
RoutingMethodType,
)
-from vllm.model_executor.layers.fused_moe.experts.triton_moe import (
- TritonExperts,
-)
-from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
+from vllm.model_executor.layers.fused_moe.experts.fused_batched_moe import (
BatchedTritonExperts,
NaiveBatchedExperts,
)
+from vllm.model_executor.layers.fused_moe.experts.triton_moe import (
+ TritonExperts,
+)
from vllm.model_executor.layers.fused_moe.fused_moe import (
fused_experts,
)
diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py
index 8cb32f07965b..abc2f84debb2 100644
--- a/vllm/lora/layers/fused_moe.py
+++ b/vllm/lora/layers/fused_moe.py
@@ -10,10 +10,10 @@
from vllm.distributed.utils import divide
from vllm.lora.layers.base import BaseLayerWithLoRA
from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.fused_moe.experts.lora_context import MoELoRAContext
from vllm.model_executor.layers.fused_moe.fused_moe_modular_method import (
FusedMoEModularMethod,
)
-from vllm.model_executor.layers.fused_moe.lora_context import MoELoRAContext
from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEKernel
from vllm.model_executor.layers.fused_moe.prepare_finalize import (
MoEPrepareAndFinalizeNoDPEPModular,
diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py
index aeb704e18d86..9938dc0dd53d 100644
--- a/vllm/model_executor/layers/fused_moe/__init__.py
+++ b/vllm/model_executor/layers/fused_moe/__init__.py
@@ -85,9 +85,15 @@ def get_config() -> dict[str, Any] | None:
from vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe import (
DeepGemmExperts,
)
+ from vllm.model_executor.layers.fused_moe.experts.fused_batched_moe import (
+ BatchedTritonExperts,
+ )
from vllm.model_executor.layers.fused_moe.experts.rocm_aiter_moe import (
AiterExperts,
)
+ from vllm.model_executor.layers.fused_moe.experts.triton_deep_gemm_moe import (
+ TritonOrDeepGemmExperts,
+ )
from vllm.model_executor.layers.fused_moe.experts.triton_moe import (
TritonExperts,
TritonWNA16Experts,
@@ -97,9 +103,6 @@ def get_config() -> dict[str, Any] | None:
XPUExpertsFp8,
XPUExpertsMXFp4,
)
- from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
- BatchedTritonExperts,
- )
from vllm.model_executor.layers.fused_moe.fused_moe import (
fused_experts,
get_config_file_name,
@@ -110,9 +113,6 @@ def get_config() -> dict[str, Any] | None:
from vllm.model_executor.layers.fused_moe.router.grouped_topk_router import (
GroupedTopk,
)
- from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
- TritonOrDeepGemmExperts,
- )
__all__ += [
"AiterExperts",
diff --git a/vllm/model_executor/layers/fused_moe/fallback.py b/vllm/model_executor/layers/fused_moe/experts/fallback.py
similarity index 100%
rename from vllm/model_executor/layers/fused_moe/fallback.py
rename to vllm/model_executor/layers/fused_moe/experts/fallback.py
diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/experts/fused_batched_moe.py
similarity index 100%
rename from vllm/model_executor/layers/fused_moe/fused_batched_moe.py
rename to vllm/model_executor/layers/fused_moe/experts/fused_batched_moe.py
diff --git a/vllm/model_executor/layers/fused_moe/fused_humming_moe.py b/vllm/model_executor/layers/fused_moe/experts/fused_humming_moe.py
similarity index 100%
rename from vllm/model_executor/layers/fused_moe/fused_humming_moe.py
rename to vllm/model_executor/layers/fused_moe/experts/fused_humming_moe.py
diff --git a/vllm/model_executor/layers/fused_moe/experts/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/experts/gpt_oss_triton_kernels_moe.py
index e10514debd08..88ea58a9eca7 100644
--- a/vllm/model_executor/layers/fused_moe/experts/gpt_oss_triton_kernels_moe.py
+++ b/vllm/model_executor/layers/fused_moe/experts/gpt_oss_triton_kernels_moe.py
@@ -14,7 +14,9 @@
FusedMoEQuantConfig,
RoutingMethodType,
)
-from vllm.model_executor.layers.fused_moe.lora_experts_mixin import LoRAExpertsMixin
+from vllm.model_executor.layers.fused_moe.experts.lora_experts_mixin import (
+ LoRAExpertsMixin,
+)
from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
TopKWeightAndReduceNoOP,
)
diff --git a/vllm/model_executor/layers/fused_moe/lora_context.py b/vllm/model_executor/layers/fused_moe/experts/lora_context.py
similarity index 100%
rename from vllm/model_executor/layers/fused_moe/lora_context.py
rename to vllm/model_executor/layers/fused_moe/experts/lora_context.py
diff --git a/vllm/model_executor/layers/fused_moe/lora_experts_mixin.py b/vllm/model_executor/layers/fused_moe/experts/lora_experts_mixin.py
similarity index 97%
rename from vllm/model_executor/layers/fused_moe/lora_experts_mixin.py
rename to vllm/model_executor/layers/fused_moe/experts/lora_experts_mixin.py
index 10707b91b70e..2a680909d5f6 100644
--- a/vllm/model_executor/layers/fused_moe/lora_experts_mixin.py
+++ b/vllm/model_executor/layers/fused_moe/experts/lora_experts_mixin.py
@@ -3,7 +3,7 @@
import torch
-from vllm.model_executor.layers.fused_moe.lora_context import MoELoRAContext
+from vllm.model_executor.layers.fused_moe.experts.lora_context import MoELoRAContext
class LoRAExpertsMixin:
diff --git a/vllm/model_executor/layers/fused_moe/experts/marlin_moe.py b/vllm/model_executor/layers/fused_moe/experts/marlin_moe.py
index 621b33dd8f30..2e831f0e2fbc 100644
--- a/vllm/model_executor/layers/fused_moe/experts/marlin_moe.py
+++ b/vllm/model_executor/layers/fused_moe/experts/marlin_moe.py
@@ -17,7 +17,9 @@
FusedMoEParallelConfig,
FusedMoEQuantConfig,
)
-from vllm.model_executor.layers.fused_moe.lora_experts_mixin import LoRAExpertsMixin
+from vllm.model_executor.layers.fused_moe.experts.lora_experts_mixin import (
+ LoRAExpertsMixin,
+)
from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
batched_moe_align_block_size,
moe_align_block_size,
diff --git a/vllm/model_executor/layers/fused_moe/triton_cutlass_moe.py b/vllm/model_executor/layers/fused_moe/experts/triton_cutlass_moe.py
similarity index 96%
rename from vllm/model_executor/layers/fused_moe/triton_cutlass_moe.py
rename to vllm/model_executor/layers/fused_moe/experts/triton_cutlass_moe.py
index ee3622b4ebe3..1753d7354cba 100644
--- a/vllm/model_executor/layers/fused_moe/triton_cutlass_moe.py
+++ b/vllm/model_executor/layers/fused_moe/experts/triton_cutlass_moe.py
@@ -11,8 +11,8 @@
FusedMoEQuantConfig,
)
from vllm.model_executor.layers.fused_moe.experts.cutlass_moe import CutlassExpertsFp8
+from vllm.model_executor.layers.fused_moe.experts.fallback import FallbackExperts
from vllm.model_executor.layers.fused_moe.experts.triton_moe import TritonExperts
-from vllm.model_executor.layers.fused_moe.fallback import FallbackExperts
from vllm.platforms import current_platform
diff --git a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/experts/triton_deep_gemm_moe.py
similarity index 97%
rename from vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
rename to vllm/model_executor/layers/fused_moe/experts/triton_deep_gemm_moe.py
index fbcf030a9256..58700d714857 100644
--- a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/experts/triton_deep_gemm_moe.py
@@ -14,8 +14,8 @@
_valid_deep_gemm,
_valid_deep_gemm_shape,
)
+from vllm.model_executor.layers.fused_moe.experts.fallback import FallbackExperts
from vllm.model_executor.layers.fused_moe.experts.triton_moe import TritonExperts
-from vllm.model_executor.layers.fused_moe.fallback import FallbackExperts
from vllm.utils.deep_gemm import (
is_deep_gemm_e8m0_used,
)
diff --git a/vllm/model_executor/layers/fused_moe/experts/triton_moe.py b/vllm/model_executor/layers/fused_moe/experts/triton_moe.py
index e99f307ec0ec..b9b046aa17ab 100644
--- a/vllm/model_executor/layers/fused_moe/experts/triton_moe.py
+++ b/vllm/model_executor/layers/fused_moe/experts/triton_moe.py
@@ -12,15 +12,15 @@
FusedMoEParallelConfig,
FusedMoEQuantConfig,
)
+from vllm.model_executor.layers.fused_moe.experts.lora_experts_mixin import (
+ LoRAExpertsMixin,
+)
from vllm.model_executor.layers.fused_moe.fused_moe import (
_prepare_expert_assignment,
invoke_fused_moe_triton_kernel,
invoke_fused_moe_wna16_triton_kernel,
try_get_optimal_moe_config,
)
-from vllm.model_executor.layers.fused_moe.lora_experts_mixin import (
- LoRAExpertsMixin,
-)
from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
moe_align_block_size,
)
diff --git a/vllm/model_executor/layers/fused_moe/oracle/fp8.py b/vllm/model_executor/layers/fused_moe/oracle/fp8.py
index 1ce5a1e3213b..0fa706d8daaf 100644
--- a/vllm/model_executor/layers/fused_moe/oracle/fp8.py
+++ b/vllm/model_executor/layers/fused_moe/oracle/fp8.py
@@ -130,7 +130,7 @@ def backend_to_kernel_cls(
return [FlashInferExperts]
elif backend == Fp8MoeBackend.DEEPGEMM:
- from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
+ from vllm.model_executor.layers.fused_moe.experts.triton_deep_gemm_moe import (
TritonOrDeepGemmExperts,
)
@@ -158,7 +158,7 @@ def backend_to_kernel_cls(
return [TritonExperts]
elif backend == Fp8MoeBackend.BATCHED_TRITON:
- from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
+ from vllm.model_executor.layers.fused_moe.experts.fused_batched_moe import (
BatchedTritonExperts,
)
@@ -172,7 +172,7 @@ def backend_to_kernel_cls(
return [AiterExperts]
elif backend == Fp8MoeBackend.VLLM_CUTLASS:
- from vllm.model_executor.layers.fused_moe.triton_cutlass_moe import (
+ from vllm.model_executor.layers.fused_moe.experts.triton_cutlass_moe import (
TritonOrCutlassExperts,
)
diff --git a/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py b/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py
index 4de0d722d50b..1820e3463621 100644
--- a/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py
+++ b/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py
@@ -147,7 +147,7 @@ def backend_to_kernel_cls(
return [UnfusedOAITritonExperts]
elif backend == Mxfp4MoeBackend.HUMMING:
- from vllm.model_executor.layers.fused_moe.fused_humming_moe import (
+ from vllm.model_executor.layers.fused_moe.experts.fused_humming_moe import (
BatchedHummingGroupedExperts,
HummingGroupedExperts,
HummingIndexedExperts,
diff --git a/vllm/model_executor/layers/fused_moe/oracle/unquantized.py b/vllm/model_executor/layers/fused_moe/oracle/unquantized.py
index f1fd196c09c1..9215c960a8c9 100644
--- a/vllm/model_executor/layers/fused_moe/oracle/unquantized.py
+++ b/vllm/model_executor/layers/fused_moe/oracle/unquantized.py
@@ -116,7 +116,7 @@ def backend_to_kernel_cls(
return TritonExperts
elif backend == UnquantizedMoeBackend.BATCHED_TRITON:
- from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
+ from vllm.model_executor.layers.fused_moe.experts.fused_batched_moe import (
BatchedTritonExperts,
)
diff --git a/vllm/model_executor/layers/quantization/humming.py b/vllm/model_executor/layers/quantization/humming.py
index 79a1057c6003..063f91203267 100644
--- a/vllm/model_executor/layers/quantization/humming.py
+++ b/vllm/model_executor/layers/quantization/humming.py
@@ -58,7 +58,7 @@
)
from humming.utils.weight import quantize_weight
- from vllm.model_executor.layers.fused_moe.fused_humming_moe import (
+ from vllm.model_executor.layers.fused_moe.experts.fused_humming_moe import (
BatchedHummingGroupedExperts,
HummingGroupedExperts,
HummingIndexedExperts,
@@ -182,7 +182,7 @@ def compressed_tensors_get_config(config: dict[str, Any], key: str):
class HummingConfig(QuantizationConfig):
- packed_modules_mapping = {}
+ packed_modules_mapping: dict[str, list[str]] = {}
def __init__(self, full_config: dict[str, Any] | None = None):
assert_humming_available()
diff --git a/vllm/model_executor/warmup/deep_gemm_warmup.py b/vllm/model_executor/warmup/deep_gemm_warmup.py
index a352cc116f57..9b11d1df859c 100644
--- a/vllm/model_executor/warmup/deep_gemm_warmup.py
+++ b/vllm/model_executor/warmup/deep_gemm_warmup.py
@@ -13,10 +13,10 @@
from vllm.distributed.parallel_state import get_dp_group, is_global_first_rank
from vllm.model_executor.layers.fused_moe.deep_gemm_utils import compute_aligned_M
from vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe import DeepGemmExperts
-from vllm.model_executor.layers.fused_moe.layer import FusedMoE
-from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
+from vllm.model_executor.layers.fused_moe.experts.triton_deep_gemm_moe import (
TritonOrDeepGemmExperts,
)
+from vllm.model_executor.layers.fused_moe.layer import FusedMoE
from vllm.model_executor.layers.linear import LinearBase
from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod
from vllm.model_executor.layers.quantization.online.mxfp8 import Mxfp8OnlineLinearMethod