diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 65bc38c6c755..4463a23772ee 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -27,10 +27,10 @@ RoutingMethodType, _get_config_dtype_str, ) -from vllm.model_executor.layers.fused_moe.fused_moe import * -from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import ( +from vllm.model_executor.layers.fused_moe.experts.triton_deep_gemm_moe import ( TritonOrDeepGemmExperts, ) +from vllm.model_executor.layers.fused_moe.fused_moe import * from vllm.transformers_utils.config import get_config from vllm.triton_utils import triton from vllm.utils.argparse_utils import FlexibleArgumentParser diff --git a/docs/design/moe_kernel_features.md b/docs/design/moe_kernel_features.md index 2bbadd8ff813..1a11c6685a45 100644 --- a/docs/design/moe_kernel_features.md +++ b/docs/design/moe_kernel_features.md @@ -81,7 +81,7 @@ To be used with a particular `FusedMoEPrepareAndFinalizeModular` subclass, MoE k | Kernel | Input act. format | Quant. types | Quant. format | Activation function | Apply Weight On Input | Modular | Source | | ------ | ----------------- | ------------ | ------------- | ------------------- | --------------------- | ------- | ------ | | triton | standard | all1 | G,A,T | silu, gelu,
swigluoai,
silu_no_mul,
gelu_no_mul | Y | Y | [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts],
[`TritonExperts`][vllm.model_executor.layers.fused_moe.experts.triton_moe.TritonExperts] | -| triton (batched) | batched | all1 | G,A,T | silu, gelu | 6 | Y | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedTritonExperts] | +| triton (batched) | batched | all1 | G,A,T | silu, gelu | 6 | Y | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.experts.fused_batched_moe.BatchedTritonExperts] | | deep gemm | standard,
batched | fp8 | G(128),A,T | silu, gelu | 6 | Y |
[`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe.DeepGemmExperts],
[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.experts.batched_deep_gemm_moe.BatchedDeepGemmExperts] | | cutlass_fp4 | standard,
batched | nvfp4 | A,T | silu | Y | Y | [`CutlassExpertsFp4`][vllm.model_executor.layers.fused_moe.experts.cutlass_moe.CutlassExpertsFp4] | | cutlass_fp8 | standard,
batched | fp8 | A,T | silu, gelu | Y | Y | [`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.experts.cutlass_moe.CutlassExpertsFp8],
[`CutlasBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.experts.cutlass_moe.CutlassBatchedExpertsFp8] | @@ -91,7 +91,7 @@ To be used with a particular `FusedMoEPrepareAndFinalizeModular` subclass, MoE k | trtllm | standard | mxfp4,
nvfp4 | G(16),G(32) | 5 | N | Y | [`TrtLlmMxfp4ExpertsMonolithic`][vllm.model_executor.layers.fused_moe.experts.trtllm_mxfp4_moe.TrtLlmMxfp4ExpertsMonolithic],
[`TrtLlmMxfp4ExpertsModular`][vllm.model_executor.layers.fused_moe.experts.trtllm_mxfp4_moe.TrtLlmMxfp4ExpertsModular],
[`TrtLlmNvFp4ExpertsMonolithic`][vllm.model_executor.layers.fused_moe.experts.trtllm_nvfp4_moe.TrtLlmNvFp4ExpertsMonolithic],
[`TrtLlmNvfp4ExpertsModular`][vllm.model_executor.layers.fused_moe.experts.trtllm_nvfp4_moe.TrtLlmNvFp4ExpertsModular] | | rocm aiter moe | standard | mxfp4,
fp8 | G(32),G(128),A,T | silu, gelu,
swigluoai | Y | N | `rocm_aiter_fused_experts`,
`AiterExperts` | | cpu_fused_moe | standard | N/A | N/A | silu | N | N | [`CPUFusedMOE`][vllm.model_executor.layers.fused_moe.cpu_fused_moe.CPUFusedMOE] | -| naive batched4 | batched | int8,
fp8 | G,A,T | silu, gelu | 6 | Y | [`NaiveBatchedExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.NaiveBatchedExperts] | +| naive batched4 | batched | int8,
fp8 | G,A,T | silu, gelu | 6 | Y | [`NaiveBatchedExperts`][vllm.model_executor.layers.fused_moe.experts.fused_batched_moe.NaiveBatchedExperts] | !!! info "Table key" 1. All types: mxfp4, nvfp4, int4, int8, fp8 diff --git a/tests/kernels/moe/modular_kernel_tools/mk_objects.py b/tests/kernels/moe/modular_kernel_tools/mk_objects.py index 111975cf6bae..5d3572b7caa2 100644 --- a/tests/kernels/moe/modular_kernel_tools/mk_objects.py +++ b/tests/kernels/moe/modular_kernel_tools/mk_objects.py @@ -15,16 +15,16 @@ BatchedDeepGemmExperts, ) from vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe import DeepGemmExperts -from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( +from vllm.model_executor.layers.fused_moe.experts.fused_batched_moe import ( BatchedTritonExperts, NaiveBatchedExperts, ) +from vllm.model_executor.layers.fused_moe.experts.triton_deep_gemm_moe import ( + TritonOrDeepGemmExperts, +) from vllm.model_executor.layers.fused_moe.prepare_finalize import ( MoEPrepareAndFinalizeNoDPEPModular, ) -from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import ( - TritonOrDeepGemmExperts, -) from vllm.model_executor.layers.quantization.utils.nvfp4_utils import ( cutlass_fp4_supported, ) diff --git a/tests/kernels/moe/test_batched_deepgemm.py b/tests/kernels/moe/test_batched_deepgemm.py index 4c8b2d87d61f..ab5672f83203 100644 --- a/tests/kernels/moe/test_batched_deepgemm.py +++ b/tests/kernels/moe/test_batched_deepgemm.py @@ -9,7 +9,7 @@ from vllm.model_executor.layers.fused_moe.experts.batched_deep_gemm_moe import ( BatchedDeepGemmExperts, ) -from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( +from vllm.model_executor.layers.fused_moe.experts.fused_batched_moe import ( BatchedTritonExperts, ) from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEKernel diff --git a/tests/kernels/moe/test_batched_moe.py b/tests/kernels/moe/test_batched_moe.py index d78e1947fac0..b9fe8ceafcdd 100644 --- a/tests/kernels/moe/test_batched_moe.py +++ b/tests/kernels/moe/test_batched_moe.py @@ -16,7 +16,7 @@ from tests.kernels.utils import torch_experts from vllm.config import VllmConfig, set_current_vllm_config from vllm.model_executor.layers.fused_moe import fused_topk -from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( +from vllm.model_executor.layers.fused_moe.experts.fused_batched_moe import ( invoke_moe_batched_triton_kernel, ) from vllm.platforms import current_platform diff --git a/tests/kernels/moe/test_block_fp8.py b/tests/kernels/moe/test_block_fp8.py index 0181fc252476..d11573bd31d4 100644 --- a/tests/kernels/moe/test_block_fp8.py +++ b/tests/kernels/moe/test_block_fp8.py @@ -31,7 +31,7 @@ from vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe import ( _valid_deep_gemm_shape, ) -from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import ( +from vllm.model_executor.layers.fused_moe.experts.triton_deep_gemm_moe import ( TritonOrDeepGemmExperts, ) from vllm.platforms import current_platform diff --git a/tests/kernels/moe/test_deepep_moe.py b/tests/kernels/moe/test_deepep_moe.py index d04c3c99ce77..fd6dda384c11 100644 --- a/tests/kernels/moe/test_deepep_moe.py +++ b/tests/kernels/moe/test_deepep_moe.py @@ -19,7 +19,9 @@ from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, ) -from vllm.model_executor.layers.fused_moe.fused_batched_moe import BatchedTritonExperts +from vllm.model_executor.layers.fused_moe.experts.fused_batched_moe import ( + BatchedTritonExperts, +) from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEKernel from vllm.model_executor.layers.quantization.utils.fp8_utils import ( per_token_group_quant_fp8, diff --git a/tests/kernels/moe/test_deepgemm.py b/tests/kernels/moe/test_deepgemm.py index fd05759ac3df..9095bbc2c76c 100644 --- a/tests/kernels/moe/test_deepgemm.py +++ b/tests/kernels/moe/test_deepgemm.py @@ -25,10 +25,10 @@ FusedMoEQuantDesc, fp8_w8a8_moe_quant_config, ) -from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts -from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import ( +from vllm.model_executor.layers.fused_moe.experts.triton_deep_gemm_moe import ( TritonOrDeepGemmExperts, ) +from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts from vllm.model_executor.layers.quantization.utils.fp8_utils import ( per_token_group_quant_fp8, ) diff --git a/tests/kernels/moe/utils.py b/tests/kernels/moe/utils.py index de3a456fb449..acb2c21b3896 100644 --- a/tests/kernels/moe/utils.py +++ b/tests/kernels/moe/utils.py @@ -17,13 +17,13 @@ FusedMoEQuantConfig, RoutingMethodType, ) -from vllm.model_executor.layers.fused_moe.experts.triton_moe import ( - TritonExperts, -) -from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( +from vllm.model_executor.layers.fused_moe.experts.fused_batched_moe import ( BatchedTritonExperts, NaiveBatchedExperts, ) +from vllm.model_executor.layers.fused_moe.experts.triton_moe import ( + TritonExperts, +) from vllm.model_executor.layers.fused_moe.fused_moe import ( fused_experts, ) diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index 8cb32f07965b..abc2f84debb2 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -10,10 +10,10 @@ from vllm.distributed.utils import divide from vllm.lora.layers.base import BaseLayerWithLoRA from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.fused_moe.experts.lora_context import MoELoRAContext from vllm.model_executor.layers.fused_moe.fused_moe_modular_method import ( FusedMoEModularMethod, ) -from vllm.model_executor.layers.fused_moe.lora_context import MoELoRAContext from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEKernel from vllm.model_executor.layers.fused_moe.prepare_finalize import ( MoEPrepareAndFinalizeNoDPEPModular, diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index aeb704e18d86..9938dc0dd53d 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -85,9 +85,15 @@ def get_config() -> dict[str, Any] | None: from vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe import ( DeepGemmExperts, ) + from vllm.model_executor.layers.fused_moe.experts.fused_batched_moe import ( + BatchedTritonExperts, + ) from vllm.model_executor.layers.fused_moe.experts.rocm_aiter_moe import ( AiterExperts, ) + from vllm.model_executor.layers.fused_moe.experts.triton_deep_gemm_moe import ( + TritonOrDeepGemmExperts, + ) from vllm.model_executor.layers.fused_moe.experts.triton_moe import ( TritonExperts, TritonWNA16Experts, @@ -97,9 +103,6 @@ def get_config() -> dict[str, Any] | None: XPUExpertsFp8, XPUExpertsMXFp4, ) - from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( - BatchedTritonExperts, - ) from vllm.model_executor.layers.fused_moe.fused_moe import ( fused_experts, get_config_file_name, @@ -110,9 +113,6 @@ def get_config() -> dict[str, Any] | None: from vllm.model_executor.layers.fused_moe.router.grouped_topk_router import ( GroupedTopk, ) - from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import ( - TritonOrDeepGemmExperts, - ) __all__ += [ "AiterExperts", diff --git a/vllm/model_executor/layers/fused_moe/fallback.py b/vllm/model_executor/layers/fused_moe/experts/fallback.py similarity index 100% rename from vllm/model_executor/layers/fused_moe/fallback.py rename to vllm/model_executor/layers/fused_moe/experts/fallback.py diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/experts/fused_batched_moe.py similarity index 100% rename from vllm/model_executor/layers/fused_moe/fused_batched_moe.py rename to vllm/model_executor/layers/fused_moe/experts/fused_batched_moe.py diff --git a/vllm/model_executor/layers/fused_moe/fused_humming_moe.py b/vllm/model_executor/layers/fused_moe/experts/fused_humming_moe.py similarity index 100% rename from vllm/model_executor/layers/fused_moe/fused_humming_moe.py rename to vllm/model_executor/layers/fused_moe/experts/fused_humming_moe.py diff --git a/vllm/model_executor/layers/fused_moe/experts/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/experts/gpt_oss_triton_kernels_moe.py index e10514debd08..88ea58a9eca7 100644 --- a/vllm/model_executor/layers/fused_moe/experts/gpt_oss_triton_kernels_moe.py +++ b/vllm/model_executor/layers/fused_moe/experts/gpt_oss_triton_kernels_moe.py @@ -14,7 +14,9 @@ FusedMoEQuantConfig, RoutingMethodType, ) -from vllm.model_executor.layers.fused_moe.lora_experts_mixin import LoRAExpertsMixin +from vllm.model_executor.layers.fused_moe.experts.lora_experts_mixin import ( + LoRAExpertsMixin, +) from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( TopKWeightAndReduceNoOP, ) diff --git a/vllm/model_executor/layers/fused_moe/lora_context.py b/vllm/model_executor/layers/fused_moe/experts/lora_context.py similarity index 100% rename from vllm/model_executor/layers/fused_moe/lora_context.py rename to vllm/model_executor/layers/fused_moe/experts/lora_context.py diff --git a/vllm/model_executor/layers/fused_moe/lora_experts_mixin.py b/vllm/model_executor/layers/fused_moe/experts/lora_experts_mixin.py similarity index 97% rename from vllm/model_executor/layers/fused_moe/lora_experts_mixin.py rename to vllm/model_executor/layers/fused_moe/experts/lora_experts_mixin.py index 10707b91b70e..2a680909d5f6 100644 --- a/vllm/model_executor/layers/fused_moe/lora_experts_mixin.py +++ b/vllm/model_executor/layers/fused_moe/experts/lora_experts_mixin.py @@ -3,7 +3,7 @@ import torch -from vllm.model_executor.layers.fused_moe.lora_context import MoELoRAContext +from vllm.model_executor.layers.fused_moe.experts.lora_context import MoELoRAContext class LoRAExpertsMixin: diff --git a/vllm/model_executor/layers/fused_moe/experts/marlin_moe.py b/vllm/model_executor/layers/fused_moe/experts/marlin_moe.py index 621b33dd8f30..2e831f0e2fbc 100644 --- a/vllm/model_executor/layers/fused_moe/experts/marlin_moe.py +++ b/vllm/model_executor/layers/fused_moe/experts/marlin_moe.py @@ -17,7 +17,9 @@ FusedMoEParallelConfig, FusedMoEQuantConfig, ) -from vllm.model_executor.layers.fused_moe.lora_experts_mixin import LoRAExpertsMixin +from vllm.model_executor.layers.fused_moe.experts.lora_experts_mixin import ( + LoRAExpertsMixin, +) from vllm.model_executor.layers.fused_moe.moe_align_block_size import ( batched_moe_align_block_size, moe_align_block_size, diff --git a/vllm/model_executor/layers/fused_moe/triton_cutlass_moe.py b/vllm/model_executor/layers/fused_moe/experts/triton_cutlass_moe.py similarity index 96% rename from vllm/model_executor/layers/fused_moe/triton_cutlass_moe.py rename to vllm/model_executor/layers/fused_moe/experts/triton_cutlass_moe.py index ee3622b4ebe3..1753d7354cba 100644 --- a/vllm/model_executor/layers/fused_moe/triton_cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/experts/triton_cutlass_moe.py @@ -11,8 +11,8 @@ FusedMoEQuantConfig, ) from vllm.model_executor.layers.fused_moe.experts.cutlass_moe import CutlassExpertsFp8 +from vllm.model_executor.layers.fused_moe.experts.fallback import FallbackExperts from vllm.model_executor.layers.fused_moe.experts.triton_moe import TritonExperts -from vllm.model_executor.layers.fused_moe.fallback import FallbackExperts from vllm.platforms import current_platform diff --git a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/experts/triton_deep_gemm_moe.py similarity index 97% rename from vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py rename to vllm/model_executor/layers/fused_moe/experts/triton_deep_gemm_moe.py index fbcf030a9256..58700d714857 100644 --- a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/experts/triton_deep_gemm_moe.py @@ -14,8 +14,8 @@ _valid_deep_gemm, _valid_deep_gemm_shape, ) +from vllm.model_executor.layers.fused_moe.experts.fallback import FallbackExperts from vllm.model_executor.layers.fused_moe.experts.triton_moe import TritonExperts -from vllm.model_executor.layers.fused_moe.fallback import FallbackExperts from vllm.utils.deep_gemm import ( is_deep_gemm_e8m0_used, ) diff --git a/vllm/model_executor/layers/fused_moe/experts/triton_moe.py b/vllm/model_executor/layers/fused_moe/experts/triton_moe.py index e99f307ec0ec..b9b046aa17ab 100644 --- a/vllm/model_executor/layers/fused_moe/experts/triton_moe.py +++ b/vllm/model_executor/layers/fused_moe/experts/triton_moe.py @@ -12,15 +12,15 @@ FusedMoEParallelConfig, FusedMoEQuantConfig, ) +from vllm.model_executor.layers.fused_moe.experts.lora_experts_mixin import ( + LoRAExpertsMixin, +) from vllm.model_executor.layers.fused_moe.fused_moe import ( _prepare_expert_assignment, invoke_fused_moe_triton_kernel, invoke_fused_moe_wna16_triton_kernel, try_get_optimal_moe_config, ) -from vllm.model_executor.layers.fused_moe.lora_experts_mixin import ( - LoRAExpertsMixin, -) from vllm.model_executor.layers.fused_moe.moe_align_block_size import ( moe_align_block_size, ) diff --git a/vllm/model_executor/layers/fused_moe/oracle/fp8.py b/vllm/model_executor/layers/fused_moe/oracle/fp8.py index 1ce5a1e3213b..0fa706d8daaf 100644 --- a/vllm/model_executor/layers/fused_moe/oracle/fp8.py +++ b/vllm/model_executor/layers/fused_moe/oracle/fp8.py @@ -130,7 +130,7 @@ def backend_to_kernel_cls( return [FlashInferExperts] elif backend == Fp8MoeBackend.DEEPGEMM: - from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import ( + from vllm.model_executor.layers.fused_moe.experts.triton_deep_gemm_moe import ( TritonOrDeepGemmExperts, ) @@ -158,7 +158,7 @@ def backend_to_kernel_cls( return [TritonExperts] elif backend == Fp8MoeBackend.BATCHED_TRITON: - from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( + from vllm.model_executor.layers.fused_moe.experts.fused_batched_moe import ( BatchedTritonExperts, ) @@ -172,7 +172,7 @@ def backend_to_kernel_cls( return [AiterExperts] elif backend == Fp8MoeBackend.VLLM_CUTLASS: - from vllm.model_executor.layers.fused_moe.triton_cutlass_moe import ( + from vllm.model_executor.layers.fused_moe.experts.triton_cutlass_moe import ( TritonOrCutlassExperts, ) diff --git a/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py b/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py index 4de0d722d50b..1820e3463621 100644 --- a/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py +++ b/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py @@ -147,7 +147,7 @@ def backend_to_kernel_cls( return [UnfusedOAITritonExperts] elif backend == Mxfp4MoeBackend.HUMMING: - from vllm.model_executor.layers.fused_moe.fused_humming_moe import ( + from vllm.model_executor.layers.fused_moe.experts.fused_humming_moe import ( BatchedHummingGroupedExperts, HummingGroupedExperts, HummingIndexedExperts, diff --git a/vllm/model_executor/layers/fused_moe/oracle/unquantized.py b/vllm/model_executor/layers/fused_moe/oracle/unquantized.py index f1fd196c09c1..9215c960a8c9 100644 --- a/vllm/model_executor/layers/fused_moe/oracle/unquantized.py +++ b/vllm/model_executor/layers/fused_moe/oracle/unquantized.py @@ -116,7 +116,7 @@ def backend_to_kernel_cls( return TritonExperts elif backend == UnquantizedMoeBackend.BATCHED_TRITON: - from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( + from vllm.model_executor.layers.fused_moe.experts.fused_batched_moe import ( BatchedTritonExperts, ) diff --git a/vllm/model_executor/layers/quantization/humming.py b/vllm/model_executor/layers/quantization/humming.py index 79a1057c6003..063f91203267 100644 --- a/vllm/model_executor/layers/quantization/humming.py +++ b/vllm/model_executor/layers/quantization/humming.py @@ -58,7 +58,7 @@ ) from humming.utils.weight import quantize_weight - from vllm.model_executor.layers.fused_moe.fused_humming_moe import ( + from vllm.model_executor.layers.fused_moe.experts.fused_humming_moe import ( BatchedHummingGroupedExperts, HummingGroupedExperts, HummingIndexedExperts, @@ -182,7 +182,7 @@ def compressed_tensors_get_config(config: dict[str, Any], key: str): class HummingConfig(QuantizationConfig): - packed_modules_mapping = {} + packed_modules_mapping: dict[str, list[str]] = {} def __init__(self, full_config: dict[str, Any] | None = None): assert_humming_available() diff --git a/vllm/model_executor/warmup/deep_gemm_warmup.py b/vllm/model_executor/warmup/deep_gemm_warmup.py index a352cc116f57..9b11d1df859c 100644 --- a/vllm/model_executor/warmup/deep_gemm_warmup.py +++ b/vllm/model_executor/warmup/deep_gemm_warmup.py @@ -13,10 +13,10 @@ from vllm.distributed.parallel_state import get_dp_group, is_global_first_rank from vllm.model_executor.layers.fused_moe.deep_gemm_utils import compute_aligned_M from vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe import DeepGemmExperts -from vllm.model_executor.layers.fused_moe.layer import FusedMoE -from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import ( +from vllm.model_executor.layers.fused_moe.experts.triton_deep_gemm_moe import ( TritonOrDeepGemmExperts, ) +from vllm.model_executor.layers.fused_moe.layer import FusedMoE from vllm.model_executor.layers.linear import LinearBase from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod from vllm.model_executor.layers.quantization.online.mxfp8 import Mxfp8OnlineLinearMethod