diff --git a/docs/design/fused_moe_modular_kernel.md b/docs/design/fused_moe_modular_kernel.md index 090bb729be0c..2654b323ff06 100644 --- a/docs/design/fused_moe_modular_kernel.md +++ b/docs/design/fused_moe_modular_kernel.md @@ -167,9 +167,6 @@ FusedMoEExpertsModular performs the core of the FusedMoE operations. The various `FusedMoEExpertsModular::activation_formats()`: Return the supported Input and Output activation formats. i.e. Contiguous / Batched format. -`FusedMoEExpertsModular::supports_chunking()`: Return True if the implementation supports chunking. Typically -implementations that input `FusedMoEActivationFormat.Standard` support chunking and `FusedMoEActivationFormat.BatchedExperts` do not. - `FusedMoEExpertsModular::supports_expert_map()`: Return True if the implementation supports expert map. `FusedMoEExpertsModular::workspace_shapes()` / @@ -220,8 +217,8 @@ If you are adding some `FusedMoEPrepareAndFinalizeModular` / `FusedMoEExpertsMod 1. Add the implementation type to `MK_ALL_PREPARE_FINALIZE_TYPES` and `MK_FUSED_EXPERT_TYPES` in [mk_objects.py](../../tests/kernels/moe/modular_kernel_tools/mk_objects.py) respectively. 2. Update `Config::is_batched_prepare_finalize()`, `Config::is_batched_fused_experts()`, `Config::is_standard_fused_experts()`, -`Config::is_fe_16bit_supported()`, `Config::is_fe_fp8_supported()`, `Config::is_fe_block_fp8_supported()`, -`Config::is_fe_supports_chunking()` methods in [/tests/kernels/moe/modular_kernel_tools/common.py](../../tests/kernels/moe/modular_kernel_tools/common.py) +`Config::is_fe_16bit_supported()`, `Config::is_fe_fp8_supported()`, `Config::is_fe_block_fp8_supported()` +methods in [/tests/kernels/moe/modular_kernel_tools/common.py](../../tests/kernels/moe/modular_kernel_tools/common.py) Doing this will add the new implementation to the test suite. diff --git a/tests/kernels/moe/modular_kernel_tools/cli_args.py b/tests/kernels/moe/modular_kernel_tools/cli_args.py index 375dfa748956..544dac330873 100644 --- a/tests/kernels/moe/modular_kernel_tools/cli_args.py +++ b/tests/kernels/moe/modular_kernel_tools/cli_args.py @@ -82,11 +82,6 @@ def to_quant_torch_dtype(s: str) -> torch.dtype: "--num-experts", type=int, default=32, help="Global num experts" ) parser.add_argument("--topk", nargs="+", type=int, default=[4, 1], help="num topk") - parser.add_argument( - "--fused-moe-chunk-size", - type=int, - help="Fused moe chunk size used for the non-batched fused experts impl.", - ) # Quant args parser.add_argument( @@ -158,7 +153,6 @@ def make_config(args: argparse.Namespace) -> Config: quant_config=quant_config, prepare_finalize_type=args.pf_type, fused_experts_type=args.experts_type, - fused_moe_chunk_size=args.fused_moe_chunk_size, world_size=args.world_size, torch_trace_dir_path=args.torch_trace_dir_path, ) diff --git a/tests/kernels/moe/modular_kernel_tools/common.py b/tests/kernels/moe/modular_kernel_tools/common.py index 4b2b1653babe..9f17547d103a 100644 --- a/tests/kernels/moe/modular_kernel_tools/common.py +++ b/tests/kernels/moe/modular_kernel_tools/common.py @@ -68,7 +68,6 @@ class Config: prepare_finalize_type: mk.FusedMoEPrepareAndFinalize fused_experts_type: mk.FusedMoEExperts - fused_moe_chunk_size: int | None world_size: int torch_trace_dir_path: str | None = None @@ -89,7 +88,6 @@ def describe(self) -> str: s += f" K={self.K}\n" s += f" topk={self.topks}\n" s += f" dtype={self.dtype}\n" - s += f" fused_moe_chunk_size={self.fused_moe_chunk_size}\n" s += " Quant:\n" if self.quant_config is not None: s += f" q_dtype={self.quant_dtype}\n" @@ -152,11 +150,6 @@ def make_env_data(self) -> tuple[VllmConfig, dict[Any, Any]]: vllm_config.parallel_config.all2all_backend = self.all2all_backend() - if self.fused_moe_chunk_size is not None: - env_dict.update( - {"VLLM_FUSED_MOE_CHUNK_SIZE": str(self.fused_moe_chunk_size)} - ) - return vllm_config, env_dict def is_fp8_block_quantized(self): @@ -189,10 +182,6 @@ def is_block_quant_supported(self): info = expert_info(self.fused_experts_type) return info.blocked_quantization_support - def is_fe_supports_chunking(self): - info = expert_info(self.fused_experts_type) - return info.supports_chunking - def supports_expert_map(self): info = expert_info(self.fused_experts_type) return info.supports_expert_map @@ -233,10 +222,6 @@ def is_valid(self) -> tuple[bool, str | None]: if not self.is_standard_fused_experts(): return False, "Mismatched format." - use_chunking = self.fused_moe_chunk_size is not None - if use_chunking and not self.is_fe_supports_chunking(): - return False, "Chunking not supported." - # Check quantization sanity if ( int(self.is_per_act_token_quant) diff --git a/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py b/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py index 08e50c52cbed..aa111b456055 100644 --- a/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py +++ b/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py @@ -42,12 +42,6 @@ def rank_worker( ): set_random_seed(pgi.rank) - # sanity check - from vllm import envs - - if config.fused_moe_chunk_size is not None: - assert config.fused_moe_chunk_size == envs.VLLM_FUSED_MOE_CHUNK_SIZE - # get weights to this device weights.to_current_device() @@ -135,7 +129,6 @@ def add_to_results( fused_experts_type=experts_type, quant_config=quant_config, world_size=2, - fused_moe_chunk_size=None, ) success = None diff --git a/tests/kernels/moe/modular_kernel_tools/mk_objects.py b/tests/kernels/moe/modular_kernel_tools/mk_objects.py index ee4190859e4c..38a9857ccfed 100644 --- a/tests/kernels/moe/modular_kernel_tools/mk_objects.py +++ b/tests/kernels/moe/modular_kernel_tools/mk_objects.py @@ -64,7 +64,6 @@ class ExpertInfo: activation_format: mk.FusedMoEActivationFormat supported_dtypes: list[torch.dtype | str] blocked_quantization_support: bool - supports_chunking: bool supports_expert_map: bool needs_matching_quant: bool = False needs_deep_gemm: bool = False @@ -127,7 +126,6 @@ def register_experts( activation_format: mk.FusedMoEActivationFormat, supported_dtypes: list[torch.dtype | str], blocked_quantization_support: bool, - supports_chunking: bool, supports_expert_map: bool, needs_matching_quant: bool = False, needs_deep_gemm: bool = False, @@ -141,7 +139,6 @@ def register_experts( activation_format, supported_dtypes, blocked_quantization_support, - supports_chunking, supports_expert_map, needs_matching_quant, needs_deep_gemm, @@ -176,7 +173,6 @@ def expert_info(kind) -> ExpertInfo: batched_format, common_float_types, blocked_quantization_support=True, - supports_chunking=False, supports_expert_map=False, needs_matching_quant=True, ) @@ -186,7 +182,6 @@ def expert_info(kind) -> ExpertInfo: standard_format, common_float_and_int_types, blocked_quantization_support=True, - supports_chunking=True, supports_expert_map=True, needs_matching_quant=True, ) @@ -196,7 +191,6 @@ def expert_info(kind) -> ExpertInfo: batched_format, common_float_and_int_types, blocked_quantization_support=True, - supports_chunking=False, supports_expert_map=True, ) @@ -262,7 +256,6 @@ def expert_info(kind) -> ExpertInfo: standard_format, nvfp4_types + fp8_types, blocked_quantization_support=True, - supports_chunking=True, # Note: this is a hack to get it to run for now supports_expert_map=True, ) @@ -281,7 +274,6 @@ def expert_info(kind) -> ExpertInfo: standard_format, fp8_types, blocked_quantization_support=True, - supports_chunking=True, supports_expert_map=True, needs_aiter=True, ) @@ -294,7 +286,6 @@ def expert_info(kind) -> ExpertInfo: batched_format, fp8_types, blocked_quantization_support=True, - supports_chunking=False, supports_expert_map=False, needs_matching_quant=False, needs_deep_gemm=True, @@ -304,7 +295,6 @@ def expert_info(kind) -> ExpertInfo: standard_format, fp8_types, blocked_quantization_support=True, - supports_chunking=True, supports_expert_map=True, needs_matching_quant=False, needs_deep_gemm=True, @@ -314,7 +304,6 @@ def expert_info(kind) -> ExpertInfo: standard_format, common_float_and_int_types, blocked_quantization_support=True, - supports_chunking=True, supports_expert_map=True, needs_matching_quant=True, needs_deep_gemm=True, @@ -331,7 +320,6 @@ def expert_info(kind) -> ExpertInfo: standard_format, fp8_types, blocked_quantization_support=False, - supports_chunking=True, supports_expert_map=False, ) register_experts( @@ -339,7 +327,6 @@ def expert_info(kind) -> ExpertInfo: batched_format, fp8_types, blocked_quantization_support=False, - supports_chunking=False, supports_expert_map=False, ) else: @@ -354,7 +341,6 @@ def expert_info(kind) -> ExpertInfo: standard_format, nvfp4_types, blocked_quantization_support=True, - supports_chunking=True, supports_expert_map=False, ) else: diff --git a/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py b/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py index 9f0f9f2eae19..bfc823d10a58 100644 --- a/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py +++ b/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py @@ -84,12 +84,6 @@ def rank_worker( ): set_random_seed(pgi.rank) - # sanity check - from vllm import envs - - if config.fused_moe_chunk_size is not None: - assert config.fused_moe_chunk_size == envs.VLLM_FUSED_MOE_CHUNK_SIZE - # get weights to this device weights.to_current_device() diff --git a/tests/kernels/moe/test_block_fp8.py b/tests/kernels/moe/test_block_fp8.py index 7011786f2a52..f27fd6f34ee7 100644 --- a/tests/kernels/moe/test_block_fp8.py +++ b/tests/kernels/moe/test_block_fp8.py @@ -158,8 +158,6 @@ def test_w8a8_block_fp8_fused_moe( torch.manual_seed(seed) - monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "2048") - a = torch.randn((M, K), dtype=dtype) / 10 score = torch.randn((M, E), dtype=dtype) @@ -226,11 +224,8 @@ def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed, monkeypatch) if not _valid_deep_gemm_shape(M, N, K): pytest.skip(f"Skipping test: invalid size m={M}, n={N}, k={K}") - chunk_size = 1024 - torch.manual_seed(seed) - monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", str(chunk_size)) block_size = get_mk_alignment_for_contiguous_layout() dtype = torch.bfloat16 @@ -252,9 +247,7 @@ def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed, monkeypatch) # setup code in case we are able to revisit this later. use_compile = False - use_cudagraph = ( - chunk_size < M and N >= 1024 and K >= 1024 and current_platform.is_cuda_alike() - ) + use_cudagraph = N >= 1024 and K >= 1024 and current_platform.is_cuda_alike() topk_weights, topk_ids, _ = fused_topk(a, score.float(), topk, False) diff --git a/tests/kernels/moe/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py index c1cf8b2d3260..e06672f41d0c 100644 --- a/tests/kernels/moe/test_cutlass_moe.py +++ b/tests/kernels/moe/test_cutlass_moe.py @@ -321,7 +321,6 @@ def test_cutlass_moe_8_bit_no_graph( ep_size: int | None = None, ): set_random_seed(7) - monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192") with set_current_vllm_config(vllm_config): mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token, per_out_ch) @@ -376,7 +375,6 @@ def test_cutlass_moe_8_bit_cuda_graph( workspace_init, ): set_random_seed(7) - monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192") with set_current_vllm_config(vllm_config): dtype = torch.half diff --git a/tests/kernels/moe/test_flashinfer.py b/tests/kernels/moe/test_flashinfer.py index 6a51853c0022..f9e7471a6436 100644 --- a/tests/kernels/moe/test_flashinfer.py +++ b/tests/kernels/moe/test_flashinfer.py @@ -204,7 +204,6 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph( if not current_platform.has_device_capability(100): pytest.skip("Test is only supported for sm >= 100") set_random_seed(7) - monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192") with set_current_vllm_config(vllm_config): td = TestData.make_moe_tensors_8bit( m, k, n, e, is_trtllm=True, activation=activation @@ -289,7 +288,6 @@ def test_flashinfer_cutlass_moe_fp8_no_graph( workspace_init, ): set_random_seed(7) - monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192") with set_current_vllm_config(vllm_config): td = TestData.make_moe_tensors_8bit( m, k, n, e, is_trtllm=False, activation=activation diff --git a/tests/kernels/moe/test_modular_kernel_combinations.py b/tests/kernels/moe/test_modular_kernel_combinations.py index 53aed1032e11..877de845f42e 100644 --- a/tests/kernels/moe/test_modular_kernel_combinations.py +++ b/tests/kernels/moe/test_modular_kernel_combinations.py @@ -84,12 +84,6 @@ def rank_worker( set_random_seed(pgi.rank) - # sanity check - from vllm import envs - - if base_config.fused_moe_chunk_size is not None: - assert base_config.fused_moe_chunk_size == envs.VLLM_FUSED_MOE_CHUNK_SIZE - # get weights to this device weights.to_current_device() @@ -162,7 +156,6 @@ def run(config: Config, verbose: bool): TOPKs = [4, 1] Es = [32] DTYPEs = [torch.bfloat16] -FUSED_MOE_CHUNK_SIZES = [None, 16] def is_nyi_config(config: Config) -> bool: @@ -185,14 +178,13 @@ def generate_valid_test_cases( cases = [] total = 0 - for k, n, e, dtype, quant_config, combination, chunk_size in product( + for k, n, e, dtype, quant_config, combination in product( Ks, Ns, Es, DTYPEs, MK_QUANT_CONFIGS, product(prepare_finalize_types, MK_FUSED_EXPERT_TYPES), - FUSED_MOE_CHUNK_SIZES, ): total = total + 1 @@ -206,7 +198,6 @@ def generate_valid_test_cases( quant_config=quant_config, prepare_finalize_type=combination[0], fused_experts_type=combination[1], - fused_moe_chunk_size=chunk_size, world_size=world_size, ) @@ -234,7 +225,6 @@ def generate_valid_test_cases( quant_config, combination[0], combination[1], - chunk_size, world_size, ) ) @@ -245,7 +235,7 @@ def generate_valid_test_cases( @pytest.mark.parametrize( - "k,n,e,dtype,quant_config,prepare_finalize_type,fused_experts_type,chunk_size,world_size", + "k,n,e,dtype,quant_config,prepare_finalize_type,fused_experts_type,world_size", generate_valid_test_cases( world_size=2, prepare_finalize_types=MK_MULTI_GPU_PREPARE_FINALIZE_TYPES ), @@ -259,7 +249,6 @@ def test_modular_kernel_combinations_multigpu( quant_config: TestMoEQuantConfig | None, prepare_finalize_type: mk.FusedMoEPrepareAndFinalize, fused_experts_type: mk.FusedMoEExperts, - chunk_size: int | None, world_size: int, pytestconfig, ): @@ -280,7 +269,6 @@ def test_modular_kernel_combinations_multigpu( quant_config=quant_config, prepare_finalize_type=prepare_finalize_type, fused_experts_type=fused_experts_type, - fused_moe_chunk_size=chunk_size, world_size=world_size, ) verbosity = pytestconfig.getoption("verbose") @@ -288,7 +276,7 @@ def test_modular_kernel_combinations_multigpu( @pytest.mark.parametrize( - "k,n,e,dtype,quant_config,prepare_finalize_type,fused_experts_type,chunk_size,world_size", + "k,n,e,dtype,quant_config,prepare_finalize_type,fused_experts_type,world_size", generate_valid_test_cases( world_size=1, prepare_finalize_types=MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES ), @@ -301,7 +289,6 @@ def test_modular_kernel_combinations_singlegpu( quant_config: TestMoEQuantConfig | None, prepare_finalize_type: mk.FusedMoEPrepareAndFinalize, fused_experts_type: mk.FusedMoEExperts, - chunk_size: int | None, world_size: int, pytestconfig, workspace_init, @@ -318,7 +305,6 @@ def test_modular_kernel_combinations_singlegpu( quant_config=quant_config, prepare_finalize_type=prepare_finalize_type, fused_experts_type=fused_experts_type, - fused_moe_chunk_size=chunk_size, world_size=world_size, ) diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index 43bdd03cfe13..81bb36eaaf90 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -287,7 +287,6 @@ def run_moe_test( @pytest.mark.parametrize("ep_size", EP_SIZE) @pytest.mark.parametrize("dtype", [torch.bfloat16]) @pytest.mark.parametrize("padding", [True, False]) -@pytest.mark.parametrize("chunk_size", [8192]) def test_fused_moe( m: int, n: int, @@ -297,14 +296,11 @@ def test_fused_moe( ep_size: int, dtype: torch.dtype, padding: bool, - chunk_size: int, monkeypatch, workspace_init, ): set_random_seed(7) - monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", str(chunk_size)) - # # Setup test data # @@ -398,12 +394,12 @@ def m_fused_moe( ) -def test_fused_moe_int64_overflow(monkeypatch, workspace_init): +def test_fused_moe_int64_overflow(workspace_init): """Regression test for int32 overflow in stride*offset products. - When chunking is disabled and M is large, stride_cm * offs_token can - exceed int32 max. Verifies the offs_token int64 cast (fix for #34413) - prevents overflow and produces correct results. + With large M, stride_cm * offs_token can exceed int32 max. Verifies + the offs_token int64 cast (fix for #34413) prevents overflow and + produces correct results. Reproduces the scenario from PR #34279. """ @@ -417,9 +413,6 @@ def test_fused_moe_int64_overflow(monkeypatch, workspace_init): m, n, k, e, topk = 100000, 2048, 1024, 8, 6 dtype = torch.bfloat16 - # Disable chunking to expose the overflow-prone code path - monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "10000000") - a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10 w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10 @@ -452,7 +445,6 @@ def test_fused_moe_int64_overflow(monkeypatch, workspace_init): @pytest.mark.parametrize("topk", TOP_KS_SMALL) @pytest.mark.parametrize("dtype", [torch.bfloat16]) @pytest.mark.parametrize("padding", [True, False]) -@pytest.mark.parametrize("chunk_size", [8192]) def test_naive_block_assignment_moe( m: int, n: int, @@ -461,14 +453,11 @@ def test_naive_block_assignment_moe( topk: int, dtype: torch.dtype, padding: bool, - chunk_size: int, monkeypatch, workspace_init, ): set_random_seed(7) - monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", str(chunk_size)) - # # Setup test data # diff --git a/vllm/envs.py b/vllm/envs.py index 716810da1c27..9cebb1529a7b 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -53,8 +53,6 @@ VLLM_CPU_SGL_KERNEL: bool = False VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache") VLLM_XLA_CHECK_RECOMPILATION: bool = False - VLLM_FUSED_MOE_CHUNK_SIZE: int = 16 * 1024 - VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING: bool = True VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: Literal["auto", "nccl", "shm"] = "auto" VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False VLLM_USE_RAY_WRAPPED_PP_COMM: bool = True @@ -821,15 +819,6 @@ def _get_or_set_default() -> str: ), # Enable SPMD mode for TPU backend. "VLLM_XLA_USE_SPMD": lambda: bool(int(os.getenv("VLLM_XLA_USE_SPMD", "0"))), - "VLLM_FUSED_MOE_CHUNK_SIZE": lambda: int( - os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", str(16 * 1024)) - ), - # Control whether to use fused MoE activation chunking. Current chunking - # logic is incompatible with torch.compile and causes IMA. See issue - # https://github.com/vllm-project/vllm/issues/19631. - "VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING": lambda: bool( - int(os.getenv("VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING", "1")) - ), # If set, the OpenAI API server will stay alive even after the underlying # AsyncLLMEngine errors and stops serving requests "VLLM_KEEP_ALIVE_ON_ENGINE_DEATH": lambda: bool( diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index eff05b575856..78876ef7c9b0 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -190,9 +190,8 @@ def wrapper(*args, **kwargs): use_int8_w8a16=False, use_int4_w4a16=False, ) - CHUNK_SIZE = envs.VLLM_FUSED_MOE_CHUNK_SIZE num_tokens = hidden_states.size(0) - M = min(num_tokens, CHUNK_SIZE) + M = num_tokens max_lora_rank = self.w13_lora_a_stacked[0].shape[-2] shrink_config, expand_config = self._get_lora_moe_configs( op_prefix="w13", @@ -281,9 +280,8 @@ def wrapper(*args, **kwargs): use_int8_w8a16=False, use_int4_w4a16=False, ) - CHUNK_SIZE = envs.VLLM_FUSED_MOE_CHUNK_SIZE num_tokens = hidden_states.size(0) - M = min(num_tokens, CHUNK_SIZE) + M = num_tokens max_lora_rank = self.w2_lora_a_stacked[0].shape[-2] shrink_config, expand_config = self._get_lora_moe_configs( op_prefix="w2", diff --git a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py index 539712587a71..0e1481ef720d 100644 --- a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py @@ -311,9 +311,6 @@ def _supports_activation(activation: MoEActivation) -> bool: def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool: return True - def supports_chunking(self) -> bool: - return False - def supports_expert_map(self) -> bool: return False diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py index 64848bf931ae..69a30f89ef72 100644 --- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py @@ -400,9 +400,6 @@ def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bo or moe_parallel_config.use_deepep_ht_kernels ) - def supports_chunking(self) -> bool: - return True - def supports_expert_map(self) -> bool: return False @@ -445,9 +442,6 @@ def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bo def activation_format() -> mk.FusedMoEActivationFormat: return mk.FusedMoEActivationFormat.BatchedExperts - def supports_chunking(self) -> bool: - return False - def supports_expert_map(self) -> bool: return False @@ -713,9 +707,6 @@ def activation_format() -> mk.FusedMoEActivationFormat: def supports_expert_map(self) -> bool: return False - def supports_chunking(self) -> bool: - return True - def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: return TopKWeightAndReduceNoOP() @@ -998,9 +989,6 @@ def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bo "This method should not be called." ) - def supports_chunking(self) -> bool: - return True - def supports_expert_map(self) -> bool: return True diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py index 8af439a0d435..18b3da34422e 100644 --- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py @@ -154,9 +154,6 @@ def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bo # NOTE(rob): discovered an IMA with this combination. Needs investigation. return not moe_parallel_config.use_fi_all2allv_kernels - def supports_chunking(self) -> bool: - return True - def supports_expert_map(self) -> bool: return True diff --git a/vllm/model_executor/layers/fused_moe/fallback.py b/vllm/model_executor/layers/fused_moe/fallback.py index 403a71e20761..40741d52af50 100644 --- a/vllm/model_executor/layers/fused_moe/fallback.py +++ b/vllm/model_executor/layers/fused_moe/fallback.py @@ -92,16 +92,6 @@ def _supports_parallel_config( moe_parallel_config ) and fallback_cls._supports_parallel_config(moe_parallel_config) - def supports_chunking(self) -> bool: - assert ( - self.experts.supports_chunking() - == self.fallback_experts.supports_chunking() - ) - return ( - self.experts.supports_chunking() - and self.fallback_experts.supports_chunking() - ) - def supports_expert_map(self) -> bool: assert ( self.experts.supports_expert_map() diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py index 730dc0c5df3c..fb8a18ef33e5 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py @@ -83,12 +83,6 @@ def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bo def supports_expert_map(self) -> bool: return False - def supports_chunking(self) -> bool: - # This refers to TP chunking; DP chunking is handled separately. - # TODO(shuw@nvidia.com): Set to False to be consistent with - # batched_deep_gemm_moe - return False - def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: # Let PrepareAndFinalize::finalize() decide the impl. return TopKWeightAndReduceDelegate() diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py index 4ee2aab25068..e58d52eee980 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py @@ -195,10 +195,6 @@ def activation_format() -> mk.FusedMoEActivationFormat: def supports_expert_map(self) -> bool: return False - def supports_chunking(self) -> bool: - # This refers to TP chunking; DP chunking is handled separately. - return True - def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: return TopKWeightAndReduceNoOP() diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py index b6441552a4e1..9df94b72d246 100644 --- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py @@ -712,9 +712,6 @@ def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bo "This method should not be called." ) - def supports_chunking(self) -> bool: - return False - def supports_expert_map(self) -> bool: return False @@ -957,9 +954,6 @@ def _supports_activation(activation: MoEActivation) -> bool: def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool: return True - def supports_chunking(self) -> bool: - return False - def supports_expert_map(self) -> bool: return False diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py index 5370b9e28bd2..86fef2528345 100644 --- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py @@ -658,9 +658,6 @@ def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: def activation_format() -> mk.FusedMoEActivationFormat: return mk.FusedMoEActivationFormat.Standard - def supports_chunking(self) -> bool: - return True - def workspace_shapes( self, M: int, @@ -786,9 +783,6 @@ def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: def activation_format() -> mk.FusedMoEActivationFormat: return mk.FusedMoEActivationFormat.BatchedExperts - def supports_chunking(self) -> bool: - return False - def workspace_shapes( self, M: int, diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index ee321f241aad..660f9a1722e9 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1693,10 +1693,8 @@ def fused_experts_impl( if global_num_experts == -1: global_num_experts = E top_k_num = topk_ids.size(1) - # We execute the fused_moe kernel in chunks to circumvent this issue: - # https://github.com/vllm-project/vllm/issues/5938 - CHUNK_SIZE = envs.VLLM_FUSED_MOE_CHUNK_SIZE - M = min(num_tokens, CHUNK_SIZE) + + M = num_tokens config_dtype = _get_config_dtype_str( use_fp8_w8a8=use_fp8_w8a8, @@ -1787,139 +1785,114 @@ def fused_experts_impl( else: raise NotImplementedError(f"Unsupported ocp_mx_scheme={ocp_mx_scheme}") - for chunk in range((num_tokens // CHUNK_SIZE) + 1): - begin_chunk_idx, end_chunk_idx = ( - chunk * CHUNK_SIZE, - min((chunk + 1) * CHUNK_SIZE, num_tokens), - ) - curr_hidden_states = hidden_states[begin_chunk_idx:end_chunk_idx] - tokens_in_chunk, _ = curr_hidden_states.size() - - if tokens_in_chunk == 0: - break - - if tokens_in_chunk < CHUNK_SIZE and chunk > 0: - # Adjust the intermediate cache size and config for the last - # chunk. Note that in most cases we only have one chunk - # so the cache size and config are already set correctly and - # do not need to be adjusted. - intermediate_cache1 = intermediate_cache1[:tokens_in_chunk] - intermediate_cache2 = intermediate_cache2[ - : tokens_in_chunk * topk_ids.size(1) - ] - intermediate_cache3 = intermediate_cache3[:tokens_in_chunk] - config = get_config_func(tokens_in_chunk) - - curr_topk_ids = topk_ids[begin_chunk_idx:end_chunk_idx] - curr_topk_weights = topk_weights[begin_chunk_idx:end_chunk_idx] - qcurr_hidden_states, a1q_scale = moe_kernel_quantize_input( - A=curr_hidden_states, - A_scale=a1_scale, - quant_dtype=quant_dtype, - per_act_token_quant=per_channel_quant, - block_shape=block_shape, - ocp_mx_scheme=ocp_mx_scheme, - ) + qhidden_states, a1q_scale = moe_kernel_quantize_input( + A=hidden_states, + A_scale=a1_scale, + quant_dtype=quant_dtype, + per_act_token_quant=per_channel_quant, + block_shape=block_shape, + ocp_mx_scheme=ocp_mx_scheme, + ) - # SPARSITY_FACTOR is a heuristic margin ensuring tokens_in_chunk * top_k - # activates only a small fraction of total experts - SPARSITY_FACTOR = 4 - # block quantized code path is not implemented yet. - naive_block_assignment = ( - expert_map is None - and tokens_in_chunk * top_k_num * SPARSITY_FACTOR <= global_num_experts - and not ( - (use_int8_w8a16 or use_int4_w4a16) - and block_shape is not None - and block_shape[1] > 0 - ) + # SPARSITY_FACTOR is a heuristic margin ensuring num_tokens * top_k + # activates only a small fraction of total experts + SPARSITY_FACTOR = 4 + # block quantized code path is not implemented yet. + naive_block_assignment = ( + expert_map is None + and num_tokens * top_k_num * SPARSITY_FACTOR <= global_num_experts + and not ( + (use_int8_w8a16 or use_int4_w4a16) + and block_shape is not None + and block_shape[1] > 0 ) + ) - if not naive_block_assignment: - sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size( - curr_topk_ids, - config["BLOCK_SIZE_M"], - global_num_experts, - expert_map, - ignore_invalid_experts=True, - ) - else: - max_num_tokens_padded = topk_ids.numel() * config["BLOCK_SIZE_M"] - expert_ids = curr_topk_ids.view(-1) - num_tokens_post_padded = torch.empty( - (1), dtype=torch.int32, device=topk_ids.device - ) - num_tokens_post_padded.fill_(max_num_tokens_padded) - sorted_token_ids = None - - dispatch_fused_moe_kernel( - qcurr_hidden_states, - w1, - intermediate_cache1, - a1q_scale, - w1_scale, - w1_zp, - curr_topk_weights, - sorted_token_ids, - expert_ids, - num_tokens_post_padded, - apply_router_weight_on_input, - top_k_num, - config, - compute_type=compute_type, - use_fp8_w8a8=use_fp8_w8a8, - use_int8_w8a8=use_int8_w8a8, - use_int8_w8a16=use_int8_w8a16, - use_int4_w4a16=use_int4_w4a16, - per_channel_quant=per_channel_quant, - block_shape=block_shape, - B_bias=w1_bias, + if not naive_block_assignment: + sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size( + topk_ids, + config["BLOCK_SIZE_M"], + global_num_experts, + expert_map, + ignore_invalid_experts=True, ) - - apply_moe_activation( - activation_enum, intermediate_cache2, intermediate_cache1.view(-1, N) + else: + max_num_tokens_padded = topk_ids.numel() * config["BLOCK_SIZE_M"] + expert_ids = topk_ids.view(-1) + num_tokens_post_padded = torch.empty( + (1), dtype=torch.int32, device=topk_ids.device ) + num_tokens_post_padded.fill_(max_num_tokens_padded) + sorted_token_ids = None - qintermediate_cache2, a2q_scale = moe_kernel_quantize_input( - A=intermediate_cache2, - A_scale=a2_scale, - quant_dtype=quant_dtype, - per_act_token_quant=per_channel_quant, - block_shape=block_shape, - ocp_mx_scheme=ocp_mx_scheme, - ) + dispatch_fused_moe_kernel( + qhidden_states, + w1, + intermediate_cache1, + a1q_scale, + w1_scale, + w1_zp, + topk_weights, + sorted_token_ids, + expert_ids, + num_tokens_post_padded, + apply_router_weight_on_input, + top_k_num, + config, + compute_type=compute_type, + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a8=use_int8_w8a8, + use_int8_w8a16=use_int8_w8a16, + use_int4_w4a16=use_int4_w4a16, + per_channel_quant=per_channel_quant, + block_shape=block_shape, + B_bias=w1_bias, + ) - if expert_map is not None: - intermediate_cache3.zero_() + apply_moe_activation( + activation_enum, intermediate_cache2, intermediate_cache1.view(-1, N) + ) - dispatch_fused_moe_kernel( - qintermediate_cache2, - w2, - intermediate_cache3, - a2q_scale, - w2_scale, - w2_zp, - curr_topk_weights, - sorted_token_ids, - expert_ids, - num_tokens_post_padded, - not apply_router_weight_on_input, - 1, - config, - compute_type=compute_type, - use_fp8_w8a8=use_fp8_w8a8, - use_int8_w8a8=use_int8_w8a8, - use_int8_w8a16=use_int8_w8a16, - use_int4_w4a16=use_int4_w4a16, - per_channel_quant=per_channel_quant, - block_shape=block_shape, - B_bias=w2_bias, - ) + qintermediate_cache2, a2q_scale = moe_kernel_quantize_input( + A=intermediate_cache2, + A_scale=a2_scale, + quant_dtype=quant_dtype, + per_act_token_quant=per_channel_quant, + block_shape=block_shape, + ocp_mx_scheme=ocp_mx_scheme, + ) - ops.moe_sum( - intermediate_cache3.view(*intermediate_cache3.size()), - out_hidden_states[begin_chunk_idx:end_chunk_idx], - ) + if expert_map is not None: + intermediate_cache3.zero_() + + dispatch_fused_moe_kernel( + qintermediate_cache2, + w2, + intermediate_cache3, + a2q_scale, + w2_scale, + w2_zp, + topk_weights, + sorted_token_ids, + expert_ids, + num_tokens_post_padded, + not apply_router_weight_on_input, + 1, + config, + compute_type=compute_type, + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a8=use_int8_w8a8, + use_int8_w8a16=use_int8_w8a16, + use_int4_w4a16=use_int4_w4a16, + per_channel_quant=per_channel_quant, + block_shape=block_shape, + B_bias=w2_bias, + ) + + ops.moe_sum( + intermediate_cache3.view(*intermediate_cache3.size()), + out_hidden_states, + ) return out_hidden_states @@ -1992,9 +1965,6 @@ def _supports_activation(activation: MoEActivation) -> bool: def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool: return not moe_parallel_config.use_fi_all2allv_kernels - def supports_chunking(self) -> bool: - return True - def supports_expert_map(self) -> bool: return True diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py index 8d6f716e2632..82b0a21cba93 100644 --- a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py @@ -609,9 +609,6 @@ class OAITritonExperts(BaseOAITritonExperts): def activation_format() -> mk.FusedMoEActivationFormat: return mk.FusedMoEActivationFormat.Standard - def supports_chunking(self) -> bool: - return True - def workspace_shapes( self, M: int, @@ -696,9 +693,6 @@ class UnfusedOAITritonExperts(BaseOAITritonExperts): def activation_format() -> mk.FusedMoEActivationFormat: return mk.FusedMoEActivationFormat.Standard - def supports_chunking(self) -> bool: - return True - def workspace_shapes( self, M: int, diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index d8c95727cdc6..7100c87c91c7 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -9,8 +9,6 @@ import torch -import vllm.envs as envs -from vllm.forward_context import get_forward_context, is_forward_context_available from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.activation import ( MoEActivation, @@ -24,14 +22,12 @@ ) from vllm.model_executor.layers.fused_moe.utils import ( _resize_cache, - count_expert_num_tokens, disable_inplace, ) from vllm.model_executor.layers.quantization.utils.quant_utils import ( QuantKey, ) from vllm.platforms import current_platform -from vllm.utils.math_utils import cdiv from vllm.v1.worker.ubatching import ( dbo_enabled, dbo_maybe_run_recv_hook, @@ -719,15 +715,6 @@ def g1_alphas(self) -> torch.Tensor | None: def g2_alphas(self) -> torch.Tensor | None: return self.quant_config.g2_alphas - # TODO (bnell): make this return a CHUNK_SIZE or None instead? - @abstractmethod - def supports_chunking(self) -> bool: - """ - A flag indicating whether or not this class supports activation - chunking. - """ - raise NotImplementedError - @abstractmethod def supports_expert_map(self) -> bool: """ @@ -742,11 +729,6 @@ def supports_packed_ue8m0_act_scales(self) -> bool: """ return False - def enable_chunking(self): - return ( - envs.VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING and self.supports_chunking() - ) - class FusedMoEExpertsModular(FusedMoEExperts): """ @@ -995,17 +977,6 @@ def apply( raise NotImplementedError -def _slice_scales( - scales: torch.Tensor | None, start: int, end: int -) -> torch.Tensor | None: - if scales is not None: - if scales.numel() == 1: - return scales - else: - return scales[start:end] - return None - - ################################################################################ # Kernel ################################################################################ @@ -1032,26 +1003,6 @@ def __init__( and moe_parallel_config.use_ep ) - def _chunk_info(self, M: int) -> tuple[int, int]: - """ - Compute number of chunks and chunk size for given M. - If chunking is not supported, set the CHUNK_SIZE to M so we - get num_chunks == 1. Take max(M, 1) to avoid divide by zero. - If there are no tokens to process, the number of chunks will be zero. - """ - CHUNK_SIZE = max( - 1, - ( - M - if not self.fused_experts.enable_chunking() - else min(M, envs.VLLM_FUSED_MOE_CHUNK_SIZE) - ), - ) - num_chunks = cdiv(M, CHUNK_SIZE) - # If there are no tokens, then there should be no loop iterations. - assert M > 0 or num_chunks == 0 - return num_chunks, CHUNK_SIZE - def _allocate_buffers( self, out_dtype: torch.dtype, @@ -1076,40 +1027,8 @@ def _allocate_buffers( """ assert M_full > 0 and M_chunk > 0 - num_chunks, _ = self._chunk_info(M_full) workspace_dtype = self.fused_experts.workspace_dtype(out_dtype) - # Force worst-case allocation in profiling run for - # "mk.FusedMoEKernel.Standard" formats where this is only bounded - # by `VLLM_FUSED_MOE_CHUNK_SIZE` and may not be seen during profiling with - # DP+EP due to the random token routing. - is_profile_run = ( - is_forward_context_available() - and get_forward_context().attn_metadata is None - ) - if is_profile_run and self.fused_experts.enable_chunking() and self.is_dp_ep: - max_workspace_13, max_workspace_2, max_fused_out_shape = ( - self.fused_experts.workspace_shapes( - envs.VLLM_FUSED_MOE_CHUNK_SIZE, - N, - K, - top_k, - global_num_experts, - local_num_experts, - # expert_tokens_meta help in allocating optimal/minimal - # amount of workspace. Mark it None, so we allocate for - # the worst-case scenario. - expert_tokens_meta=None, - activation=activation, - ) - ) - - current_workspace_manager().get_simultaneous( - (max_workspace_13, workspace_dtype), - (max_workspace_2, workspace_dtype), - (max_fused_out_shape, out_dtype), - ) - # Get intermediate workspace shapes based off the chunked M size. workspace13_shape, workspace2_shape, _ = self.fused_experts.workspace_shapes( M_chunk, @@ -1136,79 +1055,16 @@ def _allocate_buffers( # We can reuse the memory between cache1 and cache3 because by the # time we need cache3, we're done with cache1. - # Construct the entire output that can then be processed in chunks. - # Reuse workspace13 for the output in the non-chunked case. - # This will not always be the case for standard - # format experts and with experts that have empty workspaces. - if num_chunks == 1: - max_shape_size = max(prod(workspace13_shape), prod(fused_out_shape)) - common_workspace, workspace2 = current_workspace_manager().get_simultaneous( - ((max_shape_size,), workspace_dtype), - (workspace2_shape, workspace_dtype), - ) - workspace13 = _resize_cache(common_workspace, workspace13_shape) - fused_out = _resize_cache(common_workspace, fused_out_shape) - else: - workspace13, workspace2, fused_out = ( - current_workspace_manager().get_simultaneous( - (workspace13_shape, workspace_dtype), - (workspace2_shape, workspace_dtype), - (fused_out_shape, out_dtype), - ) - ) - - return workspace13, workspace2, fused_out - - @staticmethod - def _slice_output_tensor( - fused_out: torch.Tensor, - chunk_idx: int, - num_chunks: int, - CHUNK_SIZE: int, - M: int, - ) -> torch.Tensor: - if num_chunks == 1: - return fused_out - - assert fused_out.size(0) % M == 0, f"fused_out shape {fused_out.shape} vs M {M}" - factor = fused_out.size(0) // M - out_chunk_size = CHUNK_SIZE * factor - s = chunk_idx * out_chunk_size - e = min(s + out_chunk_size, fused_out.size(0)) - return fused_out[s:e] - - @staticmethod - def _slice_expert_tokens_metadata( - num_chunks: int, - full_expert_tokens_meta: ExpertTokensMetadata | None, - chunk_topk_ids: torch.Tensor, - local_num_experts: int, - expert_map: torch.Tensor | None, - ) -> ExpertTokensMetadata | None: - if num_chunks == 1 or full_expert_tokens_meta is None: - return full_expert_tokens_meta - - # The existing expert_num_tokens is for the entire a1q - # input. Chunking forces recomputation of the number - # of tokens assigned to each expert. - c_expert_num_tokens = count_expert_num_tokens( - chunk_topk_ids, local_num_experts, expert_map - ) - - c_expert_num_tokens_cpu = None - need_expert_num_tokens_cpu = ( - full_expert_tokens_meta.expert_num_tokens_cpu is not None + # Reuse workspace13 for the output since there is only one chunk. + max_shape_size = max(prod(workspace13_shape), prod(fused_out_shape)) + common_workspace, workspace2 = current_workspace_manager().get_simultaneous( + ((max_shape_size,), workspace_dtype), + (workspace2_shape, workspace_dtype), ) - if need_expert_num_tokens_cpu: - # This is blocking as some implementations need the count - # on the CPU to determine appropriate input/out fused-moe - # buffers - c_expert_num_tokens_cpu = c_expert_num_tokens.to("cpu", non_blocking=False) + workspace13 = _resize_cache(common_workspace, workspace13_shape) + fused_out = _resize_cache(common_workspace, fused_out_shape) - return ExpertTokensMetadata( - expert_num_tokens=c_expert_num_tokens, - expert_num_tokens_cpu=c_expert_num_tokens_cpu, - ) + return workspace13, workspace2, fused_out def _prepare( self, @@ -1318,18 +1174,6 @@ def _fused_experts( a1q, w1, w2, topk_ids ) - num_chunks, CHUNK_SIZE = self._chunk_info(M_full) - - def input_chunk_range(chunk_idx: int) -> tuple[int, int]: - if num_chunks == 1: - # Use a1q.size(0) here since batched format does not - # keep M in the first dimension. - return 0, a1q.size(0) - else: - s = chunk_idx * CHUNK_SIZE - e = min(s + CHUNK_SIZE, M_full) - return s, e - # This happens when none of the tokens from the all2all reach this # EP rank. Also, note that this is only relevant for CUDAGraph # incompatible all2all kernels like the DeepEP high-throughput @@ -1337,58 +1181,39 @@ def input_chunk_range(chunk_idx: int) -> tuple[int, int]: # low-latency kernels are always batched and can never run into # the tensor.numel() == 0 case. if M_full == 0: - assert num_chunks == 0 - workspace13 = None - workspace2 = None - fused_out = torch.empty_like(a1q, dtype=in_dtype) - else: - assert num_chunks > 0 - workspace13, workspace2, fused_out = self._allocate_buffers( - in_dtype, - a1q.device, - CHUNK_SIZE, - M_full, - N, - K, - top_k, - global_num_experts, - local_num_experts, - expert_tokens_meta, - activation, - ) + return torch.empty_like(a1q, dtype=in_dtype) - for chunk_idx in range(num_chunks): - s, e = input_chunk_range(chunk_idx) - - c_expert_tokens_meta = self._slice_expert_tokens_metadata( - num_chunks, - expert_tokens_meta, - topk_ids[s:e], - local_num_experts, - expert_map, - ) - - c_fused_out = self._slice_output_tensor( - fused_out, chunk_idx, num_chunks, CHUNK_SIZE, M_full - ) + workspace13, workspace2, fused_out = self._allocate_buffers( + in_dtype, + a1q.device, + M_full, + M_full, + N, + K, + top_k, + global_num_experts, + local_num_experts, + expert_tokens_meta, + activation, + ) - self.fused_experts.apply( - output=c_fused_out, - hidden_states=a1q[s:e], - w1=w1, - w2=w2, - topk_weights=topk_weights[s:e], - topk_ids=topk_ids[s:e], - activation=activation, - global_num_experts=global_num_experts, - expert_map=expert_map, - a1q_scale=_slice_scales(a1q_scale, s, e), - a2_scale=_slice_scales(self.fused_experts.a2_scale, s, e), - workspace13=workspace13, - workspace2=workspace2, - expert_tokens_meta=c_expert_tokens_meta, - apply_router_weight_on_input=apply_router_weight_on_input, - ) + self.fused_experts.apply( + output=fused_out, + hidden_states=a1q, + w1=w1, + w2=w2, + topk_weights=topk_weights, + topk_ids=topk_ids, + activation=activation, + global_num_experts=global_num_experts, + expert_map=expert_map, + a1q_scale=a1q_scale, + a2_scale=self.fused_experts.a2_scale, + workspace13=workspace13, + workspace2=workspace2, + expert_tokens_meta=expert_tokens_meta, + apply_router_weight_on_input=apply_router_weight_on_input, + ) return fused_out diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py index c550cad9e892..6d178d587c69 100644 --- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -337,9 +337,6 @@ def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bo def supports_expert_map(self): return True - def supports_chunking(self): - return False - def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: return TopKWeightAndReduceNoOP() diff --git a/vllm/model_executor/layers/fused_moe/trtllm_moe.py b/vllm/model_executor/layers/fused_moe/trtllm_moe.py index 5160840a2f31..fed583472ed1 100644 --- a/vllm/model_executor/layers/fused_moe/trtllm_moe.py +++ b/vllm/model_executor/layers/fused_moe/trtllm_moe.py @@ -83,9 +83,6 @@ def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bo "This method should not be called." ) - def supports_chunking(self) -> bool: - return True - def supports_expert_map(self) -> bool: return True diff --git a/vllm/model_executor/layers/fused_moe/xpu_fused_moe.py b/vllm/model_executor/layers/fused_moe/xpu_fused_moe.py index 0693a25468fd..b8d3ffec3276 100644 --- a/vllm/model_executor/layers/fused_moe/xpu_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/xpu_fused_moe.py @@ -79,9 +79,6 @@ def _supports_quant_scheme( ] return (weight_key, activation_key) in SUPPORTED_W_A - def supports_chunking(self) -> bool: - return False - def supports_expert_map(self) -> bool: return True diff --git a/vllm/model_executor/warmup/deep_gemm_warmup.py b/vllm/model_executor/warmup/deep_gemm_warmup.py index 41854b628133..0b6b3327868a 100644 --- a/vllm/model_executor/warmup/deep_gemm_warmup.py +++ b/vllm/model_executor/warmup/deep_gemm_warmup.py @@ -244,8 +244,7 @@ def _get_grouped_gemm_params( device = w1.device # Assumes all ranks have the same max_num_batched_tokens - max_tokens_across_dp = get_dp_group().world_size * max_tokens - max_tokens = min(max_tokens_across_dp, envs.VLLM_FUSED_MOE_CHUNK_SIZE) + max_tokens = get_dp_group().world_size * max_tokens # This is the maximum GroupedGemm M size that we expect to run # the grouped_gemm with.