diff --git a/vllm/envs.py b/vllm/envs.py index d7956a2adff6..45f3a4b9a4c3 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -481,6 +481,17 @@ def get_vllm_port() -> Optional[int]: lambda: bool(int(os.environ["VLLM_USE_FLASHINFER_SAMPLER"])) if "VLLM_USE_FLASHINFER_SAMPLER" in os.environ else None, + # If set, vllm will force the use of the Triton compressed-tensors MOE kernel; + # otherwise it will use the cutlass based one. + "VLLM_TRITON_COMPRESSED_TENSORS_MOE_KERNEL": + lambda: bool( + int(os.getenv("VLLM_TRITON_COMPRESSED_TENSORS_MOE_KERNEL", "0"))), + + # If set, vllm will force flashinfer to use tensor cores; + # otherwise it will use a heuristic based on the model architecture. + "VLLM_FLASHINFER_FORCE_TENSOR_CORES": + lambda: bool(int(os.getenv("VLLM_FLASHINFER_FORCE_TENSOR_CORES", "0"))), + # Pipeline stage partition strategy "VLLM_PP_LAYER_PARTITION": lambda: os.getenv("VLLM_PP_LAYER_PARTITION", None), diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index c2b884c058d3..87d611b67999 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -523,8 +523,10 @@ def __init__( # cutlass path self.is_fp8_w8a8_sm100 = quant_config._is_fp8_w8a8_sm100( self.weight_quant, self.input_quant) - self.use_cutlass = (quant_config._is_fp8_w8a8_sm90( - self.weight_quant, self.input_quant) or self.is_fp8_w8a8_sm100) + self.use_cutlass = ((quant_config._is_fp8_w8a8_sm90( + self.weight_quant, self.input_quant) or self.is_fp8_w8a8_sm100) and + not envs.VLLM_TRITON_COMPRESSED_TENSORS_MOE_KERNEL) + self.fused_experts = None # type: ignore[assignment] self.disable_expert_map = False def create_weights(self, layer: torch.nn.Module, num_experts: int,