Merged
15 changes: 12 additions & 3 deletions python/sglang/srt/layers/quantization/fp8.py
@@ -799,9 +799,18 @@ def process_weights_after_loading(self, layer: Module) -> None:
layer.w13_weight[expert_id][start : start + shard_size, :],
layer.w13_weight_scale[expert_id][shard_id],
)
layer.w13_weight[expert_id][start : start + shard_size, :], _ = (
ops.scaled_fp8_quant(dq_weight, max_w13_scales[expert_id])
)
if _is_cuda:
(
layer.w13_weight[expert_id][start : start + shard_size, :],
_,
) = sgl_scaled_fp8_quant(dq_weight, max_w13_scales[expert_id])
else:
(
layer.w13_weight[expert_id][start : start + shard_size, :],
_,
) = vllm_ops.scaled_fp8_quant(
dq_weight, max_w13_scales[expert_id]
)
start += shard_size

layer.w13_weight_scale = torch.nn.Parameter(
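Reviewer note on the hunk above: each dequantized w13 expert shard is re-quantized to FP8 with sgl_kernel's `sgl_scaled_fp8_quant` on CUDA and with vLLM's custom op otherwise. A minimal standalone sketch of that dispatch follows. The `requantize_shard` helper and the `sgl_kernel` import path are assumptions for illustration (the diff only shows the call sites); `vllm._custom_ops.scaled_fp8_quant` returning a `(tensor, scale)` pair matches the usage in the hunk.

```python
import torch

try:
    # NOTE: assumed import path; the diff only shows the sgl_scaled_fp8_quant call site.
    from sgl_kernel import scaled_fp8_quant as sgl_scaled_fp8_quant
    _HAS_SGL_KERNEL = True
except ImportError:
    _HAS_SGL_KERNEL = False

try:
    from vllm import _custom_ops as vllm_ops
    _HAS_VLLM = True
except ImportError:
    _HAS_VLLM = False


def requantize_shard(dq_weight: torch.Tensor, scale: torch.Tensor, use_cuda_kernel: bool) -> torch.Tensor:
    """Requantize a dequantized weight shard to FP8; both backends return (quantized, scale)."""
    if use_cuda_kernel and _HAS_SGL_KERNEL:
        quantized, _ = sgl_scaled_fp8_quant(dq_weight, scale)
    elif _HAS_VLLM:
        quantized, _ = vllm_ops.scaled_fp8_quant(dq_weight, scale)
    else:
        raise RuntimeError("neither sgl_kernel nor vllm is available for FP8 requantization")
    return quantized
```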
143 changes: 73 additions & 70 deletions python/sglang/srt/layers/quantization/fp8_utils.py
@@ -15,6 +15,13 @@
is_hip,
)

try:
import vllm

VLLM_AVAILABLE = True
except ImportError:
VLLM_AVAILABLE = False

use_vllm_cutlass_w8a8_fp8_kernel = get_bool_env_var("USE_VLLM_CUTLASS_W8A8_FP8_KERNEL")

_is_hip = is_hip()
@@ -27,13 +34,8 @@

from sglang.srt.layers.quantization.fp8_kernel import sglang_per_token_quant_fp8

if use_vllm_cutlass_w8a8_fp8_kernel:
try:
from vllm import _custom_ops as ops

VLLM_AVAILABLE = True
except ImportError:
VLLM_AVAILABLE = False
if use_vllm_cutlass_w8a8_fp8_kernel and VLLM_AVAILABLE:
from vllm import _custom_ops as ops
else:
from sgl_kernel import fp8_scaled_mm
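
The hunk above hoists the vLLM probe to module import time, so `VLLM_AVAILABLE` is always defined and `USE_VLLM_CUTLASS_W8A8_FP8_KERNEL` only takes effect when vLLM can actually be imported; otherwise sgl_kernel's `fp8_scaled_mm` is used. A stripped-down, illustrative version of that selection (the real module reads the flag through sglang's `get_bool_env_var`, not `os.environ`, and assumes sgl_kernel is installed on the fallback path):

```python
import os

try:
    import vllm  # noqa: F401  # probe only; the flag stays defined either way
    VLLM_AVAILABLE = True
except ImportError:
    VLLM_AVAILABLE = False

use_vllm_cutlass_w8a8_fp8_kernel = os.environ.get(
    "USE_VLLM_CUTLASS_W8A8_FP8_KERNEL", "false"
).lower() in ("1", "true")

if use_vllm_cutlass_w8a8_fp8_kernel and VLLM_AVAILABLE:
    from vllm import _custom_ops as ops  # vLLM's cutlass-backed w8a8 FP8 GEMM
else:
    from sgl_kernel import fp8_scaled_mm  # sgl_kernel path used by default
```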

@@ -253,68 +255,69 @@ def apply_fp8_linear(

# torch.scaled_mm supports per tensor weights + activations only
# so fallback to naive if per channel or per token
per_tensor_weights = weight_scale.numel() == 1
per_tensor_activations = x_scale.numel() == 1

if per_tensor_weights and per_tensor_activations:
# Fused GEMM_DQ
output = torch._scaled_mm(
qinput,
weight,
out_dtype=input.dtype,
scale_a=x_scale,
scale_b=weight_scale,
bias=bias,
)
# A fix for discrepancy in scaled_mm which returns tuple
# for torch < 2.5 and a single value in torch >= 2.5
if type(output) is tuple and len(output) == 2:
output = output[0]
else:
per_tensor_weights = weight_scale.numel() == 1
per_tensor_activations = x_scale.numel() == 1

if per_tensor_weights and per_tensor_activations:
# Fused GEMM_DQ
output = torch._scaled_mm(
qinput,
weight,
out_dtype=input.dtype,
scale_a=x_scale,
scale_b=weight_scale,
bias=bias,
)
# A fix for discrepancy in scaled_mm which returns tuple
# for torch < 2.5 and a single value in torch >= 2.5
if type(output) is tuple and len(output) == 2:
output = output[0]

return torch.narrow(output, 0, 0, input_2d.shape[0]).view(*output_shape)
return torch.narrow(output, 0, 0, input_2d.shape[0]).view(*output_shape)

else:
# Fallback for channelwise case, where we use unfused DQ
# due to limitations with scaled_mm

# Symmetric quantized GEMM by definition computes the following:
# C = (s_x * X) (s_w * W) + bias
# This is equivalent to dequantizing the weights and activations
# before applying a GEMM.
#
# In order to compute quantized operands, a quantized kernel
# will rewrite the above like so:
# C = s_w * s_x * (X * W) + bias
#
# For the scaled_mm fallback case, we break this down, since it
# does not support s_w being a vector.

# Making sure the dummy tensor is on the same device as the weight
global TORCH_DEVICE_IDENTITY
if TORCH_DEVICE_IDENTITY.device != weight.device:
TORCH_DEVICE_IDENTITY = TORCH_DEVICE_IDENTITY.to(weight.device)

# GEMM
# This computes C = (X * W).
# Output in fp32 to allow subsequent ops to happen in-place
output = torch._scaled_mm(
qinput,
weight,
scale_a=TORCH_DEVICE_IDENTITY,
scale_b=TORCH_DEVICE_IDENTITY,
out_dtype=torch.float32,
)
# A fix for discrepancy in scaled_mm which returns tuple
# for torch < 2.5 and a single value in torch >= 2.5
if type(output) is tuple and len(output) == 2:
output = output[0]
# Unpad (undo num_token_padding)
output = torch.narrow(output, 0, 0, input_2d.shape[0])
x_scale = torch.narrow(x_scale, 0, 0, input_2d.shape[0])

# DQ
# C = sw * sx * (X * W) + bias
output = output * x_scale * weight_scale.t()
if bias is not None:
output = output + bias
return output.to(dtype=input.dtype).view(*output_shape)
else:
# Fallback for channelwise case, where we use unfused DQ
# due to limitations with scaled_mm

# Symmetric quantized GEMM by definition computes the following:
# C = (s_x * X) (s_w * W) + bias
# This is equivalent to dequantizing the weights and activations
# before applying a GEMM.
#
# In order to compute quantized operands, a quantized kernel
# will rewrite the above like so:
# C = s_w * s_x * (X * W) + bias
#
# For the scaled_mm fallback case, we break this down, since it
# does not support s_w being a vector.

# Making sure the dummy tensor is on the same device as the weight
global TORCH_DEVICE_IDENTITY
if TORCH_DEVICE_IDENTITY.device != weight.device:
TORCH_DEVICE_IDENTITY = TORCH_DEVICE_IDENTITY.to(weight.device)

# GEMM
# This computes C = (X * W).
# Output in fp32 to allow subsequent ops to happen in-place
output = torch._scaled_mm(
qinput,
weight,
scale_a=TORCH_DEVICE_IDENTITY,
scale_b=TORCH_DEVICE_IDENTITY,
out_dtype=torch.float32,
)
# A fix for discrepancy in scaled_mm which returns tuple
# for torch < 2.5 and a single value in torch >= 2.5
if type(output) is tuple and len(output) == 2:
output = output[0]
# Unpad (undo num_token_padding)
output = torch.narrow(output, 0, 0, input_2d.shape[0])
x_scale = torch.narrow(x_scale, 0, 0, input_2d.shape[0])

# DQ
# C = sw * sx * (X * W) + bias
output = output * x_scale * weight_scale.t()
if bias is not None:
output = output + bias
return output.to(dtype=input.dtype).view(*output_shape)
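
The comments in the fallback branch above explain why the channelwise case is unfused: `torch._scaled_mm` cannot take a per-channel weight scale, so the GEMM runs with identity scales in fp32 and `C = s_x * s_w * (Xq @ Wq) + bias` is applied afterwards. A small pure-PyTorch check of that algebra (no FP8 kernels involved; the int8-style quantization below is only for illustration):

```python
import torch

torch.manual_seed(0)
X = torch.randn(4, 8)       # activations: (num_tokens, in_features)
W = torch.randn(8, 16)      # weights:     (in_features, out_features)
bias = torch.randn(16)

# Per-token activation scales and per-output-channel weight scales.
s_x = X.abs().amax(dim=1, keepdim=True) / 127.0   # shape (4, 1)
s_w = W.abs().amax(dim=0, keepdim=True) / 127.0   # shape (1, 16)

# "Quantize" (kept in float for simplicity): X ~= Xq * s_x, W ~= Wq * s_w.
Xq = torch.round(X / s_x)
Wq = torch.round(W / s_w)

# Unfused dequant: GEMM on the quantized operands first, both scales applied afterwards.
C = (Xq @ Wq) * s_x * s_w + bias

# Only quantization rounding error remains relative to the full-precision GEMM.
print((C - (X @ W + bias)).abs().max())
```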
16 changes: 10 additions & 6 deletions python/sglang/srt/layers/quantization/gptq.py
@@ -6,7 +6,6 @@

from sglang.srt.layers.linear import LinearBase
from sglang.srt.layers.quantization.base_config import QuantizationConfig
from sglang.srt.layers.quantization.utils import scalar_types
from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
from sglang.srt.utils import is_cuda

@@ -133,11 +132,16 @@ def get_quant_method(
class GPTQMarlinConfig(QuantizationConfig):
"""Config class for GPTQ Marlin"""

# (num_bits, is_sym) -> quant_type
TYPE_MAP = {
(4, True): scalar_types.uint4b8,
(8, True): scalar_types.uint8b128,
}
if VLLM_AVAILABLE:
from vllm.scalar_type import scalar_types

# (num_bits, is_sym) -> quant_type
TYPE_MAP = {
(4, True): scalar_types.uint4b8,
(8, True): scalar_types.uint8b128,
}
else:
raise ImportError("vllm is not installed")

def __init__(
self,
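The gptq.py change above drops the local `scalar_types` import and builds GPTQ Marlin's `TYPE_MAP` from `vllm.scalar_type.scalar_types` only when vLLM is importable, raising `ImportError` at class-definition time otherwise. A hedged sketch of the same mapping with the lookup made explicit; `marlin_quant_type` and the empty-map fallback are hypothetical additions for illustration, while the `(num_bits, is_sym)` keys and `scalar_types.uint4b8` / `uint8b128` values mirror the diff:

```python
try:
    from vllm.scalar_type import scalar_types

    # (num_bits, is_sym) -> Marlin quant type, as in the diff.
    TYPE_MAP = {
        (4, True): scalar_types.uint4b8,
        (8, True): scalar_types.uint8b128,
    }
except ImportError:
    TYPE_MAP = {}


def marlin_quant_type(num_bits: int, is_sym: bool):
    """Resolve a Marlin scalar type; fails only when GPTQ Marlin is actually requested."""
    if not TYPE_MAP:
        raise ImportError("vllm is required for GPTQ Marlin scalar types")
    try:
        return TYPE_MAP[(num_bits, is_sym)]
    except KeyError:
        raise ValueError(f"unsupported GPTQ Marlin config: bits={num_bits}, sym={is_sym}")
```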