python/sglang/srt/models/deepseek_v2.py (4 additions, 1 deletion)

@@ -2104,8 +2104,11 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]], is_nextn=Fal
                         ):
                             q_a_proj_weight = cached_a_proj[q_a_proj_name]
                             kv_a_proj_weight = cached_a_proj[kv_a_proj_name]
+                            cat_dim = 0
+                            if self.quant_config.get_name() == "awq" or self.quant_config.get_name() == "moe_wna16":
+                                cat_dim = 1
Reviewer comment (severity: medium):

The condition repeats the self.quant_config.get_name() call. Consider extracting it to a variable for better readability and maintainability:

                            is_awq_or_moe_wna16 = self.quant_config.get_name() == "awq" or self.quant_config.get_name() == "moe_wna16"
                            cat_dim = 0
                            if is_awq_or_moe_wna16:
                                cat_dim = 1

                             fused_weight = torch.cat(
-                                [q_a_proj_weight, kv_a_proj_weight], dim=0
+                                [q_a_proj_weight, kv_a_proj_weight], dim=cat_dim
                             )

                             param_name = name.replace(
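Why the concatenation dimension flips here: an unquantized nn.Linear weight is stored as [out_features, in_features], so fusing q_a_proj and kv_a_proj (which share the same input) concatenates along dim 0, while AWQ-style packed qweight tensors are conventionally stored as [in_features, out_features // pack_factor], putting the output dimension at dim 1. A rough sketch of that shape reasoning; the sizes are illustrative, not the actual DeepSeek dimensions, and the [in, out // 8] layout is the usual AWQ convention rather than something stated in this diff:

    import torch

    hidden = 512                      # illustrative, not the real hidden size
    q_rank, kv_rank = 128, 64         # illustrative per-projection output sizes
    pack = 8                          # AWQ packs eight 4-bit weights per int32

    # Unquantized nn.Linear weights: [out_features, in_features] -> fuse along dim 0.
    q_a = torch.empty(q_rank, hidden)
    kv_a = torch.empty(kv_rank, hidden)
    fused = torch.cat([q_a, kv_a], dim=0)        # [192, 512]

    # AWQ / moe_wna16 packed qweights: [in_features, out_features // pack] -> fuse along dim 1.
    q_a_q = torch.empty(hidden, q_rank // pack, dtype=torch.int32)
    kv_a_q = torch.empty(hidden, kv_rank // pack, dtype=torch.int32)
    fused_q = torch.cat([q_a_q, kv_a_q], dim=1)  # [512, 24]

That is why the patch keeps dim=0 as the default and switches to dim=1 only when quant_config reports awq or moe_wna16.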
sgl-kernel/csrc/gemm/awq_kernel.cu (8 additions, 6 deletions)

@@ -130,11 +130,13 @@ __global__ void __launch_bounds__(256) dequantize_weights(
     int* __restrict__ qzeros,
     OutputT* __restrict__ output,
     int group_size,
-    int qweight_cols) {
+    int qweight_cols,
+    int qweight_rows) {
Reviewer comment (severity: medium):

Adding qweight_rows as an argument to the dequantize_weights kernel is a good step toward correct dequantization. However, it is crucial to verify that qweight_rows is actually used within the kernel to influence the dequantization; if it is not used, it is an unnecessary argument.

 #if CUDA_VERSION >= 12000
   int col = blockIdx.x * blockDim.x + threadIdx.x;
   int row = blockIdx.y * blockDim.y + threadIdx.y;

+  if (col >= qweight_cols || row >= qweight_rows) return;
Reviewer comment (severity: medium):

The check if (col >= qweight_cols || row >= qweight_rows) return; is important to prevent out-of-bounds access. Consider adding an assertion or a log message (if the condition is met unexpectedly) to help debug potential issues with the kernel launch configuration.
   int group_idx = row / group_size;
   int scale_offset = 8 * col + group_idx * qweight_cols * 8;
   uint4 loaded_scale = *(uint4*)(scales + scale_offset);
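On the reviewer's question about whether qweight_rows is used: in the hunk above it only feeds the bounds guard, since the per-element math needs just row / group_size and the column offsets. A rough PyTorch reference of the dequantization the kernel performs, assuming the usual AWQ layout (eight 4-bit weights packed per int32 along the output dimension) and ignoring AWQ's interleaved nibble ordering for brevity; the test file's own reference implementation may differ in these details:

    import torch

    def awq_dequantize_ref(qweight, scales, qzeros, group_size):
        # qweight: [rows, cols] int32; scales: [rows // group_size, cols * 8];
        # qzeros: [rows // group_size, cols] int32, packed like qweight.
        rows, cols = qweight.shape
        shifts = torch.arange(0, 32, 4, device=qweight.device)           # 8 nibbles per int32
        w = (qweight.unsqueeze(-1) >> shifts) & 0xF                      # [rows, cols, 8]
        z = (qzeros.unsqueeze(-1) >> shifts) & 0xF                       # [rows // g, cols, 8]
        group = torch.arange(rows, device=qweight.device) // group_size  # row -> group index
        s = scales.view(rows // group_size, cols, 8)
        out = (w.float() - z.float()[group]) * s[group].float()
        return out.reshape(rows, cols * 8)

Each row contributes one output row regardless of the total row count, so qweight_rows matters only for deciding which (row, col) threads have real work to do, which is exactly what the new early return handles.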
@@ -188,8 +190,8 @@ torch::Tensor awq_dequantize(torch::Tensor qweight, torch::Tensor scales, torch:

   int x_num_threads = 16;
   int y_num_threads = 16;
-  int x_blocks = qweight_cols / x_num_threads;
-  int y_blocks = qweight_rows / y_num_threads;
+  int x_blocks = (qweight_cols + x_num_threads - 1) / x_num_threads;
+  int y_blocks = (qweight_rows + y_num_threads - 1) / y_num_threads;
Reviewer comment on lines +193 to +194 (severity: medium):

The calculation of x_blocks and y_blocks uses ceiling division to ensure all elements are processed, which is good for correctness. However, make sure the kernel handles the case where qweight_cols or qweight_rows is not a multiple of x_num_threads or y_num_threads, and consider adding a comment explaining this.


const at::cuda::OptionalCUDAGuard device_guard(device_of(qweight));

@@ -207,12 +209,12 @@ torch::Tensor awq_dequantize(torch::Tensor qweight, torch::Tensor scales, torch:
     auto _scales = reinterpret_cast<half*>(scales.data_ptr<at::Half>());
     auto _output = reinterpret_cast<half*>(output.data_ptr<at::Half>());
     dequantize_weights<half>
-        <<<num_blocks, threads_per_block, 0, stream>>>(_qweight, _scales, _zeros, _output, group_size, qweight_cols);
+        <<<num_blocks, threads_per_block, 0, stream>>>(_qweight, _scales, _zeros, _output, group_size, qweight_cols, qweight_rows);
Reviewer comment (severity: high):

It's good that qweight_rows is being passed to the dequantize_weights kernel. Double-check that the kernel uses this value to ensure that dequantization is performed correctly for the new DeepSeek V2 models.

   } else {
     auto _scales = reinterpret_cast<__nv_bfloat16*>(scales.data_ptr<at::BFloat16>());
     auto _output = reinterpret_cast<__nv_bfloat16*>(output.data_ptr<at::BFloat16>());
     dequantize_weights<__nv_bfloat16>
-        <<<num_blocks, threads_per_block, 0, stream>>>(_qweight, _scales, _zeros, _output, group_size, qweight_cols);
+        <<<num_blocks, threads_per_block, 0, stream>>>(_qweight, _scales, _zeros, _output, group_size, qweight_cols, qweight_rows);
Reviewer comment (severity: high):

Same as above: double-check that the kernel uses qweight_rows so that dequantization is performed correctly for this bfloat16 launch as well.

   }

   return output;
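A quick way to exercise the fixed launch path end to end on a shape that is not a multiple of 16. This is a hypothetical smoke test: the Python binding name sgl_kernel.awq_dequantize, its (qweight, scales, qzeros) argument order, and the [rows, cols * 8] output shape are assumptions inferred from the C++ signature above, not confirmed by this diff:

    import torch
    from sgl_kernel import awq_dequantize   # assumed binding; see note above

    group_size = 64
    rows, packed_cols = 128, 24              # 24 columns: deliberately not a multiple of 16
    qweight = torch.randint(0, torch.iinfo(torch.int32).max, (rows, packed_cols),
                            dtype=torch.int32, device="cuda")
    scales = torch.rand(rows // group_size, packed_cols * 8,
                        dtype=torch.float16, device="cuda")
    qzeros = torch.randint(0, torch.iinfo(torch.int32).max, (rows // group_size, packed_cols),
                           dtype=torch.int32, device="cuda")

    out = awq_dequantize(qweight, scales, qzeros)
    print(out.shape)                         # expected torch.Size([128, 192])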
sgl-kernel/tests/test_awq_dequant.py (4 additions, 3 deletions)

@@ -67,8 +67,10 @@ def sglang_awq_dequantize(
     "qweight_row,qweight_col,is_bf16_act",
     list(
         itertools.product(
-            [3584, 18944, 128, 256, 512, 1024],
-            [448, 576, 4736, 16, 32, 64, 128],
+            # [7168, 7168, 7168, 128, 128],
+            # [264, 192, 72, 16, 24],
Reviewer comment (severity: medium):

The commented-out test parameters suggest that larger values were previously used. It would be beneficial to re-enable these larger values in the test suite to ensure that the fix is robust across a wider range of input sizes. If there is a reason they were commented out (e.g., resource constraints), it should be documented.

+            [128, 128, 128, 128],
+            [16, 24, 32, 40],
             [True, False],
         )
     ),
@@ -77,7 +79,6 @@ def test_awq_dequant_compare_implementations(
     qweight_row: int, qweight_col: int, is_bf16_act: bool
 ):
     device = torch.device("cuda")
-
     qweight = torch.randint(
         0,
         torch.iinfo(torch.int32).max,