@@ -22,6 +22,7 @@ def mm_k(
     SPLIT_K: tl.constexpr,
     CAST_TYPE: tl.constexpr,
     b_dtype: tl.constexpr,
+    base_k,
 ):
     """
     Given a_ptr and b_ptr, that identify the rows of A (m x k) and columns of
@@ -45,27 +46,57 @@ def mm_k(
     CAST_TYPE: if True, cast the values from the A matrix to the B
         matrix dtype.
     b_dtype: datatype of the B matrix
+    base_k: base offset along the K dimension for the current SPLIT_K group
     """
     accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-    for k in range(tl.cdiv(K, BLOCK_K * SPLIT_K)):
+
+    # Step size along K for each iteration
+    STEP_K = BLOCK_K * SPLIT_K
+
+    # Total number of iterations along K
+    num_iters = tl.cdiv(K, STEP_K)
+
+    for k in range(num_iters):
+        # Current iteration's global K offset
+        iter_k = k * STEP_K + base_k
+
+        # End of this block along K, used to decide whether masking is needed
+        block_end = iter_k + BLOCK_K
+
         if EVEN_K:
-            tiled_a = tl.load(a_ptr)
-            tiled_b = tl.load(b_ptr)
+            # K is divisible by BLOCK_K, so no masking is ever needed,
+            # but skip iterations whose block starts past the end of K
+            if iter_k < K:
+                tiled_a = tl.load(a_ptr)
+                tiled_b = tl.load(b_ptr)
+                if CAST_TYPE:
+                    tiled_a = tiled_a.to(b_dtype)
+                accumulator += tl.dot(tiled_a, tiled_b)
         else:
-            tiled_a = tl.load(
-                a_ptr, mask=offset_k[None, :] < K - k * (BLOCK_K * SPLIT_K), other=0
-            )
-            tiled_b = tl.load(
-                b_ptr, mask=offset_k[:, None] < K - k * (BLOCK_K * SPLIT_K), other=0
-            )
-            if CAST_TYPE:
-                tiled_a = tiled_a.to(b_dtype)
-            accumulator += tl.dot(
-                tiled_a,
-                tiled_b,
-            )
-        a_ptr += BLOCK_K * SPLIT_K * ak_stride
-        b_ptr += BLOCK_K * SPLIT_K * bk_stride
+            # Check if we need element-wise masking
+            if iter_k >= K:
+                # Entire block out of range, skip
+                pass
+            elif block_end <= K:
+                # Entire block in range, no masking needed (fast path)
+                tiled_a = tl.load(a_ptr)
+                tiled_b = tl.load(b_ptr)
+                if CAST_TYPE:
+                    tiled_a = tiled_a.to(b_dtype)
+                accumulator += tl.dot(tiled_a, tiled_b)
+            else:
+                # Partial block, needs masking (only the last iteration)
+                k_offsets = tl.arange(0, BLOCK_K)
+                mask = iter_k + k_offsets < K
+                tiled_a = tl.load(a_ptr, mask=mask[None, :], other=0.0)
+                tiled_b = tl.load(b_ptr, mask=mask[:, None], other=0.0)
+                if CAST_TYPE:
+                    tiled_a = tiled_a.to(b_dtype)
+                accumulator += tl.dot(tiled_a, tiled_b)
+
+        a_ptr += STEP_K * ak_stride
+        b_ptr += STEP_K * bk_stride
+
     return accumulator


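To make the new control flow concrete, here is a minimal pure-Python model of the loop above (illustrative only; `simulate_mm_k` is not part of the diff). For each split-K program it prints which K-range every iteration touches and whether it takes the skip, fast, or masked path:

```python
import math

def simulate_mm_k(K: int, BLOCK_K: int, SPLIT_K: int) -> None:
    """Model the per-iteration bookkeeping of mm_k (no actual math)."""
    STEP_K = BLOCK_K * SPLIT_K
    num_iters = math.ceil(K / STEP_K)
    for pid_sk in range(SPLIT_K):
        base_k = pid_sk * BLOCK_K  # mirrors do_shrink_kernel's base_k
        for k in range(num_iters):
            iter_k = k * STEP_K + base_k
            block_end = iter_k + BLOCK_K
            if iter_k >= K:
                path = "skip (fully out of range)"
            elif block_end <= K:
                path = "fast (full block, no mask)"
            else:
                path = "masked (partial tail block)"
            print(f"pid_sk={pid_sk} iter={k}: "
                  f"K range [{iter_k}, {min(block_end, K)}) -> {path}")

simulate_mm_k(K=40, BLOCK_K=16, SPLIT_K=2)
```

With these sizes the two programs cover [0, 16) ∪ [32, 40) and [16, 32): disjoint blocks whose union is all of [0, K), with masking only on the final partial block.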
@@ -168,6 +199,7 @@ def do_expand_kernel(
         SPLIT_K,
         CAST_TYPE,
         cur_lora_ptr.dtype.element_ty,
+        base_k=0,
     )

     tiled_c = accumulator.to(cur_lora_ptr.dtype.element_ty)
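The expand call passes `base_k=0`, so `iter_k` reduces to `k * STEP_K` and the new mask predicate matches the one the deleted code computed. A quick plain-Python equivalence check (illustrative; it assumes the old `offset_k` spanned `tl.arange(0, BLOCK_K)`):

```python
# Hypothetical sizes, chosen so K is not a multiple of BLOCK_K * SPLIT_K.
K, BLOCK_K, SPLIT_K = 40, 16, 2
STEP_K = BLOCK_K * SPLIT_K
for k in range((K + STEP_K - 1) // STEP_K):
    for j in range(BLOCK_K):
        old_mask = j < K - k * STEP_K        # deleted code's predicate
        new_mask = (k * STEP_K + 0 + j) < K  # iter_k + k_offsets with base_k = 0
        assert old_mask == new_mask
print("expand path: old and new masks agree for base_k=0")
```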
@@ -272,6 +304,7 @@ def do_shrink_kernel(
         SPLIT_K,
         False,
         cur_lora_ptr.dtype.element_ty,
+        base_k=pid_sk * BLOCK_K,
     )

     # Identify the C output pointers to store the results of the accumulator.
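In the shrink kernel each split-K program starts at `base_k = pid_sk * BLOCK_K`, interleaving the programs along K. A small plain-Python sanity check (illustrative, not part of the diff) that the programs together touch every K index exactly once, including the masked tail:

```python
# Hypothetical sizes where the tail block is partial and one block is skipped.
K, BLOCK_K, SPLIT_K = 100, 16, 4
STEP_K = BLOCK_K * SPLIT_K
counts = [0] * K
for pid_sk in range(SPLIT_K):
    base_k = pid_sk * BLOCK_K
    for k in range((K + STEP_K - 1) // STEP_K):
        iter_k = k * STEP_K + base_k
        for j in range(BLOCK_K):
            if iter_k + j < K:  # same predicate as the masked load
                counts[iter_k + j] += 1
assert all(c == 1 for c in counts)
print("shrink path: every K index accumulated exactly once across programs")
```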