
[DRAFT] use cutlass for 24 #33

Draft · wants to merge 53 commits into base: main
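This draft wires vLLM's CUTLASS build up to 2:4 (semi-structured) sparse GEMM. For orientation, here is a minimal, unverified sketch of how the new ops.cutlass_scaled_sparse_mm op is exercised by the benchmark code added in this PR; the shapes are arbitrary, and make_rand_sparse_tensors is a benchmark-local helper from utils.py that is not shown in this diff.

```python
import torch

import vllm._custom_ops as ops
from utils import make_rand_sparse_tensors  # benchmark-local helper (utils.py)

# Compressed 2:4 operand, its sparsity metadata, and the dense reference tensors.
a_compressed, e, a, b = make_rand_sparse_tensors(torch.int8, 16, 4096, 512)

scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)

# Scaled sparse matmul with a bfloat16 output (the *_bias benchmark variants
# below additionally append a bias tensor).
out = ops.cutlass_scaled_sparse_mm(a_compressed, e, b, scale_a, scale_b,
                                   torch.bfloat16)
```

Note that in the fp8 benchmark below the compressed operand is the weight (b_compressed) and the dense activation is passed transposed (aT), so argument order depends on which side is sparse.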
53 commits
5d51361
Add cutlass 2:4 infrastructure
Faraz9877 Oct 22, 2024
17f5b96
Update with test code
Faraz9877 Oct 28, 2024
471a03c
Clean up a bit; both fp8 and int8 working
Faraz9877 Oct 30, 2024
0b332fb
Add fp16 and bf16 support to sparse cutlass mm
Faraz9877 Oct 30, 2024
da31648
semi_structured for fp16 and bf16 and int8
ilmarkov Oct 1, 2024
e655f94
Fix A100 int8 tests
ilmarkov Oct 2, 2024
5fc3c1c
Add fp8 cusparseLt
ilmarkov Oct 9, 2024
9cf36d6
wip
ilmarkov Oct 9, 2024
ad09e79
Fix signatures
ilmarkov Oct 9, 2024
e75eabc
Fix compilation and tests
ilmarkov Oct 13, 2024
0306390
Update for older platforms
ilmarkov Oct 15, 2024
1021acb
Add benchmarks
ilmarkov Oct 16, 2024
19ce358
Fix typo
ilmarkov Oct 23, 2024
959408c
Added scaled_mm for fp8.
ilmarkov Oct 24, 2024
117b87b
Add docstrings
ilmarkov Oct 28, 2024
2c7e68e
Update for torch 2.5
ilmarkov Oct 30, 2024
922f4f8
Add handling contiguous dense input for int8 and fp8
ilmarkov Oct 30, 2024
beca038
Add fp8 cusparseLt
ilmarkov Oct 9, 2024
5d9cd25
Fix compilation and tests
ilmarkov Oct 13, 2024
39ad9d4
Add caching of cusparseLT meta
ilmarkov Oct 23, 2024
520eb62
Cached cusparseLt
ilmarkov Oct 25, 2024
20956e6
Fix destroy function
ilmarkov Oct 25, 2024
87c8088
Prepare for reproduce
ilmarkov Oct 25, 2024
4ea58b1
Fix cusparseLt caching
ilmarkov Oct 30, 2024
f0551ef
Make cached version default function
ilmarkov Nov 5, 2024
d7476e8
Fixes and polishing after rebase
ilmarkov Nov 6, 2024
681ea5e
add sparse 2:4 weight loading support
dsikka Oct 23, 2024
ecf878f
Some more changes!
rahul-tuli Oct 29, 2024
80952dc
Cleanup
rahul-tuli Oct 31, 2024
8462c9d
get uncompressed to work; update gemm to use contiguous; use alex's u…
dsikka Nov 1, 2024
0a3e506
patch
dsikka Nov 4, 2024
2e28972
use our decompressor
dsikka Nov 4, 2024
28f0abb
Some more work
rahul-tuli Nov 6, 2024
c7a97a8
Use new scaled_T function
rahul-tuli Nov 7, 2024
ccadad0
Add multiprocessing for kernel sweep benchmarking
Faraz9877 Nov 8, 2024
807737c
Add multi-GPU
Faraz9877 Nov 8, 2024
04c19a5
Add cutlass_scaled_sparse_mm op
Faraz9877 Nov 14, 2024
2a85c5a
Clean up
Faraz9877 Nov 14, 2024
1b381c9
Update code
Faraz9877 Nov 14, 2024
4e31076
Update code
Faraz9877 Nov 14, 2024
13fccf4
Clean up the benchmarking
Faraz9877 Nov 14, 2024
b345cc8
Clean up the cutlass benchmarking
Faraz9877 Nov 14, 2024
2d03e1d
Fix cmake errors
Faraz9877 Nov 14, 2024
e9439cc
Fix the cmake TAG
Faraz9877 Nov 14, 2024
4ba7c0f
Merge branch 'buildable' into rahul-quant-merged-rs
robertgshaw2-neuralmagic Nov 15, 2024
f74ef37
update
robertgshaw2-neuralmagic Nov 15, 2024
f5bc9eb
fixed
robertgshaw2-neuralmagic Nov 15, 2024
1316076
updated
robertgshaw2-neuralmagic Nov 15, 2024
4d2b12c
updated
robertgshaw2-neuralmagic Nov 15, 2024
fe30b53
updated, calling things properly
robertgshaw2-neuralmagic Nov 15, 2024
4c61b19
running end to end but not passing
robertgshaw2-neuralmagic Nov 15, 2024
86716f8
updated
robertgshaw2-neuralmagic Nov 15, 2024
c796ac8
Some cleanup
rahul-tuli Nov 15, 2024
26 changes: 17 additions & 9 deletions CMakeLists.txt
@@ -202,18 +202,18 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")

# Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
- set(CUTLASS_REVISION "v3.5.1" CACHE STRING "CUTLASS revision to use")
+ set(CUTLASS_REVISION "v3.6.0" CACHE STRING "CUTLASS revision to use")

FetchContent_Declare(
cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
- GIT_TAG v3.5.1
+ GIT_TAG 8aa95dbb888be6d81c6fbf7169718c5244b53227
GIT_PROGRESS TRUE

# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
# Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags.
# So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
- GIT_SHALLOW TRUE
+ # GIT_SHALLOW FALSE
)
FetchContent_MakeAvailable(cutlass)

@@ -225,7 +225,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/quantization/gguf/gguf_kernel.cu"
"csrc/custom_all_reduce.cu"
"csrc/permute_cols.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu")
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
"csrc/sparse/cutlass/sparse_compressor.cu")

set_gencode_flags_for_srcs(
SRCS "${VLLM_EXT_SRC}"
@@ -255,11 +257,14 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()

#
- # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
+ # The cutlass_scaled_mm cutlass_scaled_sparse_mm, and cutlass_compressor kernels
+ # For Hopper (c3x, i.e. CUTLASS 3.x) require
# CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
- set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
+ set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
+ "csrc/sparse/cutlass/sparse_compressor.cu"
+ "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
@@ -268,12 +273,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
message(STATUS "Not building cutlass_c3x kernels as CUDA Compiler version is "
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
"later if you intend on running FP8 quantized models on "
"later if you intend on running FP8 quantized models or sparse on "
"Hopper.")
else()
message(STATUS "Not building scaled_mm_c3x as no compatible archs found "
message(STATUS "Not building cutlass_c3x as no compatible archs found "
"in CUDA target architectures")
endif()

@@ -398,6 +403,9 @@ define_gpu_extension_target(
# Setting this variable sidesteps the issue by calling the driver directly.
target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)

+ # include(nm_cutlass_c.cmake)
+ # build_nm_cutlass_c()
+
#
# _moe_C extension
#
311 changes: 311 additions & 0 deletions benchmarks/cutlass_benchmarks/sparse_mm/bench_v1.py
@@ -0,0 +1,311 @@
## Cutlass benchmark V1

from typing import Callable, Iterable

import torch
import torch.utils.benchmark as TBenchmark
from torch.utils.benchmark import Measurement as TMeasurement
from utils import make_rand_sparse_tensors

import vllm._custom_ops as ops


# bench
def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args,
**kwargs) -> TMeasurement:
min_run_time = 1

globals = {
"args": args,
"kwargs": kwargs,
"fn": fn,
}
return TBenchmark.Timer(
stmt="fn(*args, **kwargs)",
globals=globals,
label=label,
sub_label=sub_label,
description=description,
).blocked_autorange(min_run_time=min_run_time)


def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
sub_label: str) -> Iterable[TMeasurement]:
assert dtype == torch.int8
a_compressed, e, a, b = make_rand_sparse_tensors(torch.int8, m, n, k)

scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)

timers = []

# pytorch impl - bfloat16
timers.append(
bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
torch.mm, a.to(dtype=torch.bfloat16),
b.to(dtype=torch.bfloat16)))

# pytorch impl - float16
timers.append(
bench_fn(label, sub_label,
"pytorch_fp16_fp16_fp16_matmul-no-scales", torch.mm,
a.to(dtype=torch.float16), b.to(dtype=torch.float16)))

# cutlass impl: bf16 output
timers.append(
bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_sparse_mm",
ops.cutlass_scaled_sparse_mm, a_compressed, e, b, scale_a, scale_b,
torch.bfloat16))

# cutlass with bias: bf16 output
timers.append(
bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_sparse_mm_bias",
ops.cutlass_scaled_sparse_mm, a_compressed, e, b, scale_a, scale_b, torch.bfloat16,
bias))

# cutlass impl: fp16 output
timers.append(
bench_fn(label, sub_label, "cutlass_i8_i8_fp16_scaled_sparse_mm",
ops.cutlass_scaled_sparse_mm, a_compressed, e, b, scale_a, scale_b,
torch.float16))

# cutlass with bias: fp16 output
timers.append(
bench_fn(label, sub_label, "cutlass_i8_i8_fp16_scaled_sparse_mm_bias",
ops.cutlass_scaled_sparse_mm, a_compressed, e, b, scale_a, scale_b, torch.float16,
bias.to(dtype=torch.float16)))

return timers


def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
sub_label: str) -> Iterable[TMeasurement]:
assert dtype == torch.float8_e4m3fn

# Create tensors
b_compressed, e, a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, m, n, k)
aT = a.t()
bT = b.t()
bf16_a = a.to(dtype=torch.bfloat16)
bf16_bT = bT.to(dtype=torch.bfloat16)
scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)

timers = []

# pytorch impl w. bf16
timers.append(
bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
torch.mm, a.to(dtype=torch.bfloat16, device="cuda"),
bT.to(dtype=torch.bfloat16, device="cuda")))

# pytorch impl: bf16 output, without fp8 fast accum
timers.append(
bench_fn(label,
sub_label,
"pytorch_fp8_fp8_bf16_scaled_mm",
torch._scaled_mm,
a,
bT,
scale_a=scale_a,
scale_b=scale_b,
out_dtype=torch.bfloat16))

# pytorch impl: bf16 output, with fp8 fast accum
timers.append(
bench_fn(label,
sub_label,
"pytorch_fp8_fp8_bf16_scaled_mm_fast_accum",
torch._scaled_mm,
a,
bT,
scale_a=scale_a,
scale_b=scale_b,
out_dtype=torch.bfloat16,
use_fast_accum=True))

# pytorch impl: fp16 output, without fp8 fast accum
timers.append(
bench_fn(label,
sub_label,
"pytorch_fp8_fp8_fp16_scaled_mm",
torch._scaled_mm,
a,
bT,
scale_a=scale_a,
scale_b=scale_b,
out_dtype=torch.float16))

# pytorch impl: fp16 output, with fp8 fast accum
timers.append(
bench_fn(label,
sub_label,
"pytorch_fp8_fp8_fp16_scaled_mm_fast_accum",
torch._scaled_mm,
a,
bT,
scale_a=scale_a,
scale_b=scale_b,
out_dtype=torch.float16,
use_fast_accum=True))

# cutlass impl: bf16 output
timers.append(
bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_sparse_mm",
ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b, scale_a,
torch.bfloat16))
# cutlass impl: fp16 output
timers.append(
bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_sparse_mm",
ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b, scale_a, torch.float16))

return timers


def bench_fp16(dtype: torch.dtype, m: int, k: int, n: int, label: str,
sub_label: str) -> Iterable[TMeasurement]:
assert dtype == torch.float16
a_compressed, e, a, b = make_rand_sparse_tensors(torch.float16, m, n, k)

scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)

timers = []

# # pytorch impl w. bf16
# timers.append(
# bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
# torch.mm, a.to(dtype=torch.bfloat16, device="cuda"),
# b.to(dtype=torch.bfloat16, device="cuda")))

# # pytorch impl: bf16 output
# timers.append(
# bench_fn(label,
# sub_label,
# "pytorch_fp16_fp16_bf16_scaled_mm",
# torch._scaled_mm,
# a,
# b,
# scale_a=scale_a,
# scale_b=scale_b,
# out_dtype=torch.bfloat16))

# # pytorch impl: fp16 output
# timers.append(
# bench_fn(label,
# sub_label,
# "pytorch_fp16_fp16_fp16_scaled_mm",
# torch._scaled_mm,
# a,
# b,
# scale_a=scale_a,
# scale_b=scale_b,
# out_dtype=torch.float16))

# cutlass impl: bf16 output
timers.append(
bench_fn(label, sub_label, "cutlass_fp16_fp16_bf16_scaled_sparse_mm",
ops.cutlass_scaled_sparse_mm, a_compressed, e, b, scale_a, scale_b,
torch.bfloat16))

# cutlass impl: fp16 output
timers.append(
bench_fn(label, sub_label, "cutlass_fp16_fp16_fp16_scaled_sparse_mm",
ops.cutlass_scaled_sparse_mm, a_compressed, e, b, scale_a, scale_b, torch.float16))

# cutlass impl: bf16 output, with bias
timers.append(
bench_fn(label, sub_label, "cutlass_fp16_fp16_bf16_scaled_sparse_mm_bias",
ops.cutlass_scaled_sparse_mm, a_compressed, e, b, scale_a, scale_b, torch.bfloat16,
bias))

# cutlass impl: fp16 output, with bias
timers.append(
bench_fn(label, sub_label, "cutlass_fp16_fp16_fp16_scaled_sparse_mm_bias",
ops.cutlass_scaled_sparse_mm, a_compressed, e, b, scale_a, scale_b, torch.float16,
bias.to(dtype=torch.float16)))

return timers


def bench_bf16(dtype: torch.dtype, m: int, k: int, n: int, label: str,
sub_label: str) -> Iterable[TMeasurement]:
assert dtype == torch.bfloat16
a_compressed, e, a, b = make_rand_sparse_tensors(torch.bfloat16, m, n, k)

scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)

timers = []

# # pytorch impl w. bf16
# timers.append(
# bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
# torch.mm, a.to(dtype=torch.bfloat16, device="cuda"),
# b.to(dtype=torch.bfloat16, device="cuda")))

# # pytorch impl: bf16 output
# timers.append(
# bench_fn(label,
# sub_label,
# "pytorch_fp16_fp16_bf16_scaled_mm",
# torch._scaled_mm,
# a,
# b,
# scale_a=scale_a,
# scale_b=scale_b,
# out_dtype=torch.bfloat16))

# # pytorch impl: fp16 output
# timers.append(
# bench_fn(label,
# sub_label,
# "pytorch_fp16_fp16_fp16_scaled_mm",
# torch._scaled_mm,
# a,
# b,
# scale_a=scale_a,
# scale_b=scale_b,
# out_dtype=torch.float16))

# cutlass impl: bf16 output
timers.append(
bench_fn(label, sub_label, "cutlass_bf16_bf16_bf16_scaled_sparse_mm",
ops.cutlass_scaled_sparse_mm, a_compressed, e, b, scale_a, scale_b,
torch.bfloat16))

# cutlass impl: fp16 output
timers.append(
bench_fn(label, sub_label, "cutlass_bf16_bf16_fp16_scaled_sparse_mm",
ops.cutlass_scaled_sparse_mm, a_compressed, e, b, scale_a, scale_b, torch.float16))

# cutlass impl: bf16 output, with bias
timers.append(
bench_fn(label, sub_label, "cutlass_bf16_bf16_bf16_scaled_sparse_mm_bias",
ops.cutlass_scaled_sparse_mm, a_compressed, e, b, scale_a, scale_b, torch.bfloat16,
bias))

# cutlass impl: fp16 output, with bias
timers.append(
bench_fn(label, sub_label, "cutlass_bf16_bf16_fp16_scaled_sparse_mm_bias",
ops.cutlass_scaled_sparse_mm, a_compressed, e, b, scale_a, scale_b, torch.float16,
bias.to(dtype=torch.float16)))

return timers


def bench_v1(dtype: torch.dtype, m: int, k: int, n: int, label: str,
sub_label: str) -> Iterable[TMeasurement]:
# if dtype == torch.int8:
# return bench_int8(dtype, m, k, n, label, sub_label)
if dtype == torch.float8_e4m3fn:
return bench_fp8(dtype, m, k, n, label, sub_label)
# if dtype == torch.float16:
# return bench_fp16(dtype, m, k, n, label, sub_label)
# if dtype == torch.bfloat16:
# return bench_bf16(dtype, m, k, n, label, sub_label)
raise ValueError("unsupported type")
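For readers unfamiliar with the 2:4 ("semi-structured") pattern these kernels target, below is a small, self-contained sketch that is not part of this PR: it prunes a dense matrix so that every group of four consecutive elements along the inner dimension keeps at most two non-zeros. The actual compression into the (compressed values, metadata) pair consumed by cutlass_scaled_sparse_mm is performed by the CUDA sources added above (sparse_compressor.cu) and by the utils.make_rand_sparse_tensors helper, neither of which is reproduced here.

```python
import torch


def prune_to_2_4(dense: torch.Tensor) -> torch.Tensor:
    """Keep the two largest-magnitude values in every group of four
    along the last dimension and zero the rest (2:4 pattern)."""
    m, k = dense.shape
    assert k % 4 == 0, "2:4 sparsity needs the inner dimension divisible by 4"
    groups = dense.reshape(m, k // 4, 4)
    keep = groups.abs().topk(2, dim=-1).indices
    mask = torch.zeros_like(groups, dtype=torch.bool).scatter_(-1, keep, True)
    return (groups * mask).reshape(m, k)


# Example: a bf16 weight with a valid 2:4 pattern; every group of four
# elements in a row contains at most two non-zeros after pruning.
w = torch.randn(128, 256, dtype=torch.bfloat16, device="cuda")
w_2_4 = prune_to_2_4(w)
assert (w_2_4.reshape(128, -1, 4) != 0).sum(dim=-1).max() <= 2
```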