Commit 50ac2cc

Summary:
This PR adds the sparsify overhead benchmark that was omitted from the ICLR workshop paper (https://arxiv.org/abs/2503.16672). The paper's benchmark has two parts: 1) sparsify operation overhead, and 2) sparse-GEMM kernel performance. Part 1) was missing from the original benchmark script, so this PR adds a sparsify-only benchmark comparing `torchao.sparse24_sm90_sparsify` against the `torch._cslt_compress` (cuSPARSELt) baseline.

Test plan: CI
1 parent: afe5cab · commit: 50ac2cc

File tree

1 file changed: +17 −0 lines


benchmarks/benchmark_e2e_fp8_sparse_linear.py

Lines changed: 17 additions & 0 deletions
@@ -40,6 +40,20 @@ def benchmark(num_tokens, hidden_size=8192, intermediate_size=8192):
     input_tensor = torch.randn(num_tokens, hidden_size).to(torch.bfloat16).cuda()
     fp16_time = benchmark_microseconds(ffn_ref, input_tensor)
 
+    # Sparsify-only benchmarks
+    X_scale = torch.empty([num_tokens, 1], device="cuda", dtype=torch.float32)
+    ao_cusparse_time = benchmark_microseconds(
+        lambda: torch.ops.torchao.sparse24_sm90_sparsify(
+            input_tensor,
+            "cutlass",
+            "srelu",
+            "largest",
+            dtype=torch.float8_e4m3fn,
+            scale=X_scale,
+        )
+    )
+    cusparse_time = benchmark_microseconds(lambda: torch._cslt_compress(input_tensor))
+
     # bf16
     ffn_clone = (
         nn.Sequential(
@@ -117,7 +131,10 @@ def benchmark(num_tokens, hidden_size=8192, intermediate_size=8192):
         "fp8_c_time (us)": fp8_c_time,
         "fp8_c_sparse_time (us)": fp8_c_sparse_time,
         "fp8_c_activation_sparse_time (us)": fp8_c_activation_sparse_time,
+        "ao_cusparse_time (us)": ao_cusparse_time,
+        "cusparse_compress_time (us)": cusparse_time,
         "speedup": fp8_c_time / fp8_c_activation_sparse_time,
+        "sparsify_speedup": cusparse_time / ao_cusparse_time,
     }
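The diff relies on a `benchmark_microseconds` helper defined earlier in the benchmark file. For readers without the full file, here is a minimal, hypothetical sketch of what such a helper does; the real torchao helper likely uses CUDA-event timing or `triton.testing.do_bench` with warmup and synchronization, so this `timeit`-based version is only an illustration of the contract (call the function repeatedly, return mean latency in microseconds):

```python
import timeit

def benchmark_microseconds(fn, *args, repeats=100):
    """Hypothetical sketch: run fn(*args) `repeats` times and return the
    mean wall-clock latency in microseconds. The actual torchao helper
    additionally handles CUDA synchronization and warmup."""
    fn(*args)  # one warmup call so lazy initialization is not timed
    total_s = timeit.timeit(lambda: fn(*args), number=repeats)
    return total_s / repeats * 1e6

# Usage: time a trivial CPU-side function.
t_us = benchmark_microseconds(sum, range(1000))
```

With a helper of this shape, the `sparsify_speedup` field in the diff is simply the ratio of the two measured latencies, `cusparse_time / ao_cusparse_time`, so values above 1.0 mean the torchao sparsify kernel is faster than the cuSPARSELt compress baseline.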
