pytorch · jcaip · Sep 25, 2025 · Sep 17, 2025 · Sep 19, 2025 · Sep 21, 2025
diff --git a/benchmarks/benchmark_e2e_fp8_sparse_linear.py b/benchmarks/benchmark_e2e_fp8_sparse_linear.py
@@ -40,6 +40,18 @@ def benchmark(num_tokens, hidden_size=8192, intermediate_size=8192):
     input_tensor = torch.randn(num_tokens, hidden_size).to(torch.bfloat16).cuda()
     fp16_time = benchmark_microseconds(ffn_ref, input_tensor)
 
+    # Sparsify-only benchmarks; the lambda defers the op so the call itself is timed
+    ao_fast_sparsification_time = benchmark_microseconds(
+        lambda: torch.ops.torchao.sparse24_sm90_sparsify(
+            input_tensor,
+            "cutlass",
+            "identity",
+            "largest",
+            dtype=torch.float8_e4m3fn,
+        )
+    )
+    cusparselt_time = benchmark_microseconds(torch._cslt_compress, input_tensor)
+
     # bf16
     ffn_clone = (
         nn.Sequential(
@@ -117,7 +129,10 @@ def benchmark(num_tokens, hidden_size=8192, intermediate_size=8192):
         "fp8_c_time (us)": fp8_c_time,
         "fp8_c_sparse_time (us)": fp8_c_sparse_time,
         "fp8_c_activation_sparse_time (us)": fp8_c_activation_sparse_time,
+        "ao_fast_sparsification_time (us)": ao_fast_sparsification_time,
+        "cusparselt_compress_time (us)": cusparselt_time,
         "speedup": fp8_c_time / fp8_c_activation_sparse_time,
+        "sparsify_speedup": cusparselt_time / ao_fast_sparsification_time,
     }