@@ -41,18 +41,16 @@ def benchmark(num_tokens, hidden_size=8192, intermediate_size=8192):
4141 fp16_time = benchmark_microseconds (ffn_ref , input_tensor )
4242
4343 # Sparsify-only benchmarks
44- X_scale = torch .empty ([num_tokens , 1 ], device = "cuda" , dtype = torch .float32 )
45- ao_cusparse_time = benchmark_microseconds (
46- lambda : torch .ops .torchao .sparse24_sm90_sparsify (
44+ ao_fast_sparsification_time = benchmark_microseconds (
45+ torch .ops .torchao .sparse24_sm90_sparsify (
4746 input_tensor ,
4847 "cutlass" ,
4948 "srelu" ,
5049 "largest" ,
5150 dtype = torch .float8_e4m3fn ,
52- scale = X_scale ,
5351 )
5452 )
55- cusparse_time = benchmark_microseconds (lambda : torch ._cslt_compress ( input_tensor ) )
53+ cusparse_time = benchmark_microseconds (torch ._cslt_compress , input_tensor )
5654
5755 # bf16
5856 ffn_clone = (
@@ -131,10 +129,10 @@ def benchmark(num_tokens, hidden_size=8192, intermediate_size=8192):
131129 "fp8_c_time (us)" : fp8_c_time ,
132130 "fp8_c_sparse_time (us)" : fp8_c_sparse_time ,
133131 "fp8_c_activation_sparse_time (us)" : fp8_c_activation_sparse_time ,
134- "ao_cusparse_time (us)" : ao_cusparse_time ,
135- "cusparse_compress_time (us)" : cusparse_time ,
132+ "ao_fast_sparsification_time (us)" : ao_fast_sparsification_time ,
133+ "cusparse*_compress_time (us)" : cusparse_time ,
136134 "speedup" : fp8_c_time / fp8_c_activation_sparse_time ,
137- "sparsify_speedup" : cusparse_time / ao_cusparse_time ,
135+ "sparsify_speedup" : cusparse_time / ao_fast_sparsification_time ,
138136 }
139137
140138
0 commit comments