diff --git a/python/triton/ops/matmul_perf_model.py b/python/triton/ops/matmul_perf_model.py
index 1e07b0a029bb..19e93268ec0b 100644
--- a/python/triton/ops/matmul_perf_model.py
+++ b/python/triton/ops/matmul_perf_model.py
@@ -12,7 +12,7 @@ def get_tensorcore_tflops(backend, device, num_ctas, num_warps, dtype):
     ''' return compute throughput in TOPS '''
     total_warps = num_ctas * min(num_warps, 4)
     num_subcores = driver.utils.get_device_properties(device)["multiprocessor_count"] * 4  # on recent GPUs
-    cur_sm_clock = nvsmi(['clocks.current.sm'])[0]
+    cur_sm_clock = nvsmi(['clocks.max.sm'])[0]
     tflops = min(num_subcores, total_warps) / num_subcores * get_max_tensorcore_tflops(
         dtype, cur_sm_clock, backend, device)
     return tflops
@@ -22,7 +22,7 @@ def get_simd_tflops(backend, device, num_ctas, num_warps, dtype):
     ''' return compute throughput in TOPS '''
     total_warps = num_ctas * min(num_warps, 4)
     num_subcores = driver.utils.get_device_properties(device)["multiprocessor_count"] * 4  # on recent GPUs
-    cur_sm_clock = nvsmi(['clocks.current.sm'])[0]
+    cur_sm_clock = nvsmi(['clocks.max.sm'])[0]
     tflops = min(num_subcores, total_warps) / num_subcores * get_max_simd_tflops(dtype, cur_sm_clock, backend, device)
     return tflops
 