From 3105cb8de64b9078e6d503b16e03f7fc2f35e9bd Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Thu, 14 Dec 2023 10:56:42 -0800 Subject: [PATCH] Use max clock for analytical calculations of peak flops Summary: Because we read the current SM clock, our analytical peak-flops calculations can vary while we're evaluating different configs. It turns out the choice of config is very sensitive to the clock: even slight throttling can make us reject very good configs in favor of very bad ones. Use the max SM clock instead, so the estimate stays the same across configs. A reproducer can be found here: https://gist.github.com/bertmaher/8ff5e9631666846fff55d81326cacb4d ``` $ python thermal_throttle.py chosen config BLOCK_M: 128, BLOCK_N: 256, BLOCK_K: 32, SPLIT_K: 1, num_warps: 8, num_ctas: 1, num_stages: 3, enable_warp_specialization: False, enable_persistent: False tflops/s: 107.92460196062149 $ python thermal_throttle.py --preheat chosen config BLOCK_M: 32, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 6, enable_warp_specialization: False, enable_persistent: False tflops/s: 39.29629633970286 ``` --- python/triton/ops/matmul_perf_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/triton/ops/matmul_perf_model.py b/python/triton/ops/matmul_perf_model.py index 1e07b0a029bb..19e93268ec0b 100644 --- a/python/triton/ops/matmul_perf_model.py +++ b/python/triton/ops/matmul_perf_model.py @@ -12,7 +12,7 @@ def get_tensorcore_tflops(backend, device, num_ctas, num_warps, dtype): ''' return compute throughput in TOPS ''' total_warps = num_ctas * min(num_warps, 4) num_subcores = driver.utils.get_device_properties(device)["multiprocessor_count"] * 4 # on recent GPUs - cur_sm_clock = nvsmi(['clocks.current.sm'])[0] + cur_sm_clock = nvsmi(['clocks.max.sm'])[0] tflops = min(num_subcores, total_warps) / num_subcores * get_max_tensorcore_tflops( dtype, cur_sm_clock, backend, device) return tflops @@ -22,7 +22,7 @@ def get_simd_tflops(backend, device, num_ctas, num_warps, dtype): ''' return compute throughput in TOPS 
''' total_warps = num_ctas * min(num_warps, 4) num_subcores = driver.utils.get_device_properties(device)["multiprocessor_count"] * 4 # on recent GPUs - cur_sm_clock = nvsmi(['clocks.current.sm'])[0] + cur_sm_clock = nvsmi(['clocks.max.sm'])[0] tflops = min(num_subcores, total_warps) / num_subcores * get_max_simd_tflops(dtype, cur_sm_clock, backend, device) return tflops