From 3105cb8de64b9078e6d503b16e03f7fc2f35e9bd Mon Sep 17 00:00:00 2001
From: Bert Maher <bertrand@fb.com>
Date: Thu, 14 Dec 2023 10:56:42 -0800
Subject: [PATCH] Use max clock for analytical calculations of peak flops

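Summary: The perf model reads the current SM clock (`clocks.current.sm`), so
our analytical peak-flops estimates can vary while we're evaluating different
configs.  It turns out the choice of config is very sensitive to the clock,
such that slight throttling can make us reject very good configs in favor of
very bad ones.  Use the max SM clock (`clocks.max.sm`) instead, so that every
config is evaluated against the same peak-flops estimate.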

A reproducer can be found here:
https://gist.github.com/bertmaher/8ff5e9631666846fff55d81326cacb4d

Before this change, preheating the GPU leads to a much slower config being
chosen:

```
$ python thermal_throttle.py
chosen config BLOCK_M: 128, BLOCK_N: 256, BLOCK_K: 32, SPLIT_K: 1, num_warps: 8, num_ctas: 1, num_stages: 3, enable_warp_specialization: False, enable_persistent: False
tflops/s: 107.92460196062149

$ python thermal_throttle.py --preheat
chosen config BLOCK_M: 32, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 6, enable_warp_specialization: False, enable_persistent: False
tflops/s: 39.29629633970286
```
---
 python/triton/ops/matmul_perf_model.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/triton/ops/matmul_perf_model.py b/python/triton/ops/matmul_perf_model.py
index 1e07b0a029bb..19e93268ec0b 100644
--- a/python/triton/ops/matmul_perf_model.py
+++ b/python/triton/ops/matmul_perf_model.py
@@ -12,7 +12,7 @@ def get_tensorcore_tflops(backend, device, num_ctas, num_warps, dtype):
     ''' return compute throughput in TOPS '''
     total_warps = num_ctas * min(num_warps, 4)
     num_subcores = driver.utils.get_device_properties(device)["multiprocessor_count"] * 4  # on recent GPUs
-    cur_sm_clock = nvsmi(['clocks.current.sm'])[0]
+    cur_sm_clock = nvsmi(['clocks.max.sm'])[0]
     tflops = min(num_subcores, total_warps) / num_subcores * get_max_tensorcore_tflops(
         dtype, cur_sm_clock, backend, device)
     return tflops
@@ -22,7 +22,7 @@ def get_simd_tflops(backend, device, num_ctas, num_warps, dtype):
     ''' return compute throughput in TOPS '''
     total_warps = num_ctas * min(num_warps, 4)
     num_subcores = driver.utils.get_device_properties(device)["multiprocessor_count"] * 4  # on recent GPUs
-    cur_sm_clock = nvsmi(['clocks.current.sm'])[0]
+    cur_sm_clock = nvsmi(['clocks.max.sm'])[0]
     tflops = min(num_subcores, total_warps) / num_subcores * get_max_simd_tflops(dtype, cur_sm_clock, backend, device)
     return tflops