Skip to content

Commit 16b0bbf

Browse files
authored
matmul performance regression pipeline (#647)
* Added regression tests to tune_gemm * Add regression tests to pipelines * Add missing imports * Use warnings to signal that no performance comparison is found * Split regression tests into separate file * Disable github pipeline in favour of jenkins * Improve output and skip tests if no performance reference can be found * Add testcase for overall mean regression * Extend parameters which can be adjusted for perf regression tests * Switch to geo mean for overall result * Always recompile kernels in perf regression tests in case the user does not specify otherwise * Report default values in exported result to support changing them in the future
1 parent b6633f3 commit 16b0bbf

File tree

1 file changed

+171
-0
lines changed

1 file changed

+171
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
import tune_gemm
2+
3+
import os
4+
import yaml
5+
import pytest
6+
import warnings
7+
from copy import deepcopy
8+
import statistics
9+
10+
11+
def _gemm_config(n, k, block_size_k):
    """Build one benchmark configuration for the regression suite.

    Every case shares M, the operand layouts and the launch parameters; only
    N, K and BLOCK_SIZE_K vary. A fresh dict is returned on each call because
    the test adds its default settings to the dict it receives. The key order
    is fixed because the pytest IDs are derived from the dict's repr.
    """
    return {
        'M': 4864, 'N': n, 'K': k, 'rowMajorA': 'T', 'rowMajorB': 'N',
        'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': block_size_k,
        'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0,
        'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2,
    }


class TestRegression:
    """Performance regression tests for the matmul kernels driven by tune_gemm.

    Each parametrized case benchmarks one configuration and compares the
    achieved TFLOPS against the matching entry of
    'gemm-performance-report-reference.yaml' (if present). All measurements
    are written to 'gemm-performance-report.yaml' when the class is torn down.

    The overall test relies on the per-configuration tests having run first
    (pytest executes methods in definition order).
    """

    @classmethod
    def setup_class(cls):
        """Initialise the shared result lists and load the reference report."""
        # A run fails when it reaches 98% or less of the reference throughput.
        cls.slowdown_threshold = 0.98

        cls.test_results = []
        cls.test_perf_ratios = []
        try:
            with open('gemm-performance-report-reference.yaml', 'r') as ref_file:
                # safe_load returns None for an empty document; fall back to
                # an empty list so the reference lookup can still iterate.
                cls.reference_data = yaml.safe_load(ref_file) or []
        except FileNotFoundError:
            warnings.warn("No reference file found. There will be no regression detected!")
            cls.reference_data = []

    @classmethod
    def teardown_class(cls):
        """Export all collected measurements to the performance report."""
        with open('gemm-performance-report.yaml', 'w') as out_file:
            yaml.safe_dump(cls.test_results, out_file)

    @pytest.mark.parametrize('config', [
        # M // BLOCK_M * N // BLOCK_N % 304 == 0
        # 1 workgroup / CU
        _gemm_config(4096, 4096, 64),
        _gemm_config(4096, 4160, 64),
        _gemm_config(4096, 4224, 64),
        _gemm_config(4096, 4288, 64),
        # 1 workgroup / CU masked loadK
        _gemm_config(4096, 4097, 32),
        _gemm_config(4096, 4098, 32),
        _gemm_config(4096, 4100, 32),
        _gemm_config(4096, 4104, 32),
        _gemm_config(4096, 4112, 32),

        # 2 workgroups / CU
        _gemm_config(8192, 4096, 64),
        _gemm_config(8192, 4160, 64),
        _gemm_config(8192, 8192, 64),
        _gemm_config(8192, 8256, 64),
    ], ids=lambda val: f"Config: {val}")
    def test_matmul_performance_regression(self, config, record_property):
        """Benchmark one configuration and compare it with its reference run.

        The test is skipped when the reference data has no entry whose
        'config' equals this configuration (including the defaults filled in
        below). It fails when the measured/reference TFLOPS ratio is not
        above ``slowdown_threshold``.
        """
        # process_item receives a copy so that `config` itself only changes
        # through the setdefault calls below.
        M, N, K, col_a, col_b, runConfig = tune_gemm.process_item(deepcopy(config))

        # setdefault (rather than get) stores the defaults in `config`, so the
        # exported report records the values that were actually used.
        rotating_buffer_size = config.setdefault('rotating_buffer_size', 0)
        icache_flush = config.setdefault('icache_flush', False)
        iters = config.setdefault('iters', 200)
        init_type = config.setdefault('init_type', 'randn')

        dtype_a = config.setdefault('dtype_a', 'fp16')
        dtype_b = config.setdefault('dtype_b', 'fp16')
        dtype_c = config.setdefault('dtype_c', 'fp16')

        bias_vector = config.get('bias_vector', False)
        bias_size = M if bias_vector else 0

        # Always compile if the user did not specify
        os.environ.setdefault('TRITON_ALWAYS_COMPILE', '1')

        # Remove the previously generated kernel file before regenerating it.
        tune_gemm.run_bash_command(f"rm -rf {tune_gemm.get_filename_myKernels()}")
        tune_gemm.generate_matmul_kernels([runConfig])

        gpus = [0]
        jobs = 1
        benchmark = True
        skipWarmup = False
        num_threads = 32
        verbose_level = 0

        minTime, _best_config, _compile_time, _profile_time, _post_time = tune_gemm.tune_gemm_config(
            M, N, K, col_a, col_b, dtype_a, dtype_b, dtype_c, init_type, [runConfig], benchmark, jobs, iters,
            skipWarmup=skipWarmup, num_threads=num_threads, gpus=gpus, verbose=verbose_level,
            rotating_buffer_size=rotating_buffer_size, bias_size=bias_size, icache_flush=icache_flush)

        # post processing the numbers
        # NOTE(review): the 1e-6 factor treats minTime as microseconds - confirm
        # against tune_gemm.tune_gemm_config.
        tri_tflops = 2 * M * N * K * 1e-12 / (minTime * 1e-6)

        record_property("TFlops", f"{tri_tflops:.2f}")
        record_property("MinTime", f"{minTime:.2f}")

        # Add to global results
        self.test_results.append({'config': config, 'tflops': float(tri_tflops)})

        # Look for reference run
        reference_run = next((run for run in self.reference_data if run['config'] == config), None)
        if reference_run is None:
            pytest.skip("No performance reference found!")

        performance_ratio = tri_tflops / reference_run['tflops']
        self.test_perf_ratios.append(performance_ratio)
        regression_percent = 100.0 * (1.0 - performance_ratio)
        threshold_percent = (1.0 - self.slowdown_threshold) * 100.0
        record_property("Performance difference (lower is better)", f"{regression_percent:.2f}%")
        assert performance_ratio > self.slowdown_threshold, (
            f'Performance regressed by {regression_percent:.2f}% (threshold={threshold_percent:.2f}%)')

    def test_overall_performance_difference(self, record_property):
        """Check the geometric mean of all per-configuration performance ratios.

        Skipped when fewer than two ratios were collected, i.e. when fewer
        than two configurations had a reference run.
        """
        if len(self.test_perf_ratios) < 2:
            pytest.skip("Overall results will be tested if test count >= 2")

        perf_diff_mean = statistics.geometric_mean(self.test_perf_ratios)
        regression_percent = 100.0 * (1.0 - perf_diff_mean)
        threshold_percent = (1.0 - self.slowdown_threshold) * 100.0

        record_property("Overall performance difference (mean)", f"{regression_percent:.2f}%")
        assert perf_diff_mean > self.slowdown_threshold, (
            f'Performance regressed by {regression_percent:.2f}% (threshold={threshold_percent:.2f}%)')

0 commit comments

Comments
 (0)