+ /*
+  * Copyright (c) 2022 NECSTLab, Politecnico di Milano. All rights reserved.
+  *
+  * Redistribution and use in source and binary forms, with or without
+  * modification, are permitted provided that the following conditions
+  * are met:
+  *  * Redistributions of source code must retain the above copyright
+  *    notice, this list of conditions and the following disclaimer.
+  *  * Redistributions in binary form must reproduce the above copyright
+  *    notice, this list of conditions and the following disclaimer in the
+  *    documentation and/or other materials provided with the distribution.
+  *  * Neither the name of NECSTLab nor the names of its
+  *    contributors may be used to endorse or promote products derived
+  *    from this software without specific prior written permission.
+  *  * Neither the name of Politecnico di Milano nor the names of its
+  *    contributors may be used to endorse or promote products derived
+  *    from this software without specific prior written permission.
+  *
+  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+  */
+
+ package it.necst.grcuda.benchmark.bench;
+
+ import it.necst.grcuda.benchmark.Benchmark;
+ import it.necst.grcuda.benchmark.BenchmarkConfig;
+ import org.graalvm.polyglot.Value;
+
+ import static org.junit.Assert.assertEquals;
+
+ public class B9M extends Benchmark {
+     /*
+     Compute the conjugate gradient algorithm on a dense symmetric matrix.
+     The matrix-vector multiplications are row-partitioned to scale across multiple GPUs;
+     */
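+
+     // Outline of the computation (one standard CG solve):
+     //   r = b - A @ x;  p = r;  t1 = r^t * r;
+     //   for ITER iterations:
+     //     y = A @ p;  t2 = p^t * y;  alpha = t1 / t2;
+     //     x += alpha * p;  r -= alpha * y;
+     //     t1_new = r^t * r;  beta = t1_new / t1;  p = r + beta * p;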
+
+     private static final String PRECONDITION_KERNEL = "" +
+             "// Add a small epsilon to the main diagonal:\n" +
+             "extern \"C\" __global__ void precondition(float *A, int n, int m, int offset) {\n" +
+             "    for(int i = blockIdx.x * blockDim.x + threadIdx.x; i < m; i += blockDim.x * gridDim.x) {\n" +
+             "        A[i * n + i + offset] += 1e-12;\n" +
+             "    }\n" +
+             "}";
+
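+     // Each matrix argument below is one row-partition of A: a contiguous block of up to S
+     // rows, so the kernel working on partition i fills entries [z_offset, z_offset + n) of z;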
+     private static final String MMUL_KERNEL = "" +
+             "// z = x @ y;\n" +
+             "extern \"C\" __global__ void matrix_vector_mult(const float* x, const float* y, float* z, int n, int m, int z_offset) {\n" +
+             "    for(int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {\n" +
+             "        float sum = 0;\n" +
+             "        for (int j = 0; j < m; j++) {\n" +
+             "            sum += x[i * m + j] * y[j];\n" +
+             "        }\n" +
+             "        z[z_offset + i] = sum;\n" +
+             "    }\n" +
+             "}\n" +
+             "// z = w + alpha * x @ y;\n" +
+             "extern \"C\" __global__ void matrix_vector_mult_axpy(const float* x, const float* y, const float *w, const float alpha, float* z, int n, int m, int z_offset) {\n" +
+             "    for(int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {\n" +
+             "        float sum = 0;\n" +
+             "        for (int j = 0; j < m; j++) {\n" +
+             "            sum += x[i * m + j] * y[j];\n" +
+             "        }\n" +
+             "        z[z_offset + i] = alpha * sum + w[z_offset + i];\n" +
+             "    }\n" +
+             "}";
+
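+     // Dot products use a two-level reduction: __shfl_down_sync folds the per-thread partial
+     // sums inside each warp, then the first lane of every warp adds its value to z with a
+     // single atomicAdd; z must therefore be zeroed before each launch;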
+     private static final String DP_KERNEL = "" +
+             "__inline__ __device__ float warp_reduce(float val) {\n" +
+             "    int warp_size = 32;\n" +
+             "    for (int offset = warp_size / 2; offset > 0; offset /= 2)\n" +
+             "        val += __shfl_down_sync(0xFFFFFFFF, val, offset);\n" +
+             "    return val;\n" +
+             "}\n" +
+             "// z = <x, x>;\n" +
+             "extern \"C\" __global__ void l2_norm(const float *x, float* z, int N) {\n" +
+             "    int warp_size = 32;\n" +
+             "    float sum = float(0);\n" +
+             "    for(int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n" +
+             "        float x_tmp = x[i];\n" +
+             "        sum += x_tmp * x_tmp;\n" +
+             "    }\n" +
+             "    sum = warp_reduce(sum); // Obtain the sum of values in the current warp;\n" +
+             "    if ((threadIdx.x & (warp_size - 1)) == 0) // Same as (threadIdx.x % warp_size) == 0, but faster\n" +
+             "        atomicAdd(z, sum); // The first thread in the warp updates the output;\n" +
+             "}\n" +
+             "// z = <x, y>;\n" +
+             "extern \"C\" __global__ void dot(const float *x, const float *y, float* z, int N) {\n" +
+             "    int warp_size = 32;\n" +
+             "    float sum = float(0);\n" +
+             "    for(int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n" +
+             "        sum += x[i] * y[i];\n" +
+             "    }\n" +
+             "    sum = warp_reduce(sum); // Obtain the sum of values in the current warp;\n" +
+             "    if ((threadIdx.x & (warp_size - 1)) == 0) // Same as (threadIdx.x % warp_size) == 0, but faster\n" +
+             "        atomicAdd(z, sum); // The first thread in the warp updates the output;\n" +
+             "}";
+
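+     // All kernels use grid-stride loops, so correctness does not depend on the launch
+     // configuration: config.numBlocks blocks of config.blockSize1D threads cover any length;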
+     private static final String SAXPY_KERNEL = "" +
+             "// y = val + alpha * x;\n" +
+             "extern \"C\" __global__ void saxpy(float* y, const float *val, const float *x, float alpha, int n) {\n" +
+             "    for(int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {\n" +
+             "        y[i] = val[i] + alpha * x[i];\n" +
+             "    }\n" +
+             "}\n" +
+             "// Simply copy array x into y;\n" +
+             "extern \"C\" __global__ void cpy(float *y, const float *x, int n) {\n" +
+             "    for(int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {\n" +
+             "        y[i] = x[i];\n" +
+             "    }\n" +
+             "}";
+
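+     // A is stored as P row-partitions of S = ceil(size / P) rows each; x is the solution
+     // estimate, b the right-hand side, p/r the direction and residual vectors, y a scratch
+     // vector for matrix-vector products, and t1/t2 single-element dot-product accumulators;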
+     private Value precondition_kernel, mmul_kernel, mmul_axpy_kernel, l2_norm_kernel, dp_kernel, saxpy_kernel, copy_kernel, initialize_random_symmetric_matrix;
+     private Value[] A;
+     private Value x, b, p, r, y, t1, t2;
+     private int S;
+
+     private final int P = 16;
+     private final int ITER = 50;
+
+     public B9M(BenchmarkConfig currentConfig) {
+         super(currentConfig);
+
+         this.S = 0;
+         this.A = new Value[this.P];
+         for (int i = 0; i < this.P; i++) this.A[i] = null;
+         this.x = null;
+         this.b = null;
+         this.p = null;
+         this.r = null;
+         this.y = null;
+         this.t1 = null;
+         this.t2 = null;
+
+         this.mmul_axpy_kernel = null;
+         this.mmul_kernel = null;
+         this.l2_norm_kernel = null;
+         this.dp_kernel = null;
+         this.saxpy_kernel = null;
+         this.copy_kernel = null;
+     }
+
+     @Override
+     public void allocateTest(int iteration) {
+         this.S = Math.floorDiv(config.size + this.P - 1, this.P);
+
+         // Allocate the P matrix partitions and the vectors
+         for (int i = 0; i < this.P; i++)
+             this.A[i] = requestArray("float", this.S * config.size);
+         this.x = requestArray("float", config.size);
+         this.b = requestArray("float", config.size);
+         this.p = requestArray("float", config.size);
+         this.r = requestArray("float", config.size);
+         this.y = requestArray("float", config.size);
+         this.t1 = requestArray("float", 1);
+         this.t2 = requestArray("float", 1);
+
+         // Build the kernels
+         Value buildKernel = context.eval("grcuda", "buildkernel");
+
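+         // Each signature mirrors the kernel parameters; "const pointer" marks read-only
+         // arrays, which lets the GrCUDA runtime track dependencies between kernels and
+         // overlap independent ones;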
+         this.precondition_kernel = buildKernel.execute(PRECONDITION_KERNEL, "precondition", "pointer, sint32, sint32, sint32");
+         this.mmul_kernel = buildKernel.execute(MMUL_KERNEL, "matrix_vector_mult", "const pointer, const pointer, pointer, sint32, sint32, sint32");
+         this.mmul_axpy_kernel = buildKernel.execute(MMUL_KERNEL, "matrix_vector_mult_axpy", "const pointer, const pointer, const pointer, float, pointer, sint32, sint32, sint32");
+         this.l2_norm_kernel = buildKernel.execute(DP_KERNEL, "l2_norm", "const pointer, pointer, sint32");
+         this.dp_kernel = buildKernel.execute(DP_KERNEL, "dot", "const pointer, const pointer, pointer, sint32");
+         this.saxpy_kernel = buildKernel.execute(SAXPY_KERNEL, "saxpy", "pointer, const pointer, const pointer, float, sint32");
+         this.copy_kernel = buildKernel.execute(SAXPY_KERNEL, "cpy", "pointer, const pointer, sint32");
+
+         this.initialize_random_symmetric_matrix = context.eval("js", "(X, S, N) => {\n" +
+                 "    for (let i = 0; i < N; i++) {\n" +
+                 "        let s = (i / S) >> 0;\n" +
+                 "        let k = i % S;\n" +
+                 "        let Xs = X[s];\n" +
+                 "        let i_N = k * N;\n" +
+                 "        for (let j = i; j < N; j++) {\n" +
+                 "            let val = 2 * Math.random() - 1;\n" +
+                 "            Xs[i_N + j] = val;\n" +
+                 "            X[(j / S) >> 0][(j % S) * N + i] = val;\n" +
+                 "        }\n" +
+                 "    }}");
+     }
+
+     @Override
+     public void initializeTest(int iteration) {
+         // Fill the P partitions of A with a random symmetric matrix (values in [-1, 1])
+         this.initialize_random_symmetric_matrix.execute(this.A, this.S, config.size);
+     }
+
+     @Override
+     public void resetIteration(int iteration) {
+         // Reset the initial guess and the dot-product accumulators
+         for (int i = 0; i < config.size; i++)
+             this.x.setArrayElement(i, 1.0 / config.size);
+         this.t1.setArrayElement(0, 0.0);
+         this.t2.setArrayElement(0, 0.0);
+     }
+
+     @Override
+     public void runTest(int iteration) {
+         long start_comp = System.nanoTime();
+         long end;
+
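+         // The P partitions are disjoint, so the per-partition kernels below carry no mutual
+         // dependencies and the runtime is free to overlap them (e.g. across multiple GPUs);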
+         // Initialization phase
+         // precondition: A += eps * I, with eps = 1e-12;
+         for (int i = 0; i < this.P; i++) {
+             this.precondition_kernel.execute(config.numBlocks, config.blockSize1D)
+                     .execute(this.A[i], config.size, Math.min(this.S, config.size - i * this.S), i * this.S);
+         }
+
+         // r = b - A @ x
+         for (int i = 0; i < this.P; i++) {
+             this.mmul_axpy_kernel.execute(config.numBlocks, config.blockSize1D)
+                     .execute(this.A[i], this.x, this.b, -1, this.r, Math.min(this.S, config.size - i * this.S), config.size, i * this.S);
+         }
+
+         // p = r
+         this.copy_kernel.execute(config.numBlocks, config.blockSize1D)
+                 .execute(this.p, this.r, config.size);
+
+         // t1 = r^t * r
+         this.l2_norm_kernel.execute(config.numBlocks, config.blockSize1D)
+                 .execute(this.r, this.t1, config.size);
+
+         for (int curr_iter = 0; curr_iter < this.ITER; curr_iter++) {
+             // y = A @ p; t2 = p^t * y
+             for (int i = 0; i < this.P; i++) {
+                 this.mmul_kernel.execute(config.numBlocks, config.blockSize1D)
+                         .execute(this.A[i], this.p, this.y, Math.min(this.S, config.size - i * this.S), config.size, i * this.S);
+             }
+             this.dp_kernel.execute(config.numBlocks, config.blockSize1D)
+                     .execute(this.p, this.y, this.t2, config.size);
+
+             // Reading t1 and t2 on the host implicitly synchronizes the pending kernels;
+             float alpha = this.t1.getArrayElement(0).asFloat() / this.t2.getArrayElement(0).asFloat();
+             float old_r_norm_squared = this.t1.getArrayElement(0).asFloat();
+             this.t1.setArrayElement(0, 0);
+             this.t2.setArrayElement(0, 0);
+
+             // Update x: x = x + alpha * p
+             this.saxpy_kernel.execute(config.numBlocks, config.blockSize1D)
+                     .execute(this.x, this.x, this.p, alpha, config.size);
+
+             // r = r - alpha * y
+             this.saxpy_kernel.execute(config.numBlocks, config.blockSize1D)
+                     .execute(this.r, this.r, this.y, -1 * alpha, config.size);
+
+             // t1 = r^t * r
+             this.l2_norm_kernel.execute(config.numBlocks, config.blockSize1D)
+                     .execute(this.r, this.t1, config.size);
+
+             float beta = this.t1.getArrayElement(0).asFloat() / old_r_norm_squared;
+
+             // p = r + beta * p
+             this.saxpy_kernel.execute(config.numBlocks, config.blockSize1D)
+                     .execute(this.p, this.r, this.p, beta, config.size);
+         }
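+
+         // A fixed number of iterations is used instead of a residual-based stopping
+         // criterion, so every benchmark run performs the same amount of work;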
+
+         // Final sync step: reading one element of x forces all pending kernels to complete;
+         float tmp = x.getArrayElement(0).asFloat();
+         end = System.nanoTime();
+
+         benchmarkResults.setCurrentComputationSec((end - start_comp) / 1000000000F);
+
+         // Compute the GPU result: y = b - A @ x, using the final solution estimate
+         for (int i = 0; i < this.P; i++) {
+             this.mmul_axpy_kernel.execute(config.numBlocks, config.blockSize1D)
+                     .execute(this.A[i], this.x, this.b, -1, this.y, Math.min(this.S, config.size - i * this.S), config.size, i * this.S);
+         }
+
+         // Aggregate the first 10 entries of the residual as the benchmark result
+         float sum = 0;
+         for (int i = 0; i < 10; i++)
+             sum += this.y.getArrayElement(i).asFloat();
+
+         benchmarkResults.setCurrentGpuResult(sum);
+     }
+
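+     // Validation: repeat the same computation in plain Java on the CPU and compare the
+     // aggregated residuals of the two implementations;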
+     @Override
+     public void cpuValidation() {
+         float[][] A_cpu = new float[config.size][config.size];
+         float[] b_cpu = new float[config.size];
+         float[] x_cpu = new float[config.size];
+         float[] r_cpu = new float[config.size];
+         float[] p_cpu = new float[config.size];
+         float[] y_cpu = new float[config.size];
+         float[] tmp;
+         float t1_cpu = 0;
+         float t2_cpu = 0;
+         float alpha_cpu;
+         float beta_cpu;
+         float t1_old_cpu;
+
+         // Reassemble the full matrix from the P row-partitions
+         int p_counter;
+         for (int i = 0; i < config.size; i++) {
+             p_counter = Math.floorDiv(i, this.S);
+             for (int j = 0; j < config.size; j++)
+                 A_cpu[i][j] = this.A[p_counter].getArrayElement((i % this.S) * config.size + j).asFloat();
+         }
+
+         // Use the same right-hand side and initial guess as the GPU run, so that the two
+         // executions follow identical trajectories
+         for (int i = 0; i < config.size; i++) b_cpu[i] = this.b.getArrayElement(i).asFloat();
+
+         for (int i = 0; i < config.size; i++) x_cpu[i] = (float) (1.0 / config.size);
+
+         // r = b - A @ x
+         tmp = matrixMult(A_cpu, x_cpu);
+         for (int i = 0; i < config.size; i++) r_cpu[i] = b_cpu[i] - tmp[i];
+
+         // p = r
+         for (int i = 0; i < config.size; i++) p_cpu[i] = r_cpu[i];
+
+         // t1 = r^t * r
+         for (int i = 0; i < config.size; i++) t1_cpu += (r_cpu[i] * r_cpu[i]);
+
+         // Main iteration
+         for (int i = 0; i < ITER; i++) {
+             // t2 = p^t * (A @ p)
+             y_cpu = matrixMult(A_cpu, p_cpu);
+             t2_cpu = 0;
+             for (int j = 0; j < config.size; j++) t2_cpu += (p_cpu[j] * y_cpu[j]);
+
+             alpha_cpu = t1_cpu / t2_cpu;
+             t1_old_cpu = t1_cpu;
+
+             // x = x + alpha * p; r = r - alpha * y
+             for (int j = 0; j < config.size; j++) {
+                 x_cpu[j] += alpha_cpu * p_cpu[j];
+                 r_cpu[j] -= alpha_cpu * y_cpu[j];
+             }
+
+             // t1 = r^t * r, recomputed from scratch as in the GPU version
+             t1_cpu = 0;
+             for (int j = 0; j < config.size; j++) t1_cpu += (r_cpu[j] * r_cpu[j]);
+
+             beta_cpu = t1_cpu / t1_old_cpu;
+
+             // p = r + beta * p
+             for (int j = 0; j < config.size; j++) p_cpu[j] = r_cpu[j] + beta_cpu * p_cpu[j];
+         }
+
+         // Compute the CPU result: y = b - A @ x, mirroring the final GPU step
+         tmp = matrixMult(A_cpu, x_cpu);
+         for (int i = 0; i < config.size; i++) y_cpu[i] = b_cpu[i] - tmp[i];
+
+         float sum = 0;
+         for (int i = 0; i < 10; i++) {
+             sum += y_cpu[i];
+         }
+
+         benchmarkResults.setCurrentCpuResult(sum);
+         assertEquals(benchmarkResults.currentCpuResult(), benchmarkResults.currentGpuResult(), 1e-3);
+     }
+
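+     // Dense matrix-vector product on the host: res = a @ b;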
+     private float[] matrixMult(float[][] a, float[] b) {
+         float[] res = new float[a.length];
+         float tempSum;
+
+         for (int r = 0; r < a.length; r++) {
+             tempSum = 0;
+             for (int k = 0; k < b.length; k++) {
+                 tempSum += a[r][k] * b[k];
+             }
+             res[r] = tempSum;
+         }
+         return res;
+     }
+ }