Commit ca826df

[GRCUDA-hotfix] move B9M into the Java benchmark suite (#52)
* Add B9M + updated config files
* Minor fix - deleted replicated B9M entry
1 parent 84b07b4 commit ca826df

File tree

3 files changed: +387 -4 lines changed
@@ -0,0 +1,375 @@
/*
 * Copyright (c) 2022 NECSTLab, Politecnico di Milano. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NECSTLab nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *  * Neither the name of Politecnico di Milano nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

package it.necst.grcuda.benchmark.bench;

import it.necst.grcuda.benchmark.Benchmark;
import it.necst.grcuda.benchmark.BenchmarkConfig;
import org.graalvm.polyglot.Value;

import java.util.Random;

import static org.junit.Assert.assertEquals;

public class B9M extends Benchmark {
    /*
        Compute the conjugate gradient algorithm on a dense symmetric matrix.
        The matrix-vector multiplications are row-partitioned to scale across multiple GPUs;
    */
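    // Recurrences of the conjugate gradient method, as implemented in runTest():
    //   alpha = (r^T r) / (p^T A p)
    //   x     = x + alpha * p
    //   r     = r - alpha * (A p)
    //   beta  = (r_new^T r_new) / (r_old^T r_old)
    //   p     = r + beta * p
    // On the device, t1 accumulates r^T r and t2 accumulates p^T A p.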

    private static final String PRECONDITION_KERNEL = "" +
            "// Add a small epsilon to the main diagonal:\n" +
            "extern \"C\" __global__ void precondition(float *A, int n, int m, int offset) {\n" +
            "    for(int i = blockIdx.x * blockDim.x + threadIdx.x; i < m; i += blockDim.x * gridDim.x) {\n" +
            "        A[i * n + i + offset] += 1e-12;\n" +
            "    }\n" +
            "}";

    private static final String MMUL_KERNEL = "" +
            "// z = x @ y;\n" +
            "extern \"C\" __global__ void matrix_vector_mult(const float* x, const float* y, float* z, int n, int m, int z_offset) {\n" +
            "    for(int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {\n" +
            "        float sum = 0;\n" +
            "        for (int j = 0; j < m; j++) {\n" +
            "            sum += x[i * m + j] * y[j];\n" +
            "        }\n" +
            "        z[z_offset + i] = sum;\n" +
            "    }\n" +
            "}\n" +
            "// z := w + alpha * A @ y;\n" +
            "extern \"C\" __global__ void matrix_vector_mult_axpy(const float* x, const float* y, const float *w, const float alpha, float* z, int n, int m, int z_offset) {\n" +
            "    for(int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {\n" +
            "        float sum = 0;\n" +
            "        for (int j = 0; j < m; j++) {\n" +
            "            sum += x[i * m + j] * y[j];\n" +
            "        }\n" +
            "        z[z_offset + i] = alpha * sum + w[z_offset + i];\n" +
            "    }\n" +
            "}";

    private static final String DP_KERNEL = "" +
            "__inline__ __device__ float warp_reduce(float val) {\n" +
            "    int warp_size = 32;\n" +
            "    for (int offset = warp_size / 2; offset > 0; offset /= 2)\n" +
            "        val += __shfl_down_sync(0xFFFFFFFF, val, offset);\n" +
            "    return val;\n" +
            "}\n" +
            "// z = <x, x>;\n" +
            "extern \"C\" __global__ void l2_norm(const float *x, float* z, int N) {\n" +
            "    int warp_size = 32;\n" +
            "    float sum = float(0);\n" +
            "    for(int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n" +
            "        float x_tmp = x[i];\n" +
            "        sum += x_tmp * x_tmp;\n" +
            "    }\n" +
            "    sum = warp_reduce(sum); // Obtain the sum of values in the current warp;\n" +
            "    if ((threadIdx.x & (warp_size - 1)) == 0) // Same as (threadIdx.x % warp_size) == 0 but faster\n" +
            "        atomicAdd(z, sum); // The first thread in the warp updates the output;\n" +
            "}\n" +
            "// z = <x, y>;\n" +
            "extern \"C\" __global__ void dot(const float *x, const float *y, float* z, int N) {\n" +
            "    int warp_size = 32;\n" +
            "    float sum = float(0);\n" +
            "    for(int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n" +
            "        sum += x[i] * y[i];\n" +
            "    }\n" +
            "    sum = warp_reduce(sum); // Obtain the sum of values in the current warp;\n" +
            "    if ((threadIdx.x & (warp_size - 1)) == 0) // Same as (threadIdx.x % warp_size) == 0 but faster\n" +
            "        atomicAdd(z, sum); // The first thread in the warp updates the output;\n" +
            "}";

    private static final String SAXPY_KERNEL = "" +
            "// y = val + alpha * x;\n" +
            "extern \"C\" __global__ void saxpy(float* y, const float *val, const float *x, float alpha, int n) {\n" +
            "    for(int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {\n" +
            "        y[i] = val[i] + alpha * x[i];\n" +
            "    }\n" +
            "}\n" +
            "// Simply copy array x into y;\n" +
            "extern \"C\" __global__ void cpy(float *y, const float *x, int n) {\n" +
            "    for(int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {\n" +
            "        y[i] = x[i];\n" +
            "    }\n" +
            "}";

    private Value precondition_kernel, mmul_kernel, mmul_axpy_kernel, l2_norm_kernel, dp_kernel, saxpy_kernel, copy_kernel, initialize_random_symmetric_matrix;
    private Value[] A;
    private Value x, b, p, r, y, t1, t2;
    private int S;

    private final int P = 16;
    private final int ITER = 50;

    public B9M(BenchmarkConfig currentConfig) {
        super(currentConfig);

        this.S = 0;
        this.A = new Value[this.P];
        for (int i = 0; i < this.P; i++) this.A[i] = null;
        this.x = null;
        this.b = null;
        this.p = null;
        this.r = null;
        this.y = null;
        this.t1 = null;
        this.t2 = null;

        this.mmul_axpy_kernel = null;
        this.mmul_kernel = null;
        this.l2_norm_kernel = null;
        this.dp_kernel = null;
        this.saxpy_kernel = null;
        this.copy_kernel = null;
    }

    @Override
    public void allocateTest(int iteration) {
        this.S = Math.floorDiv(config.size + this.P - 1, this.P);

        // Allocate vectors
        for (int i = 0; i < this.P; i++)
            this.A[i] = requestArray("float", this.S * config.size);
        this.x = requestArray("float", config.size);
        this.b = requestArray("float", config.size);
        this.p = requestArray("float", config.size);
        this.r = requestArray("float", config.size);
        this.y = requestArray("float", config.size);
        this.t1 = requestArray("float", 1);
        this.t2 = requestArray("float", 1);

        // Build the kernels
        Value buildKernel = context.eval("grcuda", "buildkernel");

        this.precondition_kernel = buildKernel.execute(PRECONDITION_KERNEL, "precondition", "pointer, sint32, sint32, sint32");
        this.mmul_kernel = buildKernel.execute(MMUL_KERNEL, "matrix_vector_mult", "const pointer, const pointer, const pointer, sint32, sint32, sint32");
        this.mmul_axpy_kernel = buildKernel.execute(MMUL_KERNEL, "matrix_vector_mult_axpy", "const pointer, const pointer, pointer, float, const pointer, sint32, sint32, sint32");
        this.l2_norm_kernel = buildKernel.execute(DP_KERNEL, "l2_norm", "const pointer, pointer, sint32");
        this.dp_kernel = buildKernel.execute(DP_KERNEL, "dot", "const pointer, pointer, pointer, sint32");
        this.saxpy_kernel = buildKernel.execute(SAXPY_KERNEL, "saxpy", "pointer, const pointer, const pointer, float, sint32");
        this.copy_kernel = buildKernel.execute(SAXPY_KERNEL, "cpy", "pointer, pointer, sint32");
        this.initialize_random_symmetric_matrix = context.eval("js", "(X, S, N) => { \n" +
                "    for (let i = 0; i < N; i++) {\n" +
                "        s = (i / S) >> 0;\n" +
                "        k = i % S;\n" +
                "        Xs = X[s];\n" +
                "        i_N = k * N;\n" +
                "        for (let j = i; j < N; j++) {\n" +
                "            val = 2 * Math.random() - 1;\n" +
                "            Xs[i_N + j] = val;\n" +
                "            X[(j / S) >> 0][(j % S) * N + i] = val;\n" +
                "        }\n" +
                "    }}");
    }

    @Override
    public void initializeTest(int iteration) {
        this.initialize_random_symmetric_matrix.execute(this.A, this.S, config.size);
    }

    @Override
    public void resetIteration(int iteration) {
        // Reset result
        for (int i = 0; i < config.size; i++)
            this.x.setArrayElement(i, 1.0 / config.size);
        this.t1.setArrayElement(0, 0.0);
        this.t2.setArrayElement(0, 0.0);
    }

    @Override
    public void runTest(int iteration) {
        long start_comp = System.nanoTime();
        long end;

        // Initialization phase
        // precondition: A += I * np.eps;
        for (int i = 0; i < this.P; i++) {
            this.precondition_kernel.execute(config.numBlocks, config.blockSize1D).
                    execute(this.A[i], config.size, Math.min(this.S, config.size - i * this.S), i * this.S);
        }

        // r = b - A * x
        for (int i = 0; i < this.P; i++) {
            this.mmul_axpy_kernel.execute(config.numBlocks, config.blockSize1D).
                    execute(this.A[i], this.x, this.b, -1, this.r, this.S, config.size, i * this.S);
        }

        // p = r
        this.copy_kernel.execute(config.numBlocks, config.blockSize1D).
                execute(this.p, this.r, config.size);

        // t1 = r^t * r
        this.l2_norm_kernel.execute(config.numBlocks, config.blockSize1D).
                execute(this.r, this.t1, config.size);

        for (int curr_iter = 0; curr_iter < this.ITER; curr_iter++) {
            // t2 = p^t * A * p
            for (int i = 0; i < this.P; i++) {
                this.mmul_kernel.execute(config.numBlocks, config.blockSize1D).
                        execute(this.A[i], this.p, this.y, this.S, config.size, i * this.S);
            }
            this.dp_kernel.execute(config.numBlocks, config.blockSize1D).
                    execute(this.p, this.y, this.t2, config.size);

            float alpha = this.t1.getArrayElement(0).asFloat() / this.t2.getArrayElement(0).asFloat();
            float old_r_norm_squared = this.t1.getArrayElement(0).asFloat();
            this.t1.setArrayElement(0, 0);
            this.t2.setArrayElement(0, 0);

            // Update x: x = x + alpha * p
            this.saxpy_kernel.execute(config.numBlocks, config.blockSize1D).
                    execute(this.x, this.x, this.p, alpha, config.size);

            // r = r - alpha * y
            this.saxpy_kernel.execute(config.numBlocks, config.blockSize1D).
                    execute(this.r, this.r, this.y, -1 * alpha, config.size);

            // t1 = r^t * r
            this.l2_norm_kernel.execute(config.numBlocks, config.blockSize1D).
                    execute(this.r, this.t1, config.size);

            float beta = this.t1.getArrayElement(0).asFloat() / old_r_norm_squared;

            this.saxpy_kernel.execute(config.numBlocks, config.blockSize1D).
                    execute(this.p, this.r, this.p, beta, config.size);
        }

        // Add final sync step
        float tmp = x.getArrayElement(0).asFloat();
        end = System.nanoTime();

        benchmarkResults.setCurrentComputationSec((end - start_comp) / 1000000000F);

        // Compute GPU result
        for (int i = 0; i < this.P; i++) {
            this.mmul_axpy_kernel.execute(config.numBlocks, config.blockSize1D).
                    execute(this.A[i], this.x, this.b, -1, this.y, Math.min(this.S, config.size - i * this.S), config.size, i * this.S);
        }

        float sum = 0;
        for (int i = 0; i < 10; i++)
            sum += this.y.getArrayElement(i).asFloat();

        benchmarkResults.setCurrentGpuResult(sum);
    }

    @Override
    public void cpuValidation() {
        float[][] A_cpu = new float[config.size][config.size];
        float[] b_cpu = new float[config.size];
        float[] x_cpu_1 = new float[config.size];
        float[] x_cpu = new float[config.size];
        float[] r_cpu = new float[config.size];
        float[] p_cpu = new float[config.size];
        float[] y_cpu = new float[config.size];
        float[] tmp;
        float t1_cpu = 0;
        float t2_cpu = 0;
        float alpha_cpu;
        float beta_cpu;
        float t1_old_cpu;

        for (int i = 0; i < config.size; i++) x_cpu_1[i] = 0;

        int p_counter;
        for (int i = 0; i < config.size; i++) {
            p_counter = Math.floorDiv(i, this.S);
            for (int j = 0; j < config.size; j++)
                A_cpu[i][j] = this.A[p_counter].getArrayElement((i % this.S) * config.size + j).asFloat();
        }

        // System.out.println("Matrix test A-CPU");
        // System.out.println("Matrix A-CPU -> rowSize: " + A_cpu.length + "; colSize: " + A_cpu[0].length);
        // for (int r=0; r<config.size; r++) {
        //     System.out.print('|');
        //     for (int c=0; c<config.size; c++) {
        //         System.out.print(A_cpu[r][c] + "\t| ");
        //     }
        //     System.out.print('\n');
        // }

        Random rd = new Random();
        for (int i = 0; i < config.size; i++) b_cpu[i] = rd.nextFloat();

        for (int i = 0; i < config.size; i++) x_cpu[i] = 1;

        tmp = matrixMult(A_cpu, x_cpu);
        for (int i = 0; i < config.size; i++) r_cpu[i] = b_cpu[i] - tmp[i];

        for (int i = 0; i < config.size; i++) p_cpu[i] = r_cpu[i];

        for (int i = 0; i < config.size; i++) t1_cpu += (r_cpu[i] * r_cpu[i]);

        // Main iteration
        for (int i = 0; i < ITER; i++) {
            y_cpu = matrixMult(A_cpu, p_cpu);

            for (int j = 0; j < config.size; j++) t2_cpu += (p_cpu[j] * y_cpu[j]);

            alpha_cpu = t1_cpu / t2_cpu;
            t1_old_cpu = t1_cpu;
            // Mirror the GPU loop, which zeroes t1 and t2 before they are accumulated again
            t1_cpu = 0;
            t2_cpu = 0;
            for (int j = 0; j < config.size; j++) {
                x_cpu[j] += alpha_cpu * p_cpu[j];
                r_cpu[j] -= alpha_cpu * y_cpu[j];
            }

            for (int j = 0; j < config.size; j++) t1_cpu += (r_cpu[j] * r_cpu[j]);

            beta_cpu = t1_cpu / t1_old_cpu;

            for (int j = 0; j < config.size; j++) p_cpu[j] = r_cpu[j] + beta_cpu * p_cpu[j];
        }

        // System.out.println(" CPU - y pre sum ");
        // for (int i=0; i < config.size; i++) {System.out.print(y_cpu[i] + " % ");}
        // System.out.print("\n");
        float sum = 0;
        for (int i = 0; i < 10; i++) {
            sum += y_cpu[i];
        }

        benchmarkResults.setCurrentCpuResult(sum);
        assertEquals(benchmarkResults.currentCpuResult(), benchmarkResults.currentGpuResult(), 1e-3);
    }

    private float[] matrixMult(float[][] a, float[] b) {
        float[] res = new float[a.length];
        float tempSum;

        for (int r = 0; r < a.length; r++) {
            tempSum = 0;
            for (int k = 0; k < b.length; k++) {
                tempSum += a[r][k] * b[k];
            }
            res[r] = tempSum;
        }
        return res;
    }
}
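For orientation, below is a minimal sketch of how the benchmark's lifecycle methods fit together. The BenchmarkConfig construction and the concrete values are illustrative assumptions, not the suite's documented API; only size, numBlocks and blockSize1D are actually read by B9M, and in the real suite the Benchmark base class drives these calls.

// Hypothetical usage sketch - the config setup below is an assumption for illustration.
BenchmarkConfig config = new BenchmarkConfig();
config.size = 1024;         // rows/columns of the dense symmetric matrix
config.numBlocks = 32;      // grid size used for every kernel launch
config.blockSize1D = 256;   // threads per block

B9M bench = new B9M(config);
bench.allocateTest(0);      // allocate device arrays and build the GrCUDA kernels
bench.initializeTest(0);    // fill A with a random symmetric matrix
bench.resetIteration(0);    // every entry of x set to 1/N, t1 = t2 = 0
bench.runTest(0);           // 50 CG iterations, timed
bench.cpuValidation();      // compare against the sequential CPU implementation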
