Skip to content

Commit

Permalink
Merge pull request #5 from giulange/master
Browse files Browse the repository at this point in the history
cuda kernel now working
  • Loading branch information
Simone Giannecchini committed Sep 25, 2013
2 parents 188d1bf + 63747b7 commit e87912c
Showing 1 changed file with 63 additions and 30 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -98,14 +98,21 @@ public void run() {
assert dataRef.length==rect.width*rect.height;
assert dataCurrent.length==rect.width*rect.height;

System.out.println("Calling JCUDA tileX:"+tileX+" tileY:"+tileY);
// call CUDA and get result
// I am expecting the host_oMap as the first array and host_chMat as the second array
final List<int[]> result=JCudaChangeMat(dataRef,dataCurrent);
System.out.println("Cuda kernels work fine !!");
System.out.println("");

// build output image and save
System.out.println("build output image 1");
final BufferedImage biImage= new BufferedImage(rect.width, rect.height, BufferedImage.TYPE_BYTE_GRAY);
System.out.println("build output image 2");
final DataBufferInt dbFinal= new DataBufferInt(result.get(0),result.get(0).length);
System.out.println("build output image 3");
final Raster finalR= RasterFactory.createRaster(biImage.getSampleModel(), dbFinal, new Point(0,0));
System.out.println("build output image 4");
biImage.setData(finalR);
try {
//ImageIO.write(biImage, "tiff", new File("d:/data/unina/test/row"+tileY+"_col"+tileX+"_"+".tif"));
Expand All @@ -131,14 +138,14 @@ public void run() {
// allows the
// Java Hotspot to compile the code. Then the other NUM_CYCLES_BENCH cycles
// are calculated
private static final int NUM_CYCLES_BENCH = 3;
private static final int NUM_CYCLES_BENCH = 1;//=3
private static final int NUM_CYCLES_WARM = 1;

private RenderedOp current;

private RenderedOp reference;

private final static int DEFAULT_THREAD_NUMBER = 10;
private final static int DEFAULT_THREAD_NUMBER = 1;

@Before
public void init() {
Expand All @@ -150,9 +157,9 @@ public void init() {
* "clc2006_L3_100m.tif");
*/
final File file0 = new File(REFERENCE_PATH_FOR_TESTS,
"clc2000_L3_100m");
"clc2000_L3_100m.tif");
final File file6 = new File(REFERENCE_PATH_FOR_TESTS,
"clc2006_L3_100m");
"clc2006_L3_100m.tif");

if(!file0.exists()||!file0.canRead()||!file6.exists()||!file6.canRead()){
throw new IllegalArgumentException("Input files are not present!");
Expand Down Expand Up @@ -183,10 +190,7 @@ public void init() {
current = JAI.create("ImageRead", pbj, hints);

pbj.setParameter("Input", file6);
reference = JAI.create("ImageRead", pbj, hints);



reference = JAI.create("ImageRead", pbj, hints);
}


Expand All @@ -201,20 +205,20 @@ public void testCUDA() throws Exception {
final int minTileY=reference.getMinTileY();



//System.out.println(numTileX*numTileY);
final ExecutorService ex = Executors.newFixedThreadPool(DEFAULT_THREAD_NUMBER);
final CountDownLatch sem = new CountDownLatch(numTileX * numTileY);
System.out.println();
// cycle on tiles to call the CUDA code
for(int i=minTileY;i<minTileY+numTileY;i++){
for(int j=minTileX;j<minTileX+numTileX;j++){

ex.execute(new MyRunnable(j, i,sem));
//for(int i=minTileY;i<minTileY+numTileY;i++){
for(int i=1;i<2;i++){
//for(int j=minTileX;j<minTileX+numTileX;j++){
for(int j=1;j<2;j++){
ex.execute(new MyRunnable(j, i, sem));
}

}
sem.await(10,TimeUnit.MINUTES);
ex.shutdown();

}


Expand All @@ -224,8 +228,7 @@ public void testCUDA() throws Exception {
* @param host_iMap2 the current data
* @return a list of int arrays containing the results: host_oMap first, host_chMat second
*/
private List<int[]> JCudaChangeMat(byte[] host_iMap1,
byte[] host_iMap2)
private List<int[]> JCudaChangeMat(byte[] host_iMap1,byte[] host_iMap2)
{
//return Arrays.asList(dataRef,dataCurrent);

Expand Down Expand Up @@ -261,12 +264,19 @@ private List<int[]> JCudaChangeMat(byte[] host_iMap1,
int crossdim = 45;
// ----
// opt function for different SIZEs
/*
int tiledimX = 150;
int tiledimY = 158;
int ntilesX = 86;
int ntilesY = 64;
*/
int tiledimX = 16;
int tiledimY = 16;
int ntilesX = 2;
int ntilesY = 2;
// ----
int mapsize = tiledimX * tiledimY * Integer.SIZE;
int mapsizeb = tiledimX * tiledimY * Byte.SIZE;
// change iMap or data*? about (i) type of data, (ii) string, (iii) duplication of data
// host_iMap1 = dataRef;
// host_iMap2 = dataCurrent;
Expand All @@ -280,42 +290,54 @@ private List<int[]> JCudaChangeMat(byte[] host_iMap1,
// TODO: where should the .ptx file be placed?

// Initialize the driver and create a context for the first device.
//System.out.println("Initializing driver:");
//System.out.println(" -cuInit(0)");
cuInit(0);
//System.out.println(" -CUdevice()");
CUdevice device = new CUdevice();
cuDeviceGet(device, 0);
//System.out.println(" -CUcontext()");
CUcontext context = new CUcontext();
cuCtxCreate(context, 0, device);

// Load the ptx file.
//System.out.println("Loading ptx FILE...");
CUmodule module = new CUmodule();
cuModuleLoad(module, ptxFileName);

// Obtain function handles for the "changemap" and "changemat" kernels.
//System.out.println("changemap MOD");
CUfunction changemap = new CUfunction();
cuModuleGetFunction(changemap, module, "changemap");
//System.out.println("...here...");
cuModuleGetFunction(changemap, module, "_Z9changemapPKhS0_iiiiiPjS1_");
//System.out.println("changemat MOD");
CUfunction changemat = new CUfunction();
cuModuleGetFunction(changemat, module, "changemat");
cuModuleGetFunction(changemat, module, "_Z9changematPjii");

// Allocate the device input data, and copy the
// host input data to the device
//System.out.println("dev_iMap1");
CUdeviceptr dev_iMap1 = new CUdeviceptr();
cuMemAlloc(dev_iMap1, mapsize );
cuMemcpyHtoD(dev_iMap1, Pointer.to(host_iMap1), mapsize);
cuMemAlloc(dev_iMap1, mapsizeb );
cuMemcpyHtoD(dev_iMap1, Pointer.to(host_iMap1), mapsizeb);
//System.out.println("dev_iMap2");
CUdeviceptr dev_iMap2 = new CUdeviceptr();
cuMemAlloc(dev_iMap2, mapsize );
cuMemcpyHtoD(dev_iMap2, Pointer.to(host_iMap2), mapsize);
cuMemAlloc(dev_iMap2, mapsizeb );
cuMemcpyHtoD(dev_iMap2, Pointer.to(host_iMap2), mapsizeb);

// Allocate device output memory
//System.out.println("dev_oMap");
CUdeviceptr dev_oMap = new CUdeviceptr();
cuMemAlloc(dev_oMap, mapsize);
//System.out.println("dev_chMat");
CUdeviceptr dev_chMat = new CUdeviceptr();
cuMemAlloc(dev_chMat, crossdim * crossdim * ntilesX * ntilesY);


System.out.println("first kernel");
// Set up the kernel parameters: A pointer to an array
// of pointers which point to the actual values.
Pointer kernelParameters1 = Pointer.to(
//Pointer.to(new int[]{numElements}), // DEFINE IT?!
Pointer.to(dev_iMap1),
Pointer.to(dev_iMap2),
Pointer.to(new int[]{tiledimX}),
Expand All @@ -327,35 +349,45 @@ private List<int[]> JCudaChangeMat(byte[] host_iMap1,
Pointer.to(dev_oMap)
);

//System.out.println("pointers done");
// Call the kernel function.
int blockSizeX = 1;
int blockSizeY = 1;
int blockSizeZ = 1;
int gridSizeX = 75;
int gridSizeY = 75;
int gridSizeZ = 1;
cuLaunchKernel(changemap,
//System.out.println("launch cuda kernel");
int status_k1 = cuLaunchKernel(changemap,
gridSizeX, blockSizeY, blockSizeZ, // Grid dimension
blockSizeX, gridSizeY, gridSizeZ, // Block dimension
0, null, // Shared memory size and stream
kernelParameters1, null // Kernel- and extra parameters
);
cuCtxSynchronize();
System.out.println(" k1 = "+status_k1);
//System.out.println("synchro");
int status_syn1 = cuCtxSynchronize();
System.out.println(" synchro_1 = "+status_syn1);

System.out.println("second kernel");
// Set up the kernel parameters: A pointer to an array
// of pointers which point to the actual values.
Pointer kernelParameters2 = Pointer.to(
Pointer.to(dev_chMat),
Pointer.to(new int[]{crossdim * crossdim}),
Pointer.to(new int[]{ntilesX * ntilesY})
);
cuLaunchKernel(changemat,
);
//System.out.println("pointers done");
//System.out.println("launch cuda kernel");
int status_k2 = cuLaunchKernel(changemat,
gridSizeX, blockSizeY, blockSizeZ, // Grid dimension
blockSizeX, gridSizeY, gridSizeZ, // Block dimension
0, null, // Shared memory size and stream
kernelParameters2, null // Kernel- and extra parameters
);
cuCtxSynchronize();
System.out.println(" k2 = "+status_k2);
int status_syn2 = cuCtxSynchronize();
System.out.println(" synchro_2 = "+status_syn2);

// Allocate host output memory and copy the device output
// to the host.
Expand All @@ -369,6 +401,7 @@ private List<int[]> JCudaChangeMat(byte[] host_iMap1,
cuMemFree(dev_iMap2);
cuMemFree(dev_oMap);
cuMemFree(dev_chMat);
//System.out.println("...here...");
return Arrays.asList(host_oMap,host_chMat);
}
}

0 comments on commit e87912c

Please sign in to comment.