diff --git a/Makefile.am b/Makefile.am
index e994ab5f4..0baa048fd 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -57,6 +57,7 @@ izhikevich_psc_exp_5s.h \
 izhikevich_psc_exp.h \
 locate.h \
 multimeter.h \
+nested_loop.h \
 nestgpu.h \
 neuron_models.h \
 ngpu_exception.h \
@@ -116,6 +117,7 @@ izhikevich_psc_exp_5s.cu \
 izhikevich_psc_exp.cu \
 locate.cu \
 multimeter.cu \
+nested_loop.cu \
 nestgpu.cu \
 neuron_models.cu \
 node_group.cu \
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index b721b22db..72c224a54 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -64,6 +64,7 @@ set ( nestgpu_sources
     izhikevich_psc_exp.h
     locate.h
     multimeter.h
+    nested_loop.h
     nestgpu_C.h
     nestgpu.h
     neuron_models.h
@@ -120,9 +121,9 @@ set ( nestgpu_sources
     izhikevich_psc_exp_2s.cu
     izhikevich_psc_exp_5s.cu
     izhikevich_psc_exp.cu
-    locate.cu
     multimeter.cu
     nestgpu.cu
+    nested_loop.cu
     neuron_models.cu
     node_group.cu
    parrot_neuron.cu
diff --git a/src/get_spike.cu b/src/get_spike.cu
index 7acc5050c..2090eade0 100644
--- a/src/get_spike.cu
+++ b/src/get_spike.cu
@@ -57,7 +57,7 @@ __device__ double atomicAddDouble(double* address, double val)
 //////////////////////////////////////////////////////////////////////
 // This is the function called by the nested loop
 // that collects the spikes
-__device__ void CollectSpikeFunction(int i_spike, int i_syn)
+__device__ void NestedLoopFunction0(int i_spike, int i_syn)
 {
   int i_source = SpikeSourceIdx[i_spike];
   int i_conn = SpikeConnIdx[i_spike];
@@ -95,20 +95,6 @@ __device__ void CollectSpikeFunction(int i_spike, int i_syn)
   }
   ////////////////////////////////////////////////////////////////
 }
-
-//////////////////////////////////////////////////////////////////////
-__global__ void CollectSpikeKernel(int n_spikes, int *SpikeTargetNum)
-{
-  const int i_spike = blockIdx.x;
-  if (i_spike<n_spikes) {
-    const int n_spike_targets = SpikeTargetNum[i_spike];
-    for (int i_syn=threadIdx.x; i_syn<n_spike_targets;
-         i_syn+=blockDim.x) {
-      CollectSpikeFunction(i_spike, i_syn);
-    }
-  }
-}
@@ -116,14 +100,0 @@
-__device__ int locate(int val, int *data, int n)
-{
-  int i_left = 0;
-  int i_right = n-1;
-  int i = (i_left+i_right)/2;
-  while(i_right-i_left>1) {
-    if (data[i] > val) i_right = i;
-    else if (data[i] < val) i_left = i;
-    else break;
-    i = (i_left+i_right)/2;
-  }
-
-  return i;
-}
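The build-system hunks and the get_spike.cu hunk above carry the core idea of the patch: the ad-hoc per-spike kernel (CollectSpikeKernel) and the local binary-search helper are deleted, and the device function that does the per-target work is renamed NestedLoopFunction0 so that the new nested_loop.cu can invoke it by index. The removed launch pattern dedicates one block of 1024 threads to every outer index, which leaves most of the block idle whenever a spike has few targets. A minimal standalone sketch of that pattern, for comparison with the replacement below (not part of the patch; PairAction and the array names are illustrative):

#include <cstdio>
#include <cuda_runtime.h>

// Stand-in for NestedLoopFunction0: one action per (outer, inner) pair.
__device__ void PairAction(int i_outer, int i_inner)
{
  printf("(%d,%d)\n", i_outer, i_inner);
}

// The removed pattern: one block per outer index, fixed-width inner sweep.
__global__ void OneBlockPerOuterKernel(int n_outer, const int *inner_num)
{
  const int i_outer = blockIdx.x;
  if (i_outer < n_outer) {
    // All 1024 threads sweep the inner range, even when inner_num[i_outer]
    // is tiny -- the load imbalance that the nested-loop code removes.
    for (int i_inner = threadIdx.x; i_inner < inner_num[i_outer];
         i_inner += blockDim.x) {
      PairAction(i_outer, i_inner);
    }
  }
}

int main()
{
  const int n_outer = 3;
  int h_inner[n_outer] = {2, 5, 1};
  int *d_inner;
  cudaMalloc(&d_inner, sizeof(h_inner));
  cudaMemcpy(d_inner, h_inner, sizeof(h_inner), cudaMemcpyHostToDevice);
  OneBlockPerOuterKernel<<<n_outer, 1024>>>(n_outer, d_inner);
  cudaDeviceSynchronize();
  cudaFree(d_inner);
  return 0;
}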
diff --git a/src/nested_loop.cu b/src/nested_loop.cu
new file mode 100644
--- /dev/null
+++ b/src/nested_loop.cu
@@ -0,0 +1,164 @@
+/*
+ * This file is part of NESTGPU.
+ *
+ * Copyright (C) 2021 The NEST Initiative
+ *
+ * NESTGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * NESTGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with NESTGPU. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+
+
+
+
+#include <iostream>
+#include <stdio.h>
+#include <stdlib.h>
+
+//#include "cuda_error_nl.h"
+#include "cuda_error.h"
+#include "nested_loop.h"
+
+//TMP
+#include "getRealTime.h"
+//
+
+//////////////////////////////////////////////////////////////////////
+// declare here the functions called by the nested loop
+__device__ void NestedLoopFunction0(int ix, int iy);
+__device__ void NestedLoopFunction1(int ix, int iy);
+//////////////////////////////////////////////////////////////////////
+
+namespace NestedLoop
+{
+  PrefixScan prefix_scan_;
+  int *d_Ny_cumul_sum_;
+}
+
+__device__ int locate(int val, int *data, int n)
+{
+  int i_left = 0;
+  int i_right = n-1;
+  int i = (i_left+i_right)/2;
+  while(i_right-i_left>1) {
+    if (data[i] > val) i_right = i;
+    else if (data[i] < val) i_left = i;
+    else break;
+    i = (i_left+i_right)/2;
+  }
+
+  return i;
+}
+
+__global__ void CumulSumNestedLoopKernel0(int Nx, int *Ny_cumul_sum,
+                                          int Ny_sum)
+{
+  int blockId = blockIdx.y * gridDim.x + blockIdx.x;
+  int array_idx = blockId * blockDim.x + threadIdx.x;
+  if (array_idx < Ny_sum) {
+    int ix = locate(array_idx, Ny_cumul_sum, Nx + 1);
+    int iy = (int)(array_idx - Ny_cumul_sum[ix]);
+    NestedLoopFunction0(ix, iy);
+  }
+}
+
+__global__ void CumulSumNestedLoopKernel1(int Nx, int *Ny_cumul_sum,
+                                          int Ny_sum)
+{
+  int blockId = blockIdx.y * gridDim.x + blockIdx.x;
+  int array_idx = blockId * blockDim.x + threadIdx.x;
+  if (array_idx < Ny_sum) {
+    int ix = locate(array_idx, Ny_cumul_sum, Nx + 1);
+    int iy = (int)(array_idx - Ny_cumul_sum[ix]);
+    NestedLoopFunction1(ix, iy);
+  }
+}
+
+int NestedLoop::Init()
+{
+  prefix_scan_.Init();
+  gpuErrchk(cudaMalloc(&d_Ny_cumul_sum_,
+                       PrefixScan::AllocSize*sizeof(int)));
+
+  return 0;
+}
+
+int NestedLoop::Run(int Nx, int *d_Ny, int i_func)
+{
+  return CumulSumNestedLoop(Nx, d_Ny, i_func);
+}
+
+int NestedLoop::Free()
+{
+  gpuErrchk(cudaFree(d_Ny_cumul_sum_));
+
+  return 0;
+}
+
+int NestedLoop::CumulSumNestedLoop(int Nx, int *d_Ny, int i_func)
+{
+  //TMP
+  //double time_mark=getRealTime();
+  //
+  prefix_scan_.Scan(d_Ny_cumul_sum_, d_Ny, Nx+1);
+  //TMP
+  //printf("pst: %lf\n", getRealTime()-time_mark);
+  //
+  int Ny_sum;
+  gpuErrchk(cudaMemcpy(&Ny_sum, &d_Ny_cumul_sum_[Nx],
+                       sizeof(int), cudaMemcpyDeviceToHost));
+
+  if (Ny_sum>0) {
+    int grid_dim_x, grid_dim_y;
+    if (Ny_sum<65536*1024) { // max grid dim * max block dim
+      grid_dim_x = (Ny_sum+1023)/1024;
+      grid_dim_y = 1;
+    }
+    else {
+      grid_dim_x = 32; // I think it's not necessary to increase it
+      if (Ny_sum>grid_dim_x*1024*65535) {
+        throw ngpu_exception(std::string("Ny sum ") + std::to_string(Ny_sum)
+                             + " larger than threshold "
+                             + std::to_string(grid_dim_x*1024*65535));
+      }
+      grid_dim_y = (Ny_sum + grid_dim_x*1024 -1) / (grid_dim_x*1024);
+    }
+    dim3 numBlocks(grid_dim_x, grid_dim_y);
+    //TMP
+    //double time_mark=getRealTime();
+    //
+    switch (i_func) {
+    case 0:
+      CumulSumNestedLoopKernel0<<<numBlocks, 1024>>>
+        (Nx, d_Ny_cumul_sum_, Ny_sum);
+      gpuErrchk(cudaPeekAtLastError());
+      gpuErrchk(cudaDeviceSynchronize());
+      break;
+    case 1:
+      CumulSumNestedLoopKernel1<<<numBlocks, 1024>>>
+        (Nx, d_Ny_cumul_sum_, Ny_sum);
+      gpuErrchk(cudaPeekAtLastError());
+      gpuErrchk(cudaDeviceSynchronize());
+      break;
+    default:
+      throw ngpu_exception("unknown nested loop function");
+    }
+
+    //TMP
+    //printf("cst: %lf\n", getRealTime()-time_mark);
+    //
+  }
+
+  return 0;
+}
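nested_loop.cu reduces the two-level loop over (spike, target) pairs to a single flat index: a prefix scan of the inner-loop sizes Ny yields a cumulative-sum array, one thread is launched per flat index, and locate() inverts the mapping by binary search, so warps stay full no matter how unbalanced the Ny values are. A self-contained sketch of the same technique (not code from the patch: the toy_* names are illustrative and the scan is done on the host instead of with PrefixScan):

#include <cstdio>
#include <cuda_runtime.h>

// Same binary search as locate() above: find ix such that
// cumul[ix] <= flat < cumul[ix+1].
__device__ int toy_locate(int val, const int *data, int n)
{
  int i_left = 0, i_right = n - 1, i = (i_left + i_right) / 2;
  while (i_right - i_left > 1) {
    if (data[i] > val) i_right = i;
    else if (data[i] < val) i_left = i;
    else break;
    i = (i_left + i_right) / 2;
  }
  return i;
}

// One thread per (ix, iy) pair of the flattened nested loop.
__global__ void toy_kernel(int Nx, const int *cumul, int Ny_sum)
{
  int flat = blockIdx.x * blockDim.x + threadIdx.x;
  if (flat < Ny_sum) {
    int ix = toy_locate(flat, cumul, Nx + 1);
    int iy = flat - cumul[ix];
    printf("pair (%d, %d)\n", ix, iy);  // stand-in for NestedLoopFunction0/1
  }
}

int main()
{
  const int Nx = 4;
  int Ny[Nx] = {3, 1, 2, 2};            // inner-loop sizes
  int cumul[Nx + 1] = {0};
  for (int i = 0; i < Nx; ++i) cumul[i + 1] = cumul[i] + Ny[i]; // exclusive scan
  int Ny_sum = cumul[Nx];               // 8 pairs in total

  int *d_cumul;
  cudaMalloc(&d_cumul, (Nx + 1) * sizeof(int));
  cudaMemcpy(d_cumul, cumul, (Nx + 1) * sizeof(int), cudaMemcpyHostToDevice);
  toy_kernel<<<(Ny_sum + 1023) / 1024, 1024>>>(Nx, d_cumul, Ny_sum);
  cudaDeviceSynchronize();
  cudaFree(d_cumul);
  return 0;
}

Dispatching on the small integer i_func (the switch in CumulSumNestedLoop above) keeps the device functions statically bound instead of relying on device-side function pointers; supporting a new loop body means adding a NestedLoopFunctionN declaration, a kernel, and a case label.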
diff --git a/src/nested_loop.cu.full b/src/nested_loop.cu.full
new file mode 100644
index 000000000..94b8594fe
--- /dev/null
+++ b/src/nested_loop.cu.full
@@ -0,0 +1,591 @@
+/*
+ * This file is part of NESTGPU.
+ *
+ * Copyright (C) 2021 The NEST Initiative
+ *
+ * NESTGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * NESTGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with NESTGPU. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+
+
+
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <math.h>
+#include <cub/cub.cuh>
+//#include <cuda.h>
+
+#include "cuda_error_nl.h"
+#include "nested_loop.h"
+
+//TMP
+#include "getRealTime.h"
+//
+
+//////////////////////////////////////////////////////////////////////
+// declare here the function called by the nested loop
+__device__ void NestedLoopFunction(int ix, int iy);
+//////////////////////////////////////////////////////////////////////
+
+namespace NestedLoop
+{
+  #include "Ny_th.h"
+  void *d_sort_storage_;
+  size_t sort_storage_bytes_;
+  void *d_reduce_storage_;
+  size_t reduce_storage_bytes_;
+
+  int Nx_max_;
+  int *d_max_Ny_;
+  int *d_sorted_Ny_;
+
+  int *d_idx_;
+  int *d_sorted_idx_;
+
+  int block_dim_x_;
+  int block_dim_y_;
+  int frame_area_;
+  float x_lim_;
+
+#ifdef WITH_CUMUL_SUM
+  PrefixScan prefix_scan_;
+  int *d_Ny_cumul_sum_;
+#endif
+
+}
+
+//////////////////////////////////////////////////////////////////////
+__global__ void SimpleNestedLoopKernel(int Nx, int *Ny)
+{
+  int ix = (blockIdx.x * blockDim.x) + threadIdx.x;
+  int iy = (blockIdx.y * blockDim.y) + threadIdx.y;
+  if (ix<Nx && iy<Ny[ix]) {
+    NestedLoopFunction(ix, iy);
+  }
+}
+
+//////////////////////////////////////////////////////////////////////
+__global__ void ParallelInnerNestedLoopKernel(int ix, int Ny)
+{
+  int iy = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (iy<Ny) {
+    NestedLoopFunction(ix, iy);
+  }
+}
+
+//////////////////////////////////////////////////////////////////////
+__global__ void ParallelOuterNestedLoopKernel(int Nx, int *d_Ny)
+{
+  int ix = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (ix<Nx) {
+    for (int iy=0; iy<d_Ny[ix]; iy++) {
+      NestedLoopFunction(ix, iy);
+    }
+  }
+}
+
+//////////////////////////////////////////////////////////////////////
+__global__ void Frame1DNestedLoopKernel(int ix0, int dim_x, int dim_y,
+                                        int *sorted_idx, int *sorted_Ny)
+{
+  int array_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (array_idx < dim_x*dim_y) {
+    int ix = ix0 + array_idx % dim_x;
+    int iy = array_idx / dim_x;
+    if (iy < sorted_Ny[ix]) {
+      NestedLoopFunction(sorted_idx[ix], iy);
+    }
+  }
+}
+
+//////////////////////////////////////////////////////////////////////
+__global__ void Frame2DNestedLoopKernel(int ix0, int dim_x, int dim_y,
+                                        int *sorted_idx, int *sorted_Ny)
+{
+  int ix = (blockIdx.x * blockDim.x) + threadIdx.x;
+  int iy = (blockIdx.y * blockDim.y) + threadIdx.y;
+  if (ix<dim_x && iy<sorted_Ny[ix0+ix]) {
+    NestedLoopFunction(sorted_idx[ix0+ix], iy);
+  }
+}
+
+//////////////////////////////////////////////////////////////////////
+__global__ void Smart1DNestedLoopKernel(int ix0, int iy0, int dim_x,
+                                        int dim_y, int *sorted_idx,
+                                        int *sorted_Ny)
+{
+  int array_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (array_idx < dim_x*dim_y) {
+    int ix = ix0 + array_idx % dim_x;
+    int iy = iy0 + array_idx / dim_x;
+    if (iy < sorted_Ny[ix]) {
+      NestedLoopFunction(sorted_idx[ix], iy);
+    }
+  }
+}
+
+//////////////////////////////////////////////////////////////////////
+__global__ void Smart2DNestedLoopKernel(int ix0, int iy0, int dim_x,
+                                        int dim_y, int *sorted_idx,
+                                        int *sorted_Ny)
+{
+  int ix = (blockIdx.x * blockDim.x) + threadIdx.x;
+  int iy = iy0 + (blockIdx.y * blockDim.y) + threadIdx.y;
+  if (ix<dim_x && iy<sorted_Ny[ix0+ix]) {
+    NestedLoopFunction(sorted_idx[ix0+ix], iy);
+  }
+}
+
+//////////////////////////////////////////////////////////////////////
+__device__ int locate(int val, int *data, int n)
+{
+  int i_left = 0;
+  int i_right = n-1;
+  int i = (i_left+i_right)/2;
+  while(i_right-i_left>1) {
+    if (data[i] > val) i_right = i;
+    else if (data[i] < val) i_left = i;
+    else break;
+    i = (i_left+i_right)/2;
+  }
+
+  return i;
+}
+
+#ifdef WITH_CUMUL_SUM
+//////////////////////////////////////////////////////////////////////
+__global__ void CumulSumNestedLoopKernel(int Nx, int *Ny_cumul_sum,
+                                         int Ny_sum)
+{
+  int blockId = blockIdx.y * gridDim.x + blockIdx.x;
+  int array_idx = blockId * blockDim.x + threadIdx.x;
+  if (array_idx < Ny_sum) {
+    int ix = locate(array_idx, Ny_cumul_sum, Nx + 1);
+    int iy = (int)(array_idx - Ny_cumul_sum[ix]);
+    NestedLoopFunction(ix, iy);
+  }
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////
+int NestedLoop::Init()
+{
+#ifdef WITH_CUMUL_SUM
+  prefix_scan_.Init();
+  CudaSafeCall(cudaMalloc(&d_Ny_cumul_sum_,
+                          PrefixScan::AllocSize*sizeof(int)));
+#endif
+
+  Nx_max_ = 65536*64;
+
+  CudaSafeCall(cudaMalloc(&d_max_Ny_, sizeof(int)));
+  CudaSafeCall(cudaMalloc(&d_sorted_Ny_, Nx_max_*sizeof(int)));
+  CudaSafeCall(cudaMalloc(&d_idx_, Nx_max_*sizeof(int)));
+  CudaSafeCall(cudaMalloc(&d_sorted_idx_, Nx_max_*sizeof(int)));
+
+  int *h_idx = new int[Nx_max_];
+  for(int i=0; i<Nx_max_; i++) {
+    h_idx[i] = i;
+  }
+  CudaSafeCall(cudaMemcpy(d_idx_, h_idx, Nx_max_*sizeof(int),
+                          cudaMemcpyHostToDevice));
+  delete[] h_idx;
+
+  block_dim_x_ = 32;
+  block_dim_y_ = 32;
+  frame_area_ = 65536*64;
+  x_lim_ = 0.75;
+
+  // determine temporary device storage requirements for sorting
+  d_sort_storage_ = NULL;
+  sort_storage_bytes_ = 0;
+  cub::DeviceRadixSort::SortPairs(d_sort_storage_, sort_storage_bytes_,
+                                  d_sorted_Ny_, d_sorted_Ny_, d_idx_,
+                                  d_sorted_idx_, Nx_max_);
+  CudaSafeCall(cudaMalloc(&d_sort_storage_, sort_storage_bytes_));
+
+  // determine temporary device storage requirements for reduction
+  d_reduce_storage_ = NULL;
+  reduce_storage_bytes_ = 0;
+  cub::DeviceReduce::Max(d_reduce_storage_, reduce_storage_bytes_,
+                         d_sorted_Ny_, d_max_Ny_, Nx_max_);
+  CudaSafeCall(cudaMalloc(&d_reduce_storage_, reduce_storage_bytes_));
+
+  return 0;
+}
+
+//////////////////////////////////////////////////////////////////////
+int NestedLoop::Free()
+{
+  CudaSafeCall(cudaFree(d_max_Ny_));
+  CudaSafeCall(cudaFree(d_sorted_Ny_));
+  CudaSafeCall(cudaFree(d_idx_));
+  CudaSafeCall(cudaFree(d_sorted_idx_));
+  CudaSafeCall(cudaFree(d_sort_storage_));
+  CudaSafeCall(cudaFree(d_reduce_storage_));
+#ifdef WITH_CUMUL_SUM
+  CudaSafeCall(cudaFree(d_Ny_cumul_sum_));
+#endif
+
+  return 0;
+}
+
+//////////////////////////////////////////////////////////////////////
+int NestedLoop::SimpleNestedLoop(int Nx, int *d_Ny)
+{
+  if (Nx <= 0) return 0;
+  // Find max value of Ny
+  cub::DeviceReduce::Max(d_reduce_storage_, reduce_storage_bytes_, d_Ny,
+                         d_max_Ny_, Nx);
+  int max_Ny;
+  CudaSafeCall(cudaMemcpy(&max_Ny, d_max_Ny_, sizeof(int),
+                          cudaMemcpyDeviceToHost));
+  if (max_Ny < 1) max_Ny = 1;
+  dim3 threadsPerBlock(block_dim_x_, block_dim_y_);
+  dim3 numBlocks((Nx - 1)/threadsPerBlock.x + 1,
+                 (max_Ny - 1)/threadsPerBlock.y + 1);
+  SimpleNestedLoopKernel<<<numBlocks, threadsPerBlock>>>(Nx, d_Ny);
+  cudaDeviceSynchronize();
+  CudaCheckError();
+
+  return 0;
+}
+
+//////////////////////////////////////////////////////////////////////
+int NestedLoop::ParallelInnerNestedLoop(int Nx, int *d_Ny)
+{
+  for (int ix=0; ix<Nx; ix++) {
+    int Ny;
+    CudaSafeCall(cudaMemcpy(&Ny, &d_Ny[ix], sizeof(int),
+                            cudaMemcpyDeviceToHost));
+    ParallelInnerNestedLoopKernel<<<(Ny+1023)/1024, 1024>>>(ix, Ny);
+    // CudaCheckError(); // uncomment only for debugging
+  }
+  cudaDeviceSynchronize();
+  CudaCheckError();
+
+  return 0;
+}
+
+//////////////////////////////////////////////////////////////////////
+int NestedLoop::ParallelOuterNestedLoop(int Nx, int *d_Ny)
+{
+  ParallelOuterNestedLoopKernel<<<(Nx+1023)/1024, 1024>>>(Nx, d_Ny);
+  cudaDeviceSynchronize();
+  CudaCheckError();
+
+  return 0;
+}
+
+//////////////////////////////////////////////////////////////////////
+int NestedLoop::Frame1DNestedLoop(int Nx, int *d_Ny)
+{
+  if (Nx <= 0) return 0;
+  int dim_x, dim_y;
+
+  // Run sorting operation
+  cub::DeviceRadixSort::SortPairs(d_sort_storage_, sort_storage_bytes_,
+                                  d_Ny, d_sorted_Ny_, d_idx_, d_sorted_idx_,
+                                  Nx);
+
+  int ix0 = Nx;
+  while(ix0>0) {
+    CudaSafeCall(cudaMemcpy(&dim_y, &d_sorted_Ny_[ix0-1], sizeof(int),
+                            cudaMemcpyDeviceToHost));
+    if (dim_y < 1) dim_y = 1;
+    dim_x = (frame_area_ - 1) / dim_y + 1;
+    ix0 -= dim_x;
+    if (ix0<0) {
+      dim_x += ix0;
+      ix0 = 0;
+    }
+    Frame1DNestedLoopKernel<<<(dim_x*dim_y+1023)/1024, 1024>>>
+      (ix0, dim_x, dim_y, d_sorted_idx_, d_sorted_Ny_);
+  }
+  cudaDeviceSynchronize();
+  CudaCheckError();
+
+  return 0;
+}
+//////////////////////////////////////////////////////////////////////
+int NestedLoop::Frame2DNestedLoop(int Nx, int *d_Ny)
+{
+  if (Nx <= 0) return 0;
+  // Sort the pairs (ix, Ny) with ix=0,..,Nx-1 in ascending order of Ny.
+  // After the sorting operation, d_sorted_idx_ are the reordered indexes ix
+  // and d_sorted_Ny_ are the sorted values of Ny
+  cub::DeviceRadixSort::SortPairs(d_sort_storage_, sort_storage_bytes_,
+                                  d_Ny, d_sorted_Ny_, d_idx_, d_sorted_idx_,
+                                  Nx);
+  int ix0 = Nx; // proceeds from right to left
+  while(ix0>0) {
+    int dim_x, dim_y; // width and height of the rectangular frame
+    CudaSafeCall(cudaMemcpy(&dim_y, &d_sorted_Ny_[ix0-1], sizeof(int),
+                            cudaMemcpyDeviceToHost));
+    if (dim_y < 1) dim_y = 1;
+    // frame_area_ is the fixed value of the rectangular frame area
+    dim_x = (frame_area_ - 1) / dim_y + 1; // width of the rectangular frame
+    ix0 -= dim_x; // update the index value
+    if (ix0<0) {
+      dim_x += ix0; // adjust the width if ix0<0
+      ix0 = 0;
+    }
+    dim3 threadsPerBlock(block_dim_x_, block_dim_y_); // block size
+    dim3 numBlocks((dim_x - 1)/threadsPerBlock.x + 1,
+                   (dim_y - 1)/threadsPerBlock.y + 1);
+    // run a nested loop kernel on the rectangular frame
+    Frame2DNestedLoopKernel<<<numBlocks, threadsPerBlock>>>
+      (ix0, dim_x, dim_y, d_sorted_idx_, d_sorted_Ny_);
+
+  }
+  cudaDeviceSynchronize();
+  CudaCheckError();
+
+  return 0;
+}
+
+//////////////////////////////////////////////////////////////////////
+int NestedLoop::Smart1DNestedLoop(int Nx, int *d_Ny)
+{
+  // Find max value of Ny
+  cub::DeviceReduce::Max(d_reduce_storage_, reduce_storage_bytes_, d_Ny,
+                         d_max_Ny_, Nx);
+  int max_Ny;
+  CudaSafeCall(cudaMemcpy(&max_Ny, d_max_Ny_, sizeof(int),
+                          cudaMemcpyDeviceToHost));
+  if (Nx <= 0) return 0;
+  float f_Nx = 2.0*log((float)Nx)-5;
+  int i_Nx = (int)floor(f_Nx);
+  int Ny_th;
+  if (i_Nx<0) {
+    Ny_th = Ny_th_arr_[0];
+  }
+  else if (i_Nx>=Ny_arr_size_-1) {
+    Ny_th = Ny_th_arr_[Ny_arr_size_-1];
+  }
+  else {
+    float t = f_Nx - (float)i_Nx;
+    Ny_th = Ny_th_arr_[i_Nx]*(1.0 - t) + Ny_th_arr_[i_Nx+1]*t;
+  }
+  if (max_Ny < Ny_th) {
+    return Frame1DNestedLoop(Nx, d_Ny);
+  }
+
+  int dim_x, dim_y;
+  int ix1 = (int)round(x_lim_*Nx);
+  if (ix1 > Nx-1) ix1 = Nx-1;
+  // sort the pairs (ix, Ny) in ascending order of Ny
+  cub::DeviceRadixSort::SortPairs(d_sort_storage_, sort_storage_bytes_,
+                                  d_Ny, d_sorted_Ny_, d_idx_, d_sorted_idx_,
+                                  Nx);
+  int Ny1;
+  CudaSafeCall(cudaMemcpy(&Ny1, &d_sorted_Ny_[ix1], sizeof(int),
+                          cudaMemcpyDeviceToHost));
+  if (Ny1 < 1) Ny1 = 1;
+  // round Ny1 up to a multiple of the block height, so that the bulk
+  // kernel below covers exactly the region iy < Ny1
+  Ny1 = ((Ny1 + block_dim_y_ - 1)/block_dim_y_)*block_dim_y_;
+  dim3 threadsPerBlock(block_dim_x_, block_dim_y_);
+  dim3 numBlocks((Nx - 1)/threadsPerBlock.x + 1, Ny1/threadsPerBlock.y);
+  SimpleNestedLoopKernel<<<numBlocks, threadsPerBlock>>>(Nx, d_Ny);
+  //CudaCheckError(); // uncomment only for debugging
+
+  int ix0 = Nx;
+  while(ix0>ix1) {
+    CudaSafeCall(cudaMemcpy(&dim_y, &d_sorted_Ny_[ix0-1], sizeof(int),
+                            cudaMemcpyDeviceToHost));
+    dim_y -= Ny1;
+    if (dim_y<=0) break;
+    dim_x = (frame_area_ - 1) / dim_y + 1;
+    ix0 -= dim_x;
+    if (ix0<ix1) {
+      dim_x += ix0 - ix1;
+      ix0 = ix1;
+    }
+    Smart1DNestedLoopKernel<<<(dim_x*dim_y+1023)/1024, 1024>>>
+      (ix0, Ny1, dim_x, dim_y, d_sorted_idx_, d_sorted_Ny_);
+    //CudaCheckError(); // uncomment only for debugging
+  }
+  cudaDeviceSynchronize();
+  CudaCheckError();
+
+  return 0;
+}
+
+//////////////////////////////////////////////////////////////////////
+int NestedLoop::Smart2DNestedLoop(int Nx, int *d_Ny)
+{
+  // Find max value of Ny
+  cub::DeviceReduce::Max(d_reduce_storage_, reduce_storage_bytes_, d_Ny,
+                         d_max_Ny_, Nx);
+  int max_Ny;
+  CudaSafeCall(cudaMemcpy(&max_Ny, d_max_Ny_, sizeof(int),
+                          cudaMemcpyDeviceToHost));
+  if (Nx <= 0) return 0;
+  float f_Nx = 2.0*log((float)Nx)-5;
+  int i_Nx = (int)floor(f_Nx);
+  int Ny_th;
+  if (i_Nx<0) {
+    Ny_th = Ny_th_arr_[0];
+  }
+  else if (i_Nx>=Ny_arr_size_-1) {
+    Ny_th = Ny_th_arr_[Ny_arr_size_-1];
+  }
+  else {
+    float t = f_Nx - (float)i_Nx;
+    Ny_th = Ny_th_arr_[i_Nx]*(1.0 - t) + Ny_th_arr_[i_Nx+1]*t;
+  }
+  if (max_Ny < Ny_th) {
+    return Frame2DNestedLoop(Nx, d_Ny);
+  }
+
+  int dim_x, dim_y;
+  int ix1 = (int)round(x_lim_*Nx);
+  if (ix1 > Nx-1) ix1 = Nx-1;
+  // sort the pairs (ix, Ny) in ascending order of Ny
+  cub::DeviceRadixSort::SortPairs(d_sort_storage_, sort_storage_bytes_,
+                                  d_Ny, d_sorted_Ny_, d_idx_, d_sorted_idx_,
+                                  Nx);
+  int Ny1;
+  CudaSafeCall(cudaMemcpy(&Ny1, &d_sorted_Ny_[ix1], sizeof(int),
+                          cudaMemcpyDeviceToHost));
+  if (Ny1 < 1) Ny1 = 1;
+  // round Ny1 up to a multiple of the block height, so that the bulk
+  // kernel below covers exactly the region iy < Ny1
+  Ny1 = ((Ny1 + block_dim_y_ - 1)/block_dim_y_)*block_dim_y_;
+  dim3 threadsPerBlock(block_dim_x_, block_dim_y_);
+  dim3 numBlocks((Nx - 1)/threadsPerBlock.x + 1, Ny1/threadsPerBlock.y);
+  SimpleNestedLoopKernel<<<numBlocks, threadsPerBlock>>>(Nx, d_Ny);
+  //CudaCheckError(); // uncomment only for debugging
+
+  int ix0 = Nx;
+  while(ix0>ix1) {
+    CudaSafeCall(cudaMemcpy(&dim_y, &d_sorted_Ny_[ix0-1], sizeof(int),
+                            cudaMemcpyDeviceToHost));
+    dim_y -= Ny1;
+    if (dim_y<=0) break;
+    dim_x = (frame_area_ - 1) / dim_y + 1;
+    ix0 -= dim_x;
+    if (ix0<ix1) {
+      dim_x += ix0 - ix1;
+      ix0 = ix1;
+    }
+    dim3 numBlocks2((dim_x - 1)/threadsPerBlock.x + 1,
+                    (dim_y - 1)/threadsPerBlock.y + 1);
+    Smart2DNestedLoopKernel<<<numBlocks2, threadsPerBlock>>>
+      (ix0, Ny1, dim_x, dim_y, d_sorted_idx_, d_sorted_Ny_);
+    //CudaCheckError(); // uncomment only for debugging
+  }
+  cudaDeviceSynchronize();
+  CudaCheckError();
+
+  return 0;
+}
+//////////////////////////////////////////////////////////////////////
+#ifdef WITH_CUMUL_SUM
+int NestedLoop::CumulSumNestedLoop(int Nx, int *d_Ny)
+{
+  //TMP
+  //double time_mark=getRealTime();
+  //
+  prefix_scan_.Scan(d_Ny_cumul_sum_, d_Ny, Nx+1);
+  //TMP
+  //printf("pst: %lf\n", getRealTime()-time_mark);
+  //
+  int Ny_sum;
+  CudaSafeCall(cudaMemcpy(&Ny_sum, &d_Ny_cumul_sum_[Nx],
+                          sizeof(int), cudaMemcpyDeviceToHost));
+
+  //printf("CSNL: %d %d\n", Nx, Ny_sum);
+
+  //printf("Ny_sum %u\n", Ny_sum);
+  //temporary - remove
+  /*
+  if (Ny_sum==0) {
+    printf("Nx %d\n", Nx);
+    for (int i=0; i<Nx; i++) {
+      int Ny;
+      CudaSafeCall(cudaMemcpy(&Ny, &d_Ny[i], sizeof(int),
+                              cudaMemcpyDeviceToHost));
+      printf("i %d Ny %d\n", i, Ny);
+    }
+  }
+  */
+
+  if (Ny_sum>0) {
+    int grid_dim_x, grid_dim_y;
+    if (Ny_sum<65536*1024) { // max grid dim * max block dim
+      grid_dim_x = (Ny_sum+1023)/1024;
+      grid_dim_y = 1;
+    }
+    else {
+      grid_dim_x = 64; // I think it's not necessary to increase it
+      if (Ny_sum>grid_dim_x*1024*65535) {
+        printf("Ny sum %d larger than threshold %d\n",
+               Ny_sum, grid_dim_x*1024*65535);
+        exit(-1);
+      }
+      grid_dim_y = (Ny_sum + grid_dim_x*1024 -1) / (grid_dim_x*1024);
+    }
+    dim3 numBlocks(grid_dim_x, grid_dim_y);
+    //TMP
+    //double time_mark=getRealTime();
+    //
+    CumulSumNestedLoopKernel<<<numBlocks, 1024>>>(Nx, d_Ny_cumul_sum_,
+                                                  Ny_sum);
+
+    cudaDeviceSynchronize();
+    CudaCheckError();
+    //TMP
+    //printf("cst: %lf\n", getRealTime()-time_mark);
+    //
+  }
+
+  return 0;
+}
+#endif
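The nested_loop.cu.full file above is kept as a reference copy of the complete algorithm family (Simple, ParallelInner/Outer, Frame1D/2D, Smart1D/2D, and the cumulative-sum variant that the new nested_loop.cu retains). The frame-based variants share one idea: sort the inner-loop sizes, then peel rectangles of roughly constant area off the tall end of the sorted profile, so that every kernel launch covers a well-filled rectangle. A host-only sketch of that decomposition loop (not code from the patch; the data and frame_area value are illustrative):

#include <algorithm>
#include <cstdio>
#include <vector>

int main()
{
  std::vector<int> Ny = {1, 7, 2, 9, 3, 3, 5, 2};
  std::sort(Ny.begin(), Ny.end());          // ascending, as after SortPairs
  const int frame_area = 16;                // fixed frame area (illustrative)

  int ix0 = (int)Ny.size();                 // proceed from right to left
  while (ix0 > 0) {
    int dim_y = std::max(Ny[ix0 - 1], 1);   // tallest remaining column
    int dim_x = (frame_area - 1) / dim_y + 1;
    ix0 -= dim_x;
    if (ix0 < 0) { dim_x += ix0; ix0 = 0; } // clip the leftmost frame
    printf("frame: ix0=%d dim_x=%d dim_y=%d\n", ix0, dim_x, dim_y);
    // here the real code launches Frame*NestedLoopKernel on this rectangle
  }
  return 0;
}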
diff --git a/src/nested_loop.h b/src/nested_loop.h
new file mode 100644
index 000000000..14d398d35
--- /dev/null
+++ b/src/nested_loop.h
@@ -0,0 +1,41 @@
+/*
+ * This file is part of NESTGPU.
+ *
+ * Copyright (C) 2021 The NEST Initiative
+ *
+ * NESTGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * NESTGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with NESTGPU. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+
+
+
+
+#ifndef NESTEDLOOP_H
+#define NESTEDLOOP_H
+
+#include "prefix_scan.h"
+
+namespace NestedLoop
+{
+  extern PrefixScan prefix_scan_;
+
+  int Init();
+  int Run(int Nx, int *d_Ny, int i_func);
+  int CumulSumNestedLoop(int Nx, int *d_Ny, int i_func);
+
+  int Free();
+}
+
+#endif
diff --git a/src/nestgpu.cu b/src/nestgpu.cu
index 8124a912f..7a76ec722 100644
--- a/src/nestgpu.cu
+++ b/src/nestgpu.cu
@@ -44,6 +44,7 @@
 #include "getRealTime.h"
 #include "random.h"
 #include "nestgpu.h"
+#include "nested_loop.h"
 #include "dir_connect.h"
 #include "rev_spike.h"
 #include "spike_mpi.h"
@@ -135,7 +136,8 @@ NESTGPU::NESTGPU()
   connect_mpi_->net_connection_ = net_connection_;
   connect_mpi_->remote_spike_height_ = false;
 #endif
-
+
+  NestedLoop::Init();
   SetRandomSeed(54321ULL);
 
   SpikeBufferUpdate_time_ = 0;
@@ -549,9 +551,7 @@ int NESTGPU::SimulationStep()
     gpuErrchk( cudaDeviceSynchronize() );
     if (n_spikes > 0) {
       time_mark = getRealTime();
-      CollectSpikeKernel<<<n_spikes, 1024>>>(n_spikes, d_SpikeTargetNum);
-      gpuErrchk(cudaPeekAtLastError());
-
+      NestedLoop::Run(n_spikes, d_SpikeTargetNum, 0);
       NestedLoop_time_ += (getRealTime() - time_mark);
     }
     time_mark = getRealTime();
@@ -611,8 +611,7 @@ int NESTGPU::SimulationStep()
     gpuErrchk(cudaMemcpy(&n_rev_spikes, d_RevSpikeNum, sizeof(unsigned int),
                          cudaMemcpyDeviceToHost));
     if (n_rev_spikes > 0) {
-      SynapseUpdateKernel<<<n_rev_spikes, 1024>>>(n_rev_spikes, d_RevSpikeNConn);
-      gpuErrchk(cudaPeekAtLastError());
+      NestedLoop::Run(n_rev_spikes, d_RevSpikeNConn, 1);
     }
     //RevSpikeBufferUpdate_time_ += (getRealTime() - time_mark);
   }
diff --git a/src/rev_spike.cu b/src/rev_spike.cu
index 86ff99eee..70ad89e11 100644
--- a/src/rev_spike.cu
+++ b/src/rev_spike.cu
@@ -50,7 +50,7 @@ __device__ int *RevSpikeNConn;
 //////////////////////////////////////////////////////////////////////
 // This is the function called by the nested loop
 // that makes use of positive post-pre spike time difference
-__device__ void SynapseUpdateFunction(int i_spike, int i_target_rev_conn)
+__device__ void NestedLoopFunction1(int i_spike, int i_target_rev_conn)
 {
   unsigned int target = RevSpikeTarget[i_spike];
   unsigned int i_conn = TargetRevConnection[target][i_target_rev_conn];
@@ -65,18 +65,7 @@ __device__ void SynapseUpdateFunction(int i_spike, int i_target_rev_conn)
     }
   }
 }
-
-__global__ void SynapseUpdateKernel(int n_rev_spikes, int *RevSpikeNConn)
-{
-  const int i_spike = blockIdx.x;
-  if (i_spike<n_rev_spikes) {
-    const int n_conn = RevSpikeNConn[i_spike];
-    for (int i_conn=threadIdx.x; i_conn<n_conn; i_conn+=blockDim.x) {
-      SynapseUpdateFunction(i_spike, i_conn);
-    }
-  }
-}
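Taken together, the call sites in nestgpu.cu reduce to one pattern: pass the number of outer iterations, a device array holding the per-element inner counts, and the index of the device function to apply. A sketch of that usage, assuming only the nested_loop.h interface added by this patch (simulation_step_sketch and its parameter names are illustrative):

#include "nested_loop.h"

// d_SpikeTargetNum[i] : number of targets of spike i      (device array)
// d_RevSpikeNConn[i]  : number of reverse connections of i (device array)
void simulation_step_sketch(int n_spikes, int *d_SpikeTargetNum,
                            int n_rev_spikes, int *d_RevSpikeNConn)
{
  if (n_spikes > 0) {
    NestedLoop::Run(n_spikes, d_SpikeTargetNum, 0);    // NestedLoopFunction0
  }
  if (n_rev_spikes > 0) {
    NestedLoop::Run(n_rev_spikes, d_RevSpikeNConn, 1); // NestedLoopFunction1
  }
}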