diff --git a/CMake/hoomd/HOOMDCUDASetup.cmake b/CMake/hoomd/HOOMDCUDASetup.cmake index 73a5fe43b6..e7f5ed0196 100644 --- a/CMake/hoomd/HOOMDCUDASetup.cmake +++ b/CMake/hoomd/HOOMDCUDASetup.cmake @@ -53,9 +53,9 @@ if (ENABLE_CUDA) # setup nvcc to build for all CUDA architectures. Allow user to modify the list if desired if (CUDA_VERSION VERSION_GREATER 8.99) - set(CUDA_ARCH_LIST 30 35 50 60 70 CACHE STRING "List of target sm_ architectures to compile CUDA code for. Separate with semicolons.") + set(CUDA_ARCH_LIST 35 50 60 70 CACHE STRING "List of target sm_ architectures to compile CUDA code for. Separate with semicolons.") elseif (CUDA_VERSION VERSION_GREATER 7.99) - set(CUDA_ARCH_LIST 30 35 50 60 CACHE STRING "List of target sm_ architectures to compile CUDA code for. Separate with semicolons.") + set(CUDA_ARCH_LIST 35 50 60 CACHE STRING "List of target sm_ architectures to compile CUDA code for. Separate with semicolons.") endif() foreach(_cuda_arch ${CUDA_ARCH_LIST}) @@ -69,8 +69,8 @@ if (ENABLE_CUDA) list(GET _cuda_arch_list_sorted -1 _cuda_max_arch) add_definitions(-DCUDA_ARCH=${_cuda_min_arch}) - if (_cuda_min_arch LESS 30) - message(SEND_ERROR "HOOMD requires compute 3.0 or newer") + if (_cuda_min_arch LESS 35) + message(SEND_ERROR "HOOMD requires compute 3.5 or newer") endif () # only generate ptx code for the maximum supported CUDA_ARCH (saves on file size) diff --git a/hoomd/TextureTools.h b/hoomd/TextureTools.h index e82aeaf03a..bc35bf779e 100644 --- a/hoomd/TextureTools.h +++ b/hoomd/TextureTools.h @@ -10,161 +10,30 @@ /*! \file TextureTools.h \brief Utilities for working with textures - TextureTools.h exists to aid in defining Scalar textures which may be either float or double. It aims to simplify - code that reads from these textures so that the amount of conditional code is simplified to be entirely within - this header. + TextureTools.h previously existed to aid in defining Scalar textures which may be either float or double. - Planning for the future (__ldg), the fetch methods will also take in a pointer to the memory. That way, the initial - work done to convert the texture loads over to the single/double will also make it easy to change over to __ldg - in a single spot. + Now, it only provides a __ldg() overload for double4. */ #include "HOOMDMath.h" #ifdef NVCC -//! Fetch an unsigned int from texture memory. +//! Fetch a double4 value from texture memory. /*! This function should be called whenever a CUDA kernel wants to retrieve a - unsigned int value from texture memory. + double4 value from read only memory. - \param ptr Pointer to bound memory - \param tex_ref Texture in which the desired values are stored. - \param ii Index at which to look. + \param ptr Pointer to read */ -__device__ inline unsigned int texFetchUint(const unsigned int *ptr, texture tex_ref, unsigned int ii) +__device__ inline double4 __ldg(const double4 *ptr) { - #if __CUDA_ARCH__ >= 350 - return __ldg(ptr+ii); - #else - return tex1Dfetch(tex_ref, ii); - #endif - } - -#ifdef SINGLE_PRECISION - -typedef texture scalar_tex_t; -typedef texture scalar2_tex_t; -typedef texture scalar4_tex_t; - -//! Fetch a Scalar value from texture memory. -/*! This function should be called whenever a CUDA kernel wants to retrieve a - Scalar value from texture memory. - - \param ptr Pointer to bound memory - \param tex_ref Texture in which the desired values are stored. - \param ii Index at which to look. 
-*/ -__device__ inline Scalar texFetchScalar(const Scalar *ptr, texture tex_ref, unsigned int ii) - { - #if __CUDA_ARCH__ >= 350 - return __ldg(ptr+ii); - #else - return tex1Dfetch(tex_ref, ii); - #endif - } - -//! Fetch a Scalar2 value from texture memory. -/*! This function should be called whenever a CUDA kernel wants to retrieve a - Scalar2 value from texture memory. - - \param ptr Pointer to bound memory - \param tex_ref Texture in which the desired values are stored. - \param ii Index at which to look. -*/ -__device__ inline Scalar2 texFetchScalar2(const Scalar2 *ptr, texture tex_ref, unsigned int ii) - { - #if __CUDA_ARCH__ >= 350 - return __ldg(ptr+ii); - #else - return tex1Dfetch(tex_ref, ii); - #endif - } - -//! Fetch a Scalar4 value from texture memory. -/*! This function should called whenever a CUDA kernel wants to retrieve a - Scalar4 value from texture memory. - - \param ptr Pointer to bound memory - \param tex_ref Texture in which the desired values are stored. - \param ii Index at which to look. -*/ -__device__ inline Scalar4 texFetchScalar4(const Scalar4 *ptr, texture tex_ref, unsigned int ii) - { - #if __CUDA_ARCH__ >= 350 - return __ldg(ptr+ii); - #else - return tex1Dfetch(tex_ref, ii); - #endif - } - -#else -typedef texture scalar_tex_t; -typedef texture scalar2_tex_t; -typedef texture scalar4_tex_t; - -//! Fetch a Scalar value from texture memory. -/*! This function should be called whenever a CUDA kernel wants to retrieve a - Scalar value from texture memory. - - \param ptr Pointer to bound memory - \param tex_ref Texture in which the desired values are stored. - \param ii Index at which to look. -*/ -__device__ inline Scalar texFetchScalar(const Scalar *ptr, texture tex_ref, unsigned int ii) - { - #if __CUDA_ARCH__ >= 350 - return __ldg(ptr+ii); - #else - int2 val = tex1Dfetch(tex_ref, ii); - return Scalar(__hiloint2double(val.y, val.x)); - #endif - } - -//! Fetch a Scalar2 value from texture memory. -/*! This function should be called whenever a CUDA kernel wants to retrieve a - Scalar2 value from texture memory. - - \param ptr Pointer to bound memory - \param tex_ref Texture in which the desired values are stored. - \param ii Index at which to look. -*/ -__device__ inline Scalar2 texFetchScalar2(const Scalar2* ptr, texture tex_ref, unsigned int ii) - { - #if __CUDA_ARCH__ >= 350 - return __ldg(ptr+ii); - #else - int4 val = tex1Dfetch(tex_ref, ii); - return make_scalar2(__hiloint2double(val.y, val.x), - __hiloint2double(val.w, val.z)); - #endif - } - -//! Fetch a Scalar4 value from texture memory. -/*! This function should be called whenever a CUDA kernel wants to retrieve a - Scalar4 value from texture memory. - - \param ptr Pointer to bound memory - \param tex_ref Texture in which the desired values are stored. - \param ii Index at which to look. 
-*/ -__device__ inline Scalar4 texFetchScalar4(const Scalar4 *ptr, texture tex_ref, unsigned int ii) - { - unsigned int idx = 2*ii; - #if __CUDA_ARCH__ >= 350 - int4 part1 = __ldg(((int4 *)ptr)+idx);; - int4 part2 = __ldg(((int4 *)ptr)+idx+1);; - #else - int4 part1 = tex1Dfetch(tex_ref, idx); - int4 part2 = tex1Dfetch(tex_ref, idx+1); - #endif - return make_scalar4(__hiloint2double(part1.y, part1.x), + int4 part1 = __ldg(((int4 *)ptr));; + int4 part2 = __ldg(((int4 *)ptr)+1);; + return make_double4(__hiloint2double(part1.y, part1.x), __hiloint2double(part1.w, part1.z), __hiloint2double(part2.y, part2.x), __hiloint2double(part2.w, part2.z)); } #endif -#endif - - #endif // __HOOMD_MATH_H__ diff --git a/hoomd/cgcmm/CGCMMAngleForceComputeGPU.cc b/hoomd/cgcmm/CGCMMAngleForceComputeGPU.cc index bbda521137..03e49c01a5 100644 --- a/hoomd/cgcmm/CGCMMAngleForceComputeGPU.cc +++ b/hoomd/cgcmm/CGCMMAngleForceComputeGPU.cc @@ -133,8 +133,7 @@ void CGCMMAngleForceComputeGPU::computeForces(unsigned int timestep) d_CGCMMsr.data, d_CGCMMepow.data, m_CGCMMAngle_data->getNTypes(), - m_tuner->getParam(), - m_exec_conf->getComputeCapability()); + m_tuner->getParam()); if(m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR(); diff --git a/hoomd/cgcmm/CGCMMAngleForceGPU.cu b/hoomd/cgcmm/CGCMMAngleForceGPU.cu index 5d4740580d..1842267da5 100644 --- a/hoomd/cgcmm/CGCMMAngleForceGPU.cu +++ b/hoomd/cgcmm/CGCMMAngleForceGPU.cu @@ -16,15 +16,6 @@ \brief Defines GPU kernel code for calculating the CGCMM angle forces. Used by CGCMMAngleForceComputeGPU. */ -//! Texture for reading angle parameters -scalar2_tex_t angle_params_tex; - -//! Texture for reading angle CGCMM S-R parameters -scalar2_tex_t angle_CGCMMsr_tex; // MISSING EPSILON!!! sigma=.x, rcut=.y - -//! Texture for reading angle CGCMM Epsilon-pow/pref parameters -scalar4_tex_t angle_CGCMMepow_tex; // now with EPSILON=.x, pow1=.y, pow2=.z, pref=.w - //! Kernel for calculating CGCMM angle forces on the GPU /*! 
\param d_force Device memory to write computed forces \param d_virial Device memory to write computed virials @@ -124,7 +115,7 @@ extern "C" __global__ void gpu_compute_CGCMM_angle_forces_kernel(Scalar4* d_forc dac = box.minImage(dac); // get the angle parameters (MEM TRANSFER: 8 bytes) - Scalar2 params = texFetchScalar2(d_params, angle_params_tex, cur_angle_type); + Scalar2 params = __ldg(d_params + cur_angle_type); Scalar K = params.x; Scalar t_0 = params.y; @@ -154,14 +145,14 @@ extern "C" __global__ void gpu_compute_CGCMM_angle_forces_kernel(Scalar4* d_forc vac[i] = Scalar(0.0); // get the angle E-S-R parameters (MEM TRANSFER: 12 bytes) - const Scalar2 cgSR = texFetchScalar2(d_CGCMMsr, angle_CGCMMsr_tex, cur_angle_type); + const Scalar2 cgSR = __ldg(d_CGCMMsr + cur_angle_type); Scalar cgsigma = cgSR.x; Scalar cgrcut = cgSR.y; if (rac < cgrcut) { - const Scalar4 cgEPOW = texFetchScalar4(d_CGCMMepow, angle_CGCMMepow_tex, cur_angle_type); + const Scalar4 cgEPOW = __ldg(d_CGCMMepow + cur_angle_type); // get the angle pow/pref parameters (MEM TRANSFER: 12 bytes) Scalar cgeps = cgEPOW.x; @@ -282,8 +273,7 @@ cudaError_t gpu_compute_CGCMM_angle_forces(Scalar4* d_force, Scalar2 *d_CGCMMsr, Scalar4 *d_CGCMMepow, unsigned int n_angle_types, - int block_size, - const unsigned int compute_capability) + int block_size) { assert(d_params); assert(d_CGCMMsr); @@ -306,22 +296,6 @@ cudaError_t gpu_compute_CGCMM_angle_forces(Scalar4* d_force, dim3 grid( (int)ceil((double)N / (double)run_block_size), 1, 1); dim3 threads(run_block_size, 1, 1); - // bind the textures on pre sm 35 arches - if (compute_capability < 350) - { - cudaError_t error = cudaBindTexture(0, angle_params_tex, d_params, sizeof(Scalar2) * n_angle_types); - if (error != cudaSuccess) - return error; - - error = cudaBindTexture(0, angle_CGCMMsr_tex, d_CGCMMsr, sizeof(Scalar2) * n_angle_types); - if (error != cudaSuccess) - return error; - - error = cudaBindTexture(0, angle_CGCMMepow_tex, d_CGCMMepow, sizeof(Scalar4) * n_angle_types); - if (error != cudaSuccess) - return error; - } - // run the kernel gpu_compute_CGCMM_angle_forces_kernel<<< grid, threads>>>(d_force, d_virial, diff --git a/hoomd/cgcmm/CGCMMAngleForceGPU.cuh b/hoomd/cgcmm/CGCMMAngleForceGPU.cuh index f1a8f2942a..e2bdc874d3 100644 --- a/hoomd/cgcmm/CGCMMAngleForceGPU.cuh +++ b/hoomd/cgcmm/CGCMMAngleForceGPU.cuh @@ -30,7 +30,6 @@ cudaError_t gpu_compute_CGCMM_angle_forces(Scalar4* d_force, Scalar2 *d_CGCMMsr, Scalar4 *d_CGCMMepow, unsigned int n_angle_types, - int block_size, - const unsigned int compute_capability); + int block_size); #endif diff --git a/hoomd/cgcmm/CGCMMForceComputeGPU.cc b/hoomd/cgcmm/CGCMMForceComputeGPU.cc index bde66b2037..d039b2c356 100644 --- a/hoomd/cgcmm/CGCMMForceComputeGPU.cc +++ b/hoomd/cgcmm/CGCMMForceComputeGPU.cc @@ -165,9 +165,7 @@ void CGCMMForceComputeGPU::computeForces(unsigned int timestep) this->m_nlist->getNListArray().getPitch(), m_pdata->getNTypes(), m_r_cut * m_r_cut, - m_block_size, - m_exec_conf->getComputeCapability()/10, - m_exec_conf->dev_prop.maxTexture1DLinear); + m_block_size); if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR(); diff --git a/hoomd/cgcmm/CGCMMForceGPU.cu b/hoomd/cgcmm/CGCMMForceGPU.cu index 16714d2120..066f17022c 100644 --- a/hoomd/cgcmm/CGCMMForceGPU.cu +++ b/hoomd/cgcmm/CGCMMForceGPU.cu @@ -13,11 +13,6 @@ \brief Defines GPU kernel code for calculating the Lennard-Jones pair forces. Used by CGCMMForceComputeGPU. */ -//! Texture for reading particle positions -scalar4_tex_t pdata_pos_tex; -//! 
Texture for reading the neighbor list -texture nlist_tex; - //! Kernel for calculating CG-CMM Lennard-Jones forces /*! This kernel is called to calculate the Lennard-Jones forces on all N particles for the CG-CMM model potential. @@ -45,7 +40,6 @@ texture nlist_tex; Each thread will calculate the total force on one particle. The neighborlist is arranged in columns so that reads are fully coalesced when doing this. */ -template __global__ void gpu_compute_cgcmm_forces_kernel(Scalar4* d_force, Scalar* d_virial, const unsigned int virial_pitch, @@ -80,7 +74,7 @@ __global__ void gpu_compute_cgcmm_forces_kernel(Scalar4* d_force, // read in the position of our particle. // (MEM TRANSFER: 16 bytes) - Scalar4 postype = texFetchScalar4(d_pos, pdata_pos_tex, idx); + Scalar4 postype = __ldg(d_pos + idx); Scalar3 pos = make_scalar3(postype.x, postype.y, postype.z); // initialize the force to 0 @@ -92,14 +86,7 @@ __global__ void gpu_compute_cgcmm_forces_kernel(Scalar4* d_force, // prefetch neighbor index unsigned int cur_neigh = 0; unsigned int next_neigh(0); - if (use_gmem_nlist) - { - next_neigh = d_nlist[head_idx]; - } - else - { - next_neigh = texFetchUint(d_nlist, nlist_tex, head_idx); - } + next_neigh = __ldg(d_nlist + head_idx); // loop over neighbors for (int neigh_idx = 0; neigh_idx < n_neigh; neigh_idx++) @@ -107,17 +94,10 @@ __global__ void gpu_compute_cgcmm_forces_kernel(Scalar4* d_force, // read the current neighbor index (MEM TRANSFER: 4 bytes) // prefetch the next value and set the current one cur_neigh = next_neigh; - if (use_gmem_nlist) - { - next_neigh = d_nlist[head_idx + neigh_idx + 1]; - } - else - { - next_neigh = texFetchUint(d_nlist, nlist_tex, head_idx + neigh_idx+1); - } + next_neigh = __ldg(d_nlist + head_idx + neigh_idx+1); // get the neighbor's position (MEM TRANSFER: 16 bytes) - Scalar4 neigh_postype = texFetchScalar4(d_pos, pdata_pos_tex, cur_neigh); + Scalar4 neigh_postype = __ldg(d_pos + cur_neigh); Scalar3 neigh_pos = make_scalar3(neigh_postype.x, neigh_postype.y, neigh_postype.z); // calculate dr (with periodic boundary conditions) @@ -212,9 +192,7 @@ cudaError_t gpu_compute_cgcmm_forces(Scalar4* d_force, const unsigned int size_nlist, const unsigned int coeff_width, const Scalar r_cutsq, - const unsigned int block_size, - const unsigned int compute_capability, - const unsigned int max_tex1d_width) + const unsigned int block_size) { assert(d_coeffs); assert(coeff_width > 0); @@ -223,56 +201,18 @@ cudaError_t gpu_compute_cgcmm_forces(Scalar4* d_force, dim3 grid( (int)ceil((double)N / (double)block_size), 1, 1); dim3 threads(block_size, 1, 1); - // bind the texture - if (compute_capability < 35) - { - pdata_pos_tex.normalized = false; - pdata_pos_tex.filterMode = cudaFilterModePoint; - cudaError_t error = cudaBindTexture(0, pdata_pos_tex, d_pos, sizeof(Scalar4)*N); - if (error != cudaSuccess) - return error; - - if (size_nlist <= max_tex1d_width) - { - nlist_tex.normalized = false; - nlist_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, nlist_tex, d_nlist, sizeof(unsigned int)*size_nlist); - if (error != cudaSuccess) - return error; - } - } - - // run the kernel - if (compute_capability < 35 && size_nlist > max_tex1d_width) - { // fall back to slow global loads when the neighbor list is too big for texture memory - gpu_compute_cgcmm_forces_kernel<1><<< grid, threads, sizeof(Scalar4)*coeff_width*coeff_width >>>(d_force, - d_virial, - virial_pitch, - N, - d_pos, - box, - d_n_neigh, - d_nlist, - d_head_list, - d_coeffs, - coeff_width, - r_cutsq); - } - 
else - { - gpu_compute_cgcmm_forces_kernel<0><<< grid, threads, sizeof(Scalar4)*coeff_width*coeff_width >>>(d_force, - d_virial, - virial_pitch, - N, - d_pos, - box, - d_n_neigh, - d_nlist, - d_head_list, - d_coeffs, - coeff_width, - r_cutsq); - } + gpu_compute_cgcmm_forces_kernel<<< grid, threads, sizeof(Scalar4)*coeff_width*coeff_width >>>(d_force, + d_virial, + virial_pitch, + N, + d_pos, + box, + d_n_neigh, + d_nlist, + d_head_list, + d_coeffs, + coeff_width, + r_cutsq); return cudaSuccess; } diff --git a/hoomd/cgcmm/CGCMMForceGPU.cuh b/hoomd/cgcmm/CGCMMForceGPU.cuh index eac20d6ef7..cd48bbd7e3 100644 --- a/hoomd/cgcmm/CGCMMForceGPU.cuh +++ b/hoomd/cgcmm/CGCMMForceGPU.cuh @@ -29,8 +29,6 @@ cudaError_t gpu_compute_cgcmm_forces(Scalar4* d_force, const unsigned int size_nlist, const unsigned int coeff_width, const Scalar r_cutsq, - const unsigned int block_size, - const unsigned int compute_capability, - const unsigned int max_tex1d_width); + const unsigned int block_size); #endif diff --git a/hoomd/dem/DEM2DForceGPU.cu b/hoomd/dem/DEM2DForceGPU.cu index c455c503f4..ab43f2d79b 100644 --- a/hoomd/dem/DEM2DForceGPU.cu +++ b/hoomd/dem/DEM2DForceGPU.cu @@ -26,12 +26,6 @@ \brief Defines GPU kernel code for calculating conservative DEM pair forces. Used by DEM2DForceComputeGPU. */ -//! Texture for reading particle positions -scalar4_tex_t pdata_pos_tex; -scalar4_tex_t pdata_quat_tex; -scalar_tex_t pdata_diam_tex; -scalar4_tex_t pdata_velocity_tex; - //! Kernel for calculating 2D DEM forces /*! This kernel is called to calculate the DEM forces for all N particles. @@ -171,22 +165,22 @@ __global__ void gpu_compute_dem2d_forces_kernel(const Scalar4 *d_pos, const unsigned int myHead(d_head_list[partIdx]); // fetch position and orientation of this particle - const Scalar4 postype(texFetchScalar4(d_pos,pdata_pos_tex, partIdx)); + const Scalar4 postype(__ldg(d_pos + partIdx)); const vec3 pos_i(postype.x, postype.y, 0); const unsigned int type_i(__scalar_as_int(postype.w)); - const Scalar4 quati(texFetchScalar4(d_quat, pdata_quat_tex, partIdx)); + const Scalar4 quati(__ldg(d_quat + partIdx)); const quat quat_i(quati.x, vec3(quati.y, quati.z, quati.w)); Scalar di = 0.0f; if (Evaluator::needsDiameter()) - di = texFetchScalar(d_diam, pdata_diam_tex, partIdx); + di = __ldg(d_diam + partIdx); else di += 1.0f; //shut up compiler warning. 
Vestigial from HOOMD vec3 vi; if (Evaluator::needsVelocity()) vi = vec3( - texFetchScalar4(d_velocity, pdata_velocity_tex, partIdx)); + __ldg(d_velocity + partIdx)); for(unsigned int featureEpoch(0); featureEpoch < (numShapeVerts[type_i] + blockDim.x - 1)/blockDim.x; ++featureEpoch) @@ -214,7 +208,7 @@ __global__ void gpu_compute_dem2d_forces_kernel(const Scalar4 *d_pos, next_neigh = d_nlist[myHead + neigh_idx + 1]; // grab the position and type of the neighbor - const Scalar4 neigh_postype(texFetchScalar4(d_pos, pdata_pos_tex, cur_neigh)); + const Scalar4 neigh_postype(__ldg(d_pos + cur_neigh)); const unsigned int neigh_type(__scalar_as_int(neigh_postype.w)); const vec3 neigh_pos(neigh_postype.x, neigh_postype.y, 0); @@ -230,20 +224,20 @@ __global__ void gpu_compute_dem2d_forces_kernel(const Scalar4 *d_pos, if (Evaluator::needsDiameter()) { Scalar dj(0); - dj = texFetchScalar(d_diam, pdata_diam_tex, cur_neigh); + dj = __ldg(d_diam + cur_neigh); evaluator.setDiameter(di, dj); } if(evaluator.withinCutoff(rsq,r_cutsq)) { // fetch neighbor's orientation - const Scalar4 neighQuatF(texFetchScalar4(d_quat, pdata_quat_tex, cur_neigh)); + const Scalar4 neighQuatF(__ldg(d_quat + cur_neigh)); const quat neighQuat( neighQuatF.x, vec3(neighQuatF.y, neighQuatF.z, neighQuatF.w)); if (Evaluator::needsVelocity()) { - Scalar4 vj(texFetchScalar4(d_velocity, pdata_velocity_tex, cur_neigh)); + Scalar4 vj(__ldg(d_velocity + cur_neigh)); evaluator.setVelocity(vi - vec3(vj)); } @@ -417,28 +411,6 @@ cudaError_t gpu_compute_dem2d_forces(Scalar4* d_force, dim3 threads(numFeatures, particlesPerBlock, 2); - // bind the textures for position and orientation - pdata_pos_tex.normalized = false; - pdata_pos_tex.filterMode = cudaFilterModePoint; - cudaError_t error = cudaBindTexture(0, pdata_pos_tex, d_pos, sizeof(Scalar4)*(N+n_ghosts)); - if (error != cudaSuccess) - return error; - pdata_quat_tex.normalized = false; - pdata_quat_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, pdata_quat_tex, d_quat, sizeof(Scalar4)*(N+n_ghosts)); - if (error != cudaSuccess) - return error; - pdata_diam_tex.normalized = false; - pdata_diam_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, pdata_diam_tex, d_diam, sizeof(Scalar)*(N+n_ghosts)); - if (error != cudaSuccess) - return error; - pdata_velocity_tex.normalized = false; - pdata_velocity_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, pdata_velocity_tex, d_velocity, sizeof(Scalar4)*(N+n_ghosts)); - if (error != cudaSuccess) - return error; - // Calculate the amount of shared memory required size_t shmSize(vertexCount*sizeof(Real2) + n_shapes*2*sizeof(unsigned int) + particlesPerBlock*(sizeof(Real4) + 6*sizeof(Real))); diff --git a/hoomd/dem/DEM3DForceGPU.cu b/hoomd/dem/DEM3DForceGPU.cu index f905e6d0f4..2d77fa95af 100644 --- a/hoomd/dem/DEM3DForceGPU.cu +++ b/hoomd/dem/DEM3DForceGPU.cu @@ -27,12 +27,6 @@ \brief Defines GPU kernel code for calculating conservative DEM pair forces. Used by DEM3DForceComputeGPU. */ -//! Texture for reading particle positions -scalar4_tex_t pdata_pos_tex; -scalar4_tex_t pdata_quat_tex; -scalar_tex_t pdata_diam_tex; -scalar4_tex_t pdata_velocity_tex; - //! Kernel for calculating 3D DEM forces /*! This kernel is called to calculate the DEM forces for all N particles. 
@@ -234,22 +228,22 @@ __global__ void gpu_compute_dem3d_forces_kernel( const unsigned int myHead(d_head_list[partIdx]); // fetch position and orientation of this particle - const Scalar4 postype(texFetchScalar4(d_pos,pdata_pos_tex, partIdx)); + const Scalar4 postype(__ldg(d_pos + partIdx)); const vec3 pos_i(postype.x, postype.y, postype.z); const unsigned int type_i(__scalar_as_int(postype.w)); - const Scalar4 quati(texFetchScalar4(d_quat, pdata_quat_tex, partIdx)); + const Scalar4 quati(__ldg(d_quat + partIdx)); const quat quat_i(quati.x, vec3(quati.y, quati.z, quati.w)); Scalar di = 0.0f; if (Evaluator::needsDiameter()) - di = texFetchScalar(d_diam, pdata_diam_tex, partIdx); + di = __ldg(d_diam + partIdx); else di += 1.0f; //shut up compiler warning. Vestigial from HOOMD vec3 vi; if (Evaluator::needsVelocity()) vi = vec3( - texFetchScalar4(d_velocity, pdata_velocity_tex, partIdx)); + __ldg(d_velocity + partIdx)); for(unsigned int featureEpoch(0); featureEpoch < (maxFeatures + blockDim.y - 1)/blockDim.y; ++featureEpoch) @@ -283,7 +277,7 @@ __global__ void gpu_compute_dem3d_forces_kernel( next_neigh = d_nlist[myHead + neigh_idx + 1]; // grab the position and type of the neighbor - const Scalar4 neigh_postype(texFetchScalar4(d_pos, pdata_pos_tex, cur_neigh)); + const Scalar4 neigh_postype(__ldg(d_pos + cur_neigh)); const unsigned int type_j(__scalar_as_int(neigh_postype.w)); const vec3 neigh_pos(neigh_postype.x, neigh_postype.y, neigh_postype.z); @@ -299,7 +293,7 @@ __global__ void gpu_compute_dem3d_forces_kernel( Scalar dj(0); if (Evaluator::needsDiameter()) { - dj = texFetchScalar(d_diam, pdata_diam_tex, cur_neigh); + dj = __ldg(d_diam + cur_neigh); evaluator.setDiameter(di, dj); } else @@ -308,13 +302,13 @@ __global__ void gpu_compute_dem3d_forces_kernel( if(evaluator.withinCutoff(rsq, r_cutsq)) { // fetch neighbor's orientation - const Scalar4 neighQuatF(texFetchScalar4(d_quat, pdata_quat_tex, cur_neigh)); + const Scalar4 neighQuatF(__ldg(d_quat + cur_neigh)); const quat neighQuat( neighQuatF.x, vec3(neighQuatF.y, neighQuatF.z, neighQuatF.w)); if (Evaluator::needsVelocity()) { - Scalar4 vj(texFetchScalar4(d_velocity, pdata_velocity_tex, cur_neigh)); + Scalar4 vj(__ldg(d_velocity + cur_neigh)); evaluator.setVelocity(vi - vec3(vj)); } @@ -515,28 +509,6 @@ cudaError_t gpu_compute_dem3d_forces( dim3 threads(particlesPerBlock, numFeatures, 1); - // bind the textures for position and orientation - pdata_pos_tex.normalized = false; - pdata_pos_tex.filterMode = cudaFilterModePoint; - cudaError_t error = cudaBindTexture(0, pdata_pos_tex, d_pos, sizeof(Scalar4)*(N+n_ghosts)); - if (error != cudaSuccess) - return error; - pdata_quat_tex.normalized = false; - pdata_quat_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, pdata_quat_tex, d_quat, sizeof(Scalar4)*(N+n_ghosts)); - if (error != cudaSuccess) - return error; - pdata_diam_tex.normalized = false; - pdata_diam_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, pdata_diam_tex, d_diam, sizeof(Scalar)*(N+n_ghosts)); - if (error != cudaSuccess) - return error; - pdata_velocity_tex.normalized = false; - pdata_velocity_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, pdata_velocity_tex, d_velocity, sizeof(Scalar4)*(N+n_ghosts)); - if (error != cudaSuccess) - return error; - // Calculate the amount of shared memory required const size_t shmSize(2*particlesPerBlock*sizeof(Real4) + 6*particlesPerBlock*sizeof(Real) + // forces, torques, virials per-particle 2*numFaces*sizeof(unsigned int) + // 
face->next face, face->first vertex in face diff --git a/hoomd/hpmc/ComputeFreeVolumeGPU.cuh b/hoomd/hpmc/ComputeFreeVolumeGPU.cuh index db7a6126bc..0be95ecb6a 100644 --- a/hoomd/hpmc/ComputeFreeVolumeGPU.cuh +++ b/hoomd/hpmc/ComputeFreeVolumeGPU.cuh @@ -136,10 +136,6 @@ template< class Shape > cudaError_t gpu_hpmc_free_volume(const hpmc_free_volume_args_t &args, const typename Shape::param_type *d_params); #ifdef NVCC -//! Texture for reading postype -scalar4_tex_t free_volume_postype_tex; -//! Texture for reading orientation -scalar4_tex_t free_volume_orientation_tex; //! Compute the cell that a particle sits in __device__ inline unsigned int compute_cell_idx(const Scalar3 p, @@ -329,18 +325,14 @@ __global__ void gpu_hpmc_free_volume_kernel(unsigned int n_sample, if (local_k < excell_size) { // read in position, and orientation of neighboring particle - #if ( __CUDA_ARCH__ > 300) unsigned int j = __ldg(&d_excell_idx[excli(local_k, my_cell)]); - #else - unsigned int j = d_excell_idx[excli(local_k, my_cell)]; - #endif - Scalar4 postype_j = texFetchScalar4(d_postype, free_volume_postype_tex, j); + Scalar4 postype_j = __ldg(d_postype + j); Scalar4 orientation_j = make_scalar4(1,0,0,0); unsigned int typ_j = __scalar_as_int(postype_j.w); Shape shape_j(quat(orientation_j), s_params[typ_j]); if (shape_j.hasOrientation()) - shape_j.orientation = quat(texFetchScalar4(d_orientation, free_volume_orientation_tex, j)); + shape_j.orientation = quat(__ldg(d_orientation + j)); // put particle j into the coordinate system of particle i vec3 r_ij = vec3(postype_j) - pos_i; @@ -406,20 +398,6 @@ cudaError_t gpu_hpmc_free_volume(const hpmc_free_volume_args_t& args, const type assert(args.group_size <= 32); // note, really should be warp size of the device assert(args.block_size%(args.stride*args.group_size)==0); - - // bind the textures - free_volume_postype_tex.normalized = false; - free_volume_postype_tex.filterMode = cudaFilterModePoint; - cudaError_t error = cudaBindTexture(0, free_volume_postype_tex, args.d_postype, sizeof(Scalar4)*args.max_n); - if (error != cudaSuccess) - return error; - - free_volume_orientation_tex.normalized = false; - free_volume_orientation_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, free_volume_orientation_tex, args.d_orientation, sizeof(Scalar4)*args.max_n); - if (error != cudaSuccess) - return error; - // reset counters cudaMemsetAsync(args.d_n_overlap_all,0, sizeof(unsigned int), args.stream); diff --git a/hoomd/hpmc/IntegratorHPMCMonoGPU.cu b/hoomd/hpmc/IntegratorHPMCMonoGPU.cu index 61baf204d5..132876b31a 100644 --- a/hoomd/hpmc/IntegratorHPMCMonoGPU.cu +++ b/hoomd/hpmc/IntegratorHPMCMonoGPU.cu @@ -59,7 +59,7 @@ __global__ void gpu_hpmc_excell_kernel(unsigned int *d_excell_idx, for (unsigned int k = 0; k < neigh_cell_size; k++) { // read in the index of the new particle to add to our cell - unsigned int new_idx = tex1Dfetch(cell_idx_tex, cli(k, neigh_cell)); + unsigned int new_idx = __ldg(d_cell_idx + cli(k, neigh_cell)); d_excell_idx[excli(my_cell_size, my_cell)] = new_idx; my_cell_size++; } @@ -100,13 +100,6 @@ cudaError_t gpu_hpmc_excell(unsigned int *d_excell_idx, dim3 threads(min(block_size, (unsigned int)max_block_size), 1, 1); dim3 grid(ci.getNumElements() / block_size + 1, 1, 1); - // bind the textures - cell_idx_tex.normalized = false; - cell_idx_tex.filterMode = cudaFilterModePoint; - cudaError_t error = cudaBindTexture(0, cell_idx_tex, d_cell_idx, sizeof(unsigned int)*cli.getNumElements()); - if (error != cudaSuccess) - return error; - 
gpu_hpmc_excell_kernel<<>>(d_excell_idx, d_excell_size, excli, diff --git a/hoomd/hpmc/IntegratorHPMCMonoGPU.cuh b/hoomd/hpmc/IntegratorHPMCMonoGPU.cuh index 8762ec109b..d6bc445182 100644 --- a/hoomd/hpmc/IntegratorHPMCMonoGPU.cuh +++ b/hoomd/hpmc/IntegratorHPMCMonoGPU.cuh @@ -184,13 +184,6 @@ cudaError_t gpu_hpmc_shift(Scalar4 *d_postype, * Definition of function templates and templated GPU kernels */ -//! Texture for reading postype -scalar4_tex_t postype_tex; -//! Texture for reading orientation -scalar4_tex_t orientation_tex; -//! Texture for reading cell index data -texture cell_idx_tex; - //! Device function to compute the cell that a particle sits in __device__ inline unsigned int computeParticleCell(const Scalar3& p, const BoxDim& box, @@ -279,9 +272,6 @@ __device__ inline unsigned int computeParticleCell(const Scalar3& p, - threadIdx.x is the offset within the current group - blockIdx.x runs enough blocks so that all active cells are covered - **Possible enhancements** - - Use __ldg and not tex1Dfetch on sm35 - \ingroup hpmc_kernels */ template< class Shape > @@ -471,17 +461,17 @@ __global__ void gpu_hpmc_mpmc_kernel(Scalar4 *d_postype, // select one of the particles randomly from the cell unsigned int my_cell_offset = hoomd::UniformIntDistribution(my_cell_size-1)(rng); - i = tex1Dfetch(cell_idx_tex, cli(my_cell_offset, my_cell)); + i = __ldg(d_cell_idx + cli(my_cell_offset, my_cell)); // read in the position and orientation of our particle. - Scalar4 postype_i = texFetchScalar4(d_postype, postype_tex, i); + Scalar4 postype_i = __ldg(d_postype + i); Scalar4 orientation_i = make_scalar4(1,0,0,0); unsigned int typ_i = __scalar_as_int(postype_i.w); Shape shape_i(quat(orientation_i), s_params[typ_i]); if (shape_i.hasOrientation()) - orientation_i = texFetchScalar4(d_orientation, orientation_tex, i); + orientation_i = __ldg(d_orientation + i); shape_i.orientation = quat(orientation_i); @@ -549,11 +539,7 @@ __global__ void gpu_hpmc_mpmc_kernel(Scalar4 *d_postype, unsigned int j, next_j = 0; if (k < excell_size) { - #if (__CUDA_ARCH__ > 300) next_j = __ldg(&d_excell_idx[excli(k, my_cell)]); - #else - next_j = d_excell_idx[excli(k, my_cell)]; - #endif } // add to the queue as long as the queue is not full, and we have not yet reached the end of our own list @@ -577,15 +563,11 @@ __global__ void gpu_hpmc_mpmc_kernel(Scalar4 *d_postype, if (k < excell_size) { - #if (__CUDA_ARCH__ > 300) next_j = __ldg(&d_excell_idx[excli(k, my_cell)]); - #else - next_j = d_excell_idx[excli(k, my_cell)]; - #endif } // read in position, and orientation of neighboring particle - postype_j = texFetchScalar4(d_postype, postype_tex, j); + postype_j = __ldg(d_postype + j); Shape shape_j(quat(orientation_j), s_params[__scalar_as_int(postype_j.w)]); // put particle j into the coordinate system of particle i @@ -645,12 +627,12 @@ __global__ void gpu_hpmc_mpmc_kernel(Scalar4 *d_postype, Shape shape_i(quat(s_orientation_group[check_group]), s_params[type_i]); // build shape j from global memory - postype_j = texFetchScalar4(d_postype, postype_tex, check_j); + postype_j = __ldg(d_postype + check_j); orientation_j = make_scalar4(1,0,0,0); unsigned int type_j = __scalar_as_int(postype_j.w); Shape shape_j(quat(orientation_j), s_params[type_j]); if (shape_j.hasOrientation()) - shape_j.orientation = quat(texFetchScalar4(d_orientation, orientation_tex, check_j)); + shape_j.orientation = quat(__ldg(d_orientation + check_j)); // put particle j into the coordinate system of particle i r_ij = vec3(postype_j) - vec3(pos_i); @@ 
-881,28 +863,6 @@ cudaError_t gpu_hpmc_update(const hpmc_args_t& args, const typename Shape::param dim3 grid( args.n_active_cells / n_groups + 1, 1, 1); - // bind the textures - postype_tex.normalized = false; - postype_tex.filterMode = cudaFilterModePoint; - cudaError_t error = cudaBindTexture(0, postype_tex, args.d_postype, sizeof(Scalar4)*args.max_n); - if (error != cudaSuccess) - return error; - - if (args.has_orientation) - { - orientation_tex.normalized = false; - orientation_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, orientation_tex, args.d_orientation, sizeof(Scalar4)*args.max_n); - if (error != cudaSuccess) - return error; - } - - cell_idx_tex.normalized = false; - cell_idx_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, cell_idx_tex, args.d_cell_idx, sizeof(Scalar4)*args.cli.getNumElements()); - if (error != cudaSuccess) - return error; - gpu_hpmc_mpmc_kernel<<>>(args.d_postype, args.d_orientation, args.d_counters, diff --git a/hoomd/hpmc/IntegratorHPMCMonoImplicitGPU.cuh b/hoomd/hpmc/IntegratorHPMCMonoImplicitGPU.cuh index 77c492de43..9deb61359e 100644 --- a/hoomd/hpmc/IntegratorHPMCMonoImplicitGPU.cuh +++ b/hoomd/hpmc/IntegratorHPMCMonoImplicitGPU.cuh @@ -231,18 +231,6 @@ cudaError_t gpu_hpmc_implicit_accept_reject(const hpmc_implicit_args_t &args, co * Definition of function templates and templated GPU kernels */ -//! Texture for reading postype -scalar4_tex_t implicit_postype_tex; -//! Texture for reading orientation -scalar4_tex_t implicit_orientation_tex; -//! Texture for reading postype -scalar4_tex_t implicit_postype_old_tex; -//! Texture for reading orientation -scalar4_tex_t implicit_orientation_old_tex; - -//! Texture for reading cell index data -texture implicit_cell_idx_tex; - //! HPMC implicit count overlaps kernel /*! 
\param d_postype Particle positions and types by index \param d_orientation Particle orientation @@ -425,7 +413,7 @@ __global__ void gpu_hpmc_implicit_count_overlaps_kernel(Scalar4 *d_postype, if (active) { // load updated particle position - postype_i = texFetchScalar4(d_postype, implicit_postype_tex, idx_i); + postype_i = __ldg(d_postype + idx_i); type_i = __scalar_as_int(postype_i.w); pos_i = vec3(postype_i); } @@ -458,7 +446,7 @@ __global__ void gpu_hpmc_implicit_count_overlaps_kernel(Scalar4 *d_postype, Shape shape_i(quat(orientation_i), s_params[__scalar_as_int(postype_i.w)]); if (shape_i.hasOrientation()) { - orientation_i = texFetchScalar4(d_orientation, implicit_orientation_tex, idx_i); + orientation_i = __ldg(d_orientation + idx_i); shape_i.orientation = quat(orientation_i); } @@ -528,18 +516,14 @@ __global__ void gpu_hpmc_implicit_count_overlaps_kernel(Scalar4 *d_postype, Scalar4 postype_j; do { // read in position, and orientation of neighboring particle - #if (__CUDA_ARCH__ > 300) j = __ldg(&d_excell_idx[excli(local_k, my_cell)]); - #else - j = d_excell_idx[excli(local_k, my_cell)]; - #endif // check against neighbor - postype_j = texFetchScalar4(d_postype_old, implicit_postype_old_tex, j); + postype_j = __ldg(d_postype_old + j); Shape shape_j(quat(), s_params[__scalar_as_int(postype_j.w)]); if (shape_j.hasOrientation()) { - shape_j.orientation = quat(texFetchScalar4(d_orientation_old, implicit_orientation_old_tex, j)); + shape_j.orientation = quat(__ldg(d_orientation_old + j)); } // test depletant in sphere around new particle position @@ -565,7 +549,7 @@ __global__ void gpu_hpmc_implicit_count_overlaps_kernel(Scalar4 *d_postype, Shape shape_j(quat(), s_params[typ_j]); if (shape_j.hasOrientation()) { - shape_j.orientation = quat(texFetchScalar4(d_orientation_old, implicit_orientation_old_tex, j)); + shape_j.orientation = quat(__ldg(d_orientation_old + j)); } if (s_check_overlaps[overlap_idx(depletant_type, typ_j)] @@ -833,12 +817,12 @@ __global__ void gpu_hpmc_implicit_reinsert_kernel(Scalar4 *d_postype, if (active) { // load updated particle position - postype_i = texFetchScalar4(d_postype, implicit_postype_tex, idx_i); + postype_i = __ldg(d_postype + idx_i); Shape shape_i(quat(), s_params[__scalar_as_int(postype_i.w)]); if (shape_i.hasOrientation()) { - orientation_i = texFetchScalar4(d_orientation, implicit_orientation_tex, idx_i); + orientation_i = __ldg(d_orientation + idx_i); } } @@ -851,7 +835,7 @@ __global__ void gpu_hpmc_implicit_reinsert_kernel(Scalar4 *d_postype, vec3 pos_i_old; if (active) { - pos_i_old = vec3(texFetchScalar4(d_postype_old, implicit_postype_old_tex, idx_i)); + pos_i_old = vec3(__ldg(d_postype_old + idx_i)); } unsigned int excell_size; @@ -984,7 +968,7 @@ __global__ void gpu_hpmc_implicit_reinsert_kernel(Scalar4 *d_postype, vec3 r_ij = pos_i_old - pos_test; if (shape_i.hasOrientation()) { - shape_i.orientation = quat(texFetchScalar4(d_orientation_old, implicit_orientation_old_tex, idx_i)); + shape_i.orientation = quat(__ldg(d_orientation_old + idx_i)); } // if depletant can be inserted in excluded volume at old (new) position, success @@ -1056,11 +1040,7 @@ __global__ void gpu_hpmc_implicit_reinsert_kernel(Scalar4 *d_postype, unsigned int j, next_j = 0; if (k < excell_size) { - #if (__CUDA_ARCH__ > 300) next_j = __ldg(&d_excell_idx[excli(k, my_cell)]); - #else - next_j = d_excell_idx[excli(k, my_cell)]; - #endif } // add to the queue as long as the queue is not full, and we have not yet reached the end of our own list @@ -1082,15 +1062,11 @@ 
__global__ void gpu_hpmc_implicit_reinsert_kernel(Scalar4 *d_postype, if (k < excell_size) { - #if (__CUDA_ARCH__ > 300) next_j = __ldg(&d_excell_idx[excli(k, my_cell)]); - #else - next_j = d_excell_idx[excli(k, my_cell)]; - #endif } // read in position, and orientation of neighboring particle - postype_j = texFetchScalar4(d_postype_old, implicit_postype_old_tex, j); + postype_j = __ldg(d_postype_old + j); Shape shape_j(quat(), s_params[__scalar_as_int(postype_j.w)]); // put particle j into the coordinate system of depletant @@ -1148,12 +1124,12 @@ __global__ void gpu_hpmc_implicit_reinsert_kernel(Scalar4 *d_postype, Shape shape_i(quat(s_orientation_group[check_group]), s_params[depletant_type]); // build shape j from global memory - postype_j = texFetchScalar4(d_postype_old, implicit_postype_old_tex, check_j); + postype_j = __ldg(d_postype_old + check_j); orientation_j = make_scalar4(1,0,0,0); unsigned int typ_j = __scalar_as_int(postype_j.w); Shape shape_j(quat(orientation_j), s_params[typ_j]); if (shape_j.hasOrientation()) - shape_j.orientation = quat(texFetchScalar4(d_orientation_old, implicit_orientation_old_tex, check_j)); + shape_j.orientation = quat(__ldg(d_orientation_old + check_j)); // put particle j into the coordinate system of particle i r_ij = vec3(postype_j) - vec3(pos_i); @@ -1409,40 +1385,6 @@ struct CountOverlapsKernelLauncher n_active_cells = args.n_active_cells; } - // bind the textures - implicit_postype_tex.normalized = false; - implicit_postype_tex.filterMode = cudaFilterModePoint; - cudaError_t error = cudaBindTexture(0, implicit_postype_tex, args.d_postype, sizeof(Scalar4)*args.max_n); - if (error != cudaSuccess) - return error; - - implicit_postype_old_tex.normalized = false; - implicit_postype_old_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, implicit_postype_old_tex, args.d_postype_old, sizeof(Scalar4)*args.max_n); - if (error != cudaSuccess) - return error; - - if (args.has_orientation) - { - implicit_orientation_tex.normalized = false; - implicit_orientation_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, implicit_orientation_tex, args.d_orientation, sizeof(Scalar4)*args.max_n); - if (error != cudaSuccess) - return error; - - implicit_orientation_old_tex.normalized = false; - implicit_orientation_old_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, implicit_orientation_old_tex, args.d_orientation_old, sizeof(Scalar4)*args.max_n); - if (error != cudaSuccess) - return error; - } - - implicit_cell_idx_tex.normalized = false; - implicit_cell_idx_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, implicit_cell_idx_tex, args.d_cell_idx, sizeof(unsigned int)*args.cli.getNumElements()); - if (error != cudaSuccess) - return error; - unsigned int shared_bytes = args.num_types * (sizeof(typename Shape::param_type)) + args.overlap_idx.getNumElements() * sizeof(unsigned int); diff --git a/hoomd/hpmc/IntegratorHPMCMonoImplicitNewGPU.cuh b/hoomd/hpmc/IntegratorHPMCMonoImplicitNewGPU.cuh index 78666a505d..c75c49452a 100644 --- a/hoomd/hpmc/IntegratorHPMCMonoImplicitNewGPU.cuh +++ b/hoomd/hpmc/IntegratorHPMCMonoImplicitNewGPU.cuh @@ -188,15 +188,6 @@ cudaError_t gpu_hpmc_implicit_accept_reject_new(const hpmc_implicit_args_new_t & * Definition of function templates and templated GPU kernels */ -//! Texture for reading postype -scalar4_tex_t depletants_postype_tex; -//! Texture for reading orientation -scalar4_tex_t depletants_orientation_tex; -//! 
Texture for reading postype -scalar4_tex_t depletants_postype_old_tex; -//! Texture for reading orientation -scalar4_tex_t depletants_orientation_old_tex; - template< class Shape > __global__ void gpu_hpmc_insert_depletants_queue_kernel(Scalar4 *d_postype, Scalar4 *d_orientation, @@ -347,7 +338,7 @@ __global__ void gpu_hpmc_insert_depletants_queue_kernel(Scalar4 *d_postype, if (i == UINT_MAX || !d_active_cell_accept[active_cell_idx]) return; // load updated particle position - Scalar4 postype_i = texFetchScalar4(d_postype, depletants_postype_tex, i); + Scalar4 postype_i = __ldg(d_postype + i); unsigned int type_i = __scalar_as_int(postype_i.w); curandState_t local_state; @@ -403,7 +394,7 @@ __global__ void gpu_hpmc_insert_depletants_queue_kernel(Scalar4 *d_postype, Scalar r = Scalar(0.5)*d_max*fast::pow(r3,Scalar(1.0/3.0)); // test depletant position around old configuration - Scalar4 postype_i_old = texFetchScalar4(d_postype_old, depletants_postype_old_tex, i); + Scalar4 postype_i_old = __ldg(d_postype_old + i); vec3 pos_test = vec3(postype_i_old)+r*n; if (shape_test.hasOrientation()) @@ -418,7 +409,7 @@ __global__ void gpu_hpmc_insert_depletants_queue_kernel(Scalar4 *d_postype, Shape shape_i(quat(orientation_i), s_params[type_i]); if (shape_i.hasOrientation()) { - orientation_i = texFetchScalar4(d_orientation, depletants_orientation_tex, i); + orientation_i = __ldg(d_orientation + i); shape_i.orientation = quat(orientation_i); } @@ -450,7 +441,7 @@ __global__ void gpu_hpmc_insert_depletants_queue_kernel(Scalar4 *d_postype, Shape shape_i_old(quat(quat(orientation_i_old)), d_params[type_i]); if (shape_i_old.hasOrientation()) { - orientation_i_old = texFetchScalar4(d_orientation_old, depletants_orientation_old_tex, i); + orientation_i_old = __ldg(d_orientation_old + i); shape_i_old.orientation = quat(orientation_i_old); } @@ -508,11 +499,7 @@ __global__ void gpu_hpmc_insert_depletants_queue_kernel(Scalar4 *d_postype, unsigned int j, next_j = 0; if (k < excell_size) { - #if (__CUDA_ARCH__ > 300) next_j = __ldg(&d_excell_idx[excli(k, my_cell)]); - #else - next_j = d_excell_idx[excli(k, my_cell)]; - #endif } // add to the queue as long as the queue is not full, and we have not yet reached the end of our own list @@ -536,15 +523,11 @@ __global__ void gpu_hpmc_insert_depletants_queue_kernel(Scalar4 *d_postype, if (k < excell_size) { - #if (__CUDA_ARCH__ > 300) next_j = __ldg(&d_excell_idx[excli(k, my_cell)]); - #else - next_j = d_excell_idx[excli(k, my_cell)]; - #endif } // read in position, and orientation of neighboring particle - postype_j = texFetchScalar4(d_postype, depletants_postype_tex, j); + postype_j = __ldg(d_postype + j); unsigned int type_j = __scalar_as_int(postype_j.w); Shape shape_j(quat(orientation_j), s_params[type_j]); @@ -603,12 +586,12 @@ __global__ void gpu_hpmc_insert_depletants_queue_kernel(Scalar4 *d_postype, Shape shape_test(quat(s_orientation_group[check_group]), s_params[depletant_type]); // build shape j from global memory - postype_j = texFetchScalar4(d_postype, depletants_postype_tex, check_j); + postype_j = __ldg(d_postype + check_j); orientation_j = make_scalar4(1,0,0,0); unsigned int type_j = __scalar_as_int(postype_j.w); Shape shape_j(quat(orientation_j), s_params[type_j]); if (shape_j.hasOrientation()) - shape_j.orientation = quat(texFetchScalar4(d_orientation, depletants_orientation_tex, check_j)); + shape_j.orientation = quat(__ldg(d_orientation + check_j)); // put particle j into the coordinate system of particle i r_ij = vec3(postype_j) - vec3(pos_test); 
@@ -813,34 +796,6 @@ cudaError_t gpu_hpmc_insert_depletants_queue(const hpmc_implicit_args_new_t& arg // 1 block per active cell dim3 grid( args.n_active_cells, 1, 1); - // bind the textures - depletants_postype_tex.normalized = false; - depletants_postype_tex.filterMode = cudaFilterModePoint; - cudaError_t error = cudaBindTexture(0, depletants_postype_tex, args.d_postype, sizeof(Scalar4)*args.max_n); - if (error != cudaSuccess) - return error; - - depletants_postype_old_tex.normalized = false; - depletants_postype_old_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, depletants_postype_old_tex, args.d_postype_old, sizeof(Scalar4)*args.max_n); - if (error != cudaSuccess) - return error; - - if (args.has_orientation) - { - depletants_orientation_tex.normalized = false; - depletants_orientation_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, depletants_orientation_tex, args.d_orientation, sizeof(Scalar4)*args.max_n); - if (error != cudaSuccess) - return error; - - depletants_orientation_old_tex.normalized = false; - depletants_orientation_old_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, depletants_orientation_old_tex, args.d_orientation_old, sizeof(Scalar4)*args.max_n); - if (error != cudaSuccess) - return error; - } - // reset counters cudaMemsetAsync(args.d_overlap_cell,0, sizeof(unsigned int)*args.n_active_cells, args.stream); diff --git a/hoomd/md/AnisoPotentialPairGPU.cuh b/hoomd/md/AnisoPotentialPairGPU.cuh index d726f0e2b6..b0a8c0c738 100644 --- a/hoomd/md/AnisoPotentialPairGPU.cuh +++ b/hoomd/md/AnisoPotentialPairGPU.cuh @@ -118,17 +118,6 @@ struct a_pair_args_t }; #ifdef NVCC -//! Texture for reading particle positions -scalar4_tex_t aniso_pdata_pos_tex; - -//! Texture for reading particle quaternions -scalar4_tex_t aniso_pdata_quat_tex; - -//! Texture for reading particle diameters -scalar_tex_t aniso_pdata_diam_tex; - -//! Texture for reading particle charges -scalar_tex_t aniso_pdata_charge_tex; //! Kernel for calculating pair forces /*! This kernel is called to calculate the pair forces on all N particles. Actual evaluation of the potentials and @@ -223,18 +212,18 @@ __global__ void gpu_compute_pair_aniso_forces_kernel(Scalar4 *d_force, // read in the position of our particle. 
Texture reads of Scalar4's are faster than global reads on compute 1.0 hardware // (MEM TRANSFER: 16 bytes) - Scalar4 postypei = texFetchScalar4(d_pos, aniso_pdata_pos_tex, idx); + Scalar4 postypei = __ldg(d_pos + idx); Scalar3 posi = make_scalar3(postypei.x, postypei.y, postypei.z); - Scalar4 quati = texFetchScalar4(d_orientation,aniso_pdata_quat_tex, idx); + Scalar4 quati = __ldg(d_orientation + idx); Scalar di; if (evaluator::needsDiameter()) - di = texFetchScalar(d_diameter, aniso_pdata_diam_tex, idx); + di = __ldg(d_diameter + idx); else di += 1.0f; // shut up compiler warning Scalar qi; if (evaluator::needsCharge()) - qi = texFetchScalar(d_charge, aniso_pdata_charge_tex, idx); + qi = __ldg(d_charge + idx); else qi += 1.0f; // shut up compiler warning @@ -265,19 +254,19 @@ __global__ void gpu_compute_pair_aniso_forces_kernel(Scalar4 *d_force, next_j = d_nlist[myHead + neigh_idx + tpp]; // get the neighbor's position (MEM TRANSFER: 16 bytes) - Scalar4 postypej = texFetchScalar4(d_pos, aniso_pdata_pos_tex, cur_j); + Scalar4 postypej = __ldg(d_pos + cur_j); Scalar3 posj = make_scalar3(postypej.x, postypej.y, postypej.z); - Scalar4 quatj = texFetchScalar4(d_orientation, aniso_pdata_quat_tex, cur_j); + Scalar4 quatj = __ldg(d_orientation + cur_j); Scalar dj = 0.0f; if (evaluator::needsDiameter()) - dj = texFetchScalar(d_diameter, aniso_pdata_diam_tex, cur_j); + dj = __ldg(d_diameter + cur_j); else dj += 1.0f; // shut up compiler warning Scalar qj = 0.0f; if (evaluator::needsCharge()) - qj = texFetchScalar(d_charge, aniso_pdata_charge_tex, cur_j); + qj = __ldg(d_charge + cur_j); else qj += 1.0f; // shut up compiler warning @@ -413,28 +402,6 @@ int aniso_get_compute_capability(T func) return attr.binaryVersion; } -void gpu_pair_aniso_force_bind_textures(const a_pair_args_t pair_args) - { - // bind the position texture - aniso_pdata_pos_tex.normalized = false; - aniso_pdata_pos_tex.filterMode = cudaFilterModePoint; - cudaBindTexture(0, aniso_pdata_pos_tex, pair_args.d_pos, sizeof(Scalar4)*pair_args.n_max); - - // bind the position texture - aniso_pdata_quat_tex.normalized = false; - aniso_pdata_quat_tex.filterMode = cudaFilterModePoint; - cudaBindTexture(0, aniso_pdata_quat_tex, pair_args.d_orientation, sizeof(Scalar4)*pair_args.n_max); - - // bind the diameter texture - aniso_pdata_diam_tex.normalized = false; - aniso_pdata_diam_tex.filterMode = cudaFilterModePoint; - cudaBindTexture(0, aniso_pdata_diam_tex, pair_args.d_diameter, sizeof(Scalar) * pair_args.n_max); - - aniso_pdata_charge_tex.normalized = false; - aniso_pdata_charge_tex.filterMode = cudaFilterModePoint; - cudaBindTexture(0, aniso_pdata_charge_tex, pair_args.d_charge, sizeof(Scalar) * pair_args.n_max); - } - //! Kernel driver that computes lj forces on the GPU for LJForceComputeGPU /*! \param pair_args Other arguments to pass onto the kernel \param d_params Parameters for the potential, stored per type pair @@ -470,13 +437,8 @@ cudaError_t gpu_compute_pair_aniso_forces(const a_pair_args_t& pair_args, case 0: { static unsigned int max_block_size = UINT_MAX; - static unsigned int sm = UINT_MAX; if (max_block_size == UINT_MAX) max_block_size = aniso_get_max_block_size(gpu_compute_pair_aniso_forces_kernel); - if (sm == UINT_MAX) - sm = aniso_get_compute_capability(gpu_compute_pair_aniso_forces_kernel); - - if (sm < 35) gpu_pair_aniso_force_bind_textures(pair_args); block_size = block_size < max_block_size ? 
block_size : max_block_size; dim3 grid(nwork / (block_size/tpp) + 1, 1, 1); @@ -507,13 +469,8 @@ cudaError_t gpu_compute_pair_aniso_forces(const a_pair_args_t& pair_args, case 1: { static unsigned int max_block_size = UINT_MAX; - static unsigned int sm = UINT_MAX; if (max_block_size == UINT_MAX) max_block_size = aniso_get_max_block_size(gpu_compute_pair_aniso_forces_kernel); - if (sm == UINT_MAX) - sm = aniso_get_compute_capability(gpu_compute_pair_aniso_forces_kernel); - - if (sm < 35) gpu_pair_aniso_force_bind_textures(pair_args); block_size = block_size < max_block_size ? block_size : max_block_size; dim3 grid(nwork / (block_size/tpp) + 1, 1, 1); @@ -552,13 +509,9 @@ cudaError_t gpu_compute_pair_aniso_forces(const a_pair_args_t& pair_args, case 0: { static unsigned int max_block_size = UINT_MAX; - static unsigned int sm = UINT_MAX; if (max_block_size == UINT_MAX) max_block_size = aniso_get_max_block_size(gpu_compute_pair_aniso_forces_kernel); - if (sm == UINT_MAX) - sm = aniso_get_compute_capability(gpu_compute_pair_aniso_forces_kernel); - if (sm < 35) gpu_pair_aniso_force_bind_textures(pair_args); block_size = block_size < max_block_size ? block_size : max_block_size; dim3 grid(nwork / (block_size/tpp) + 1, 1, 1); @@ -589,21 +542,11 @@ cudaError_t gpu_compute_pair_aniso_forces(const a_pair_args_t& pair_args, case 1: { static unsigned int max_block_size = UINT_MAX; - static unsigned int sm = UINT_MAX; if (max_block_size == UINT_MAX) max_block_size = aniso_get_max_block_size(gpu_compute_pair_aniso_forces_kernel); - if (sm == UINT_MAX) - sm = aniso_get_compute_capability(gpu_compute_pair_aniso_forces_kernel); - - if (sm < 35) gpu_pair_aniso_force_bind_textures(pair_args); block_size = block_size < max_block_size ? block_size : max_block_size; dim3 grid(nwork / (block_size/tpp) + 1, 1, 1); - if (sm < 30 && grid.x > 65535) - { - grid.y = grid.x/65535 + 1; - grid.x = 65535; - } shared_bytes += sizeof(Scalar)*block_size; diff --git a/hoomd/md/BondTablePotentialGPU.cc b/hoomd/md/BondTablePotentialGPU.cc index e56d1f560c..cc3e25ba3e 100644 --- a/hoomd/md/BondTablePotentialGPU.cc +++ b/hoomd/md/BondTablePotentialGPU.cc @@ -93,8 +93,7 @@ void BondTablePotentialGPU::computeForces(unsigned int timestep) m_table_width, m_table_value, d_flags.data, - m_tuner->getParam(), - m_exec_conf->getComputeCapability()); + m_tuner->getParam()); } diff --git a/hoomd/md/BondTablePotentialGPU.cu b/hoomd/md/BondTablePotentialGPU.cu index 538f209e72..dcade60489 100644 --- a/hoomd/md/BondTablePotentialGPU.cu +++ b/hoomd/md/BondTablePotentialGPU.cu @@ -14,10 +14,6 @@ \brief Defines GPU kernel code for calculating the table bond forces. Used by BondTablePotentialGPU. */ - -//! Texture for reading table values -scalar2_tex_t tables_tex; - /*! This kernel is called to calculate the table pair forces on all N particles \param d_force Device memory to write computed forces @@ -35,10 +31,6 @@ scalar2_tex_t tables_tex; \param d_flags Flag allocated on the device for use in checking for bonds that cannot be evaluated See BondTablePotential for information on the memory layout. - - \b Details: - * Table entries are read from tables_tex. Note that currently this is bound to a 1D memory region. Performance tests - at a later date may result in this changing. 
*/ __global__ void gpu_compute_bondtable_forces_kernel(Scalar4* d_force, Scalar* d_virial, @@ -124,8 +116,8 @@ __global__ void gpu_compute_bondtable_forces_kernel(Scalar4* d_force, // compute index into the table and read in values unsigned int value_i = floor(value_f); - Scalar2 VF0 = texFetchScalar2(d_tables, tables_tex, table_value(value_i, cur_bond_type)); - Scalar2 VF1 = texFetchScalar2(d_tables, tables_tex, table_value(value_i+1, cur_bond_type)); + Scalar2 VF0 = __ldg(d_tables + table_value(value_i, cur_bond_type)); + Scalar2 VF1 = __ldg(d_tables + table_value(value_i+1, cur_bond_type)); // unpack the data Scalar V0 = VF0.x; Scalar V1 = VF1.x; @@ -190,7 +182,6 @@ __global__ void gpu_compute_bondtable_forces_kernel(Scalar4* d_force, \param d_flags flags on the device - a 1 will be written if evaluation of forces failed for any bond \param block_size Block size at which to run the kernel - \param compute_capability Compute capability of the execution device (200, 3000, 350, ...) \note This is just a kernel driver. See gpu_compute_bondtable_forces_kernel for full documentation. */ @@ -209,8 +200,7 @@ cudaError_t gpu_compute_bondtable_forces(Scalar4* d_force, const unsigned int table_width, const Index2D &table_value, unsigned int *d_flags, - const unsigned int block_size, - const unsigned int compute_capability) + const unsigned int block_size) { assert(d_params); assert(d_tables); @@ -231,16 +221,6 @@ cudaError_t gpu_compute_bondtable_forces(Scalar4* d_force, dim3 grid( N / run_block_size + 1, 1, 1); dim3 threads(run_block_size, 1, 1); - // bind the tables texture only on pre sm 35 arches - if (compute_capability < 350) - { - tables_tex.normalized = false; - tables_tex.filterMode = cudaFilterModePoint; - cudaError_t error = cudaBindTexture(0, tables_tex, d_tables, sizeof(Scalar2) * table_value.getNumElements()); - if (error != cudaSuccess) - return error; - } - gpu_compute_bondtable_forces_kernel<<< grid, threads, sizeof(Scalar4)*n_bond_type >>> (d_force, d_virial, diff --git a/hoomd/md/BondTablePotentialGPU.cuh b/hoomd/md/BondTablePotentialGPU.cuh index cb238c5729..b85b5bb4b2 100644 --- a/hoomd/md/BondTablePotentialGPU.cuh +++ b/hoomd/md/BondTablePotentialGPU.cuh @@ -32,7 +32,6 @@ cudaError_t gpu_compute_bondtable_forces(Scalar4* d_force, const unsigned int table_width, const Index2D &table_value, unsigned int *d_flags, - const unsigned int block_size, - const unsigned int compute_capability); + const unsigned int block_size); #endif diff --git a/hoomd/md/CosineSqAngleForceComputeGPU.cc b/hoomd/md/CosineSqAngleForceComputeGPU.cc index 7213e05ec0..0d5b3f0672 100644 --- a/hoomd/md/CosineSqAngleForceComputeGPU.cc +++ b/hoomd/md/CosineSqAngleForceComputeGPU.cc @@ -92,8 +92,7 @@ void CosineSqAngleForceComputeGPU::computeForces(unsigned int timestep) d_gpu_n_angles.data, d_params.data, m_angle_data->getNTypes(), - m_tuner->getParam(), - m_exec_conf->getComputeCapability()); + m_tuner->getParam()); if(m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR(); diff --git a/hoomd/md/CosineSqAngleForceGPU.cu b/hoomd/md/CosineSqAngleForceGPU.cu index 4e010140b8..1931d6b73d 100644 --- a/hoomd/md/CosineSqAngleForceGPU.cu +++ b/hoomd/md/CosineSqAngleForceGPU.cu @@ -16,9 +16,6 @@ CosineSqAngleForceComputeGPU. */ -//! Texture for reading angle parameters -scalar2_tex_t angle_params_tex; - //! Kernel for calculating cosine squared angle forces on the GPU /*! 
\param d_force Device memory to write computed forces \param d_virial Device memory to write computed virials @@ -115,7 +112,7 @@ extern "C" __global__ void gpu_compute_cosinesq_angle_forces_kernel(Scalar4* d_f dac = box.minImage(dac); // get the angle parameters (MEM TRANSFER: 8 bytes) - Scalar2 params = texFetchScalar2(d_params, angle_params_tex, cur_angle_type); + Scalar2 params = __ldg(d_params + cur_angle_type); Scalar K = params.x; Scalar t_0 = params.y; @@ -205,7 +202,6 @@ extern "C" __global__ void gpu_compute_cosinesq_angle_forces_kernel(Scalar4* d_f \param d_params K and t_0 params packed as Scalar2 variables \param n_angle_types Number of angle types in d_params \param block_size Block size to use when performing calculations - \param compute_capability Device compute capability (200, 300, 350, ...) \returns Any error code resulting from the kernel launch \note Always returns cudaSuccess in release builds to avoid the cudaThreadSynchronize() @@ -225,8 +221,7 @@ cudaError_t gpu_compute_cosinesq_angle_forces(Scalar4* d_force, const unsigned int *n_angles_list, Scalar2 *d_params, unsigned int n_angle_types, - int block_size, - const unsigned int compute_capability) + int block_size) { assert(d_params); @@ -244,14 +239,6 @@ cudaError_t gpu_compute_cosinesq_angle_forces(Scalar4* d_force, dim3 grid( N / run_block_size + 1, 1, 1); dim3 threads(run_block_size, 1, 1); - // bind the texture on pre sm 35 arches - if (compute_capability < 350) - { - cudaError_t error = cudaBindTexture(0, angle_params_tex, d_params, sizeof(Scalar2) * n_angle_types); - if (error != cudaSuccess) - return error; - } - // run the kernel gpu_compute_cosinesq_angle_forces_kernel<<< grid, threads>>>( d_force, d_virial, virial_pitch, N, d_pos, d_params, box, diff --git a/hoomd/md/CosineSqAngleForceGPU.cuh b/hoomd/md/CosineSqAngleForceGPU.cuh index 05cfd03e4d..48d4895728 100644 --- a/hoomd/md/CosineSqAngleForceGPU.cuh +++ b/hoomd/md/CosineSqAngleForceGPU.cuh @@ -28,7 +28,6 @@ cudaError_t gpu_compute_cosinesq_angle_forces(Scalar4* d_force, const unsigned int *n_angles_list, Scalar2 *d_params, unsigned int n_angle_types, - int block_size, - const unsigned int compute_capability); + int block_size); #endif diff --git a/hoomd/md/HarmonicAngleForceComputeGPU.cc b/hoomd/md/HarmonicAngleForceComputeGPU.cc index 79070ae556..891316b0f0 100644 --- a/hoomd/md/HarmonicAngleForceComputeGPU.cc +++ b/hoomd/md/HarmonicAngleForceComputeGPU.cc @@ -94,8 +94,7 @@ void HarmonicAngleForceComputeGPU::computeForces(unsigned int timestep) d_gpu_n_angles.data, d_params.data, m_angle_data->getNTypes(), - m_tuner->getParam(), - m_exec_conf->getComputeCapability()); + m_tuner->getParam()); if(m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR(); diff --git a/hoomd/md/HarmonicAngleForceGPU.cu b/hoomd/md/HarmonicAngleForceGPU.cu index 7f1925721a..abca8c4eb0 100644 --- a/hoomd/md/HarmonicAngleForceGPU.cu +++ b/hoomd/md/HarmonicAngleForceGPU.cu @@ -16,9 +16,6 @@ \brief Defines GPU kernel code for calculating the harmonic angle forces. Used by HarmonicAngleForceComputeGPU. */ -//! Texture for reading angle parameters -scalar2_tex_t angle_params_tex; - //! Kernel for calculating harmonic angle forces on the GPU /*! 
\param d_force Device memory to write computed forces \param d_virial Device memory to write computed virials @@ -115,7 +112,7 @@ extern "C" __global__ void gpu_compute_harmonic_angle_forces_kernel(Scalar4* d_f dac = box.minImage(dac); // get the angle parameters (MEM TRANSFER: 8 bytes) - Scalar2 params = texFetchScalar2(d_params, angle_params_tex, cur_angle_type); + Scalar2 params = __ldg(d_params + cur_angle_type); Scalar K = params.x; Scalar t_0 = params.y; @@ -227,8 +224,7 @@ cudaError_t gpu_compute_harmonic_angle_forces(Scalar4* d_force, const unsigned int *n_angles_list, Scalar2 *d_params, unsigned int n_angle_types, - int block_size, - const unsigned int compute_capability) + int block_size) { assert(d_params); @@ -246,14 +242,6 @@ cudaError_t gpu_compute_harmonic_angle_forces(Scalar4* d_force, dim3 grid( N / run_block_size + 1, 1, 1); dim3 threads(run_block_size, 1, 1); - // bind the texture on pre sm 35 arches - if (compute_capability < 350) - { - cudaError_t error = cudaBindTexture(0, angle_params_tex, d_params, sizeof(Scalar2) * n_angle_types); - if (error != cudaSuccess) - return error; - } - // run the kernel gpu_compute_harmonic_angle_forces_kernel<<< grid, threads>>>(d_force, d_virial, virial_pitch, N, d_pos, d_params, box, atable, apos_list, pitch, n_angles_list); diff --git a/hoomd/md/HarmonicAngleForceGPU.cuh b/hoomd/md/HarmonicAngleForceGPU.cuh index 49e3f4758f..a18b1eedd9 100644 --- a/hoomd/md/HarmonicAngleForceGPU.cuh +++ b/hoomd/md/HarmonicAngleForceGPU.cuh @@ -28,7 +28,6 @@ cudaError_t gpu_compute_harmonic_angle_forces(Scalar4* d_force, const unsigned int *n_angles_list, Scalar2 *d_params, unsigned int n_angle_types, - int block_size, - const unsigned int compute_capability); + int block_size); #endif diff --git a/hoomd/md/HarmonicDihedralForceComputeGPU.cc b/hoomd/md/HarmonicDihedralForceComputeGPU.cc index a9f594156a..13d61eaefb 100644 --- a/hoomd/md/HarmonicDihedralForceComputeGPU.cc +++ b/hoomd/md/HarmonicDihedralForceComputeGPU.cc @@ -93,8 +93,7 @@ void HarmonicDihedralForceComputeGPU::computeForces(unsigned int timestep) d_n_dihedrals.data, d_params.data, m_dihedral_data->getNTypes(), - this->m_tuner->getParam(), - m_exec_conf->getComputeCapability()); + this->m_tuner->getParam()); if(m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR(); this->m_tuner->end(); diff --git a/hoomd/md/HarmonicDihedralForceGPU.cu b/hoomd/md/HarmonicDihedralForceGPU.cu index 4c17f64365..0ddd15183b 100644 --- a/hoomd/md/HarmonicDihedralForceGPU.cu +++ b/hoomd/md/HarmonicDihedralForceGPU.cu @@ -22,9 +22,6 @@ \brief Defines GPU kernel code for calculating the harmonic dihedral forces. Used by HarmonicDihedralForceComputeGPU. */ -//! Texture for reading dihedral parameters -scalar4_tex_t dihedral_params_tex; - //! Kernel for calculating harmonic dihedral forces on the GPU /*! 
\param d_force Device memory to write computed forces \param d_virial Device memory to write computed virials @@ -137,7 +134,7 @@ void gpu_compute_harmonic_dihedral_forces_kernel(Scalar4* d_force, dcbm = box.minImage(dcbm); // get the dihedral parameters (MEM TRANSFER: 12 bytes) - Scalar4 params = texFetchScalar4(d_params, dihedral_params_tex, cur_dihedral_type); + Scalar4 params = __ldg(d_params + cur_dihedral_type); Scalar K = params.x; Scalar sign = params.y; Scalar multi = params.z; @@ -319,8 +316,7 @@ cudaError_t gpu_compute_harmonic_dihedral_forces(Scalar4* d_force, const unsigned int *n_dihedrals_list, Scalar4 *d_params, unsigned int n_dihedral_types, - int block_size, - const unsigned int compute_capability) + int block_size) { assert(d_params); @@ -338,14 +334,6 @@ cudaError_t gpu_compute_harmonic_dihedral_forces(Scalar4* d_force, dim3 grid( N / run_block_size + 1, 1, 1); dim3 threads(run_block_size, 1, 1); - // bind the texture on pre sm35 devices - if (compute_capability < 350) - { - cudaError_t error = cudaBindTexture(0, dihedral_params_tex, d_params, sizeof(Scalar4) * n_dihedral_types); - if (error != cudaSuccess) - return error; - } - // run the kernel gpu_compute_harmonic_dihedral_forces_kernel<<< grid, threads>>>(d_force, d_virial, virial_pitch, N, d_pos, d_params, box, tlist, dihedral_ABCD, pitch, n_dihedrals_list); diff --git a/hoomd/md/HarmonicDihedralForceGPU.cuh b/hoomd/md/HarmonicDihedralForceGPU.cuh index a375c1507f..60276df35f 100644 --- a/hoomd/md/HarmonicDihedralForceGPU.cuh +++ b/hoomd/md/HarmonicDihedralForceGPU.cuh @@ -28,7 +28,6 @@ cudaError_t gpu_compute_harmonic_dihedral_forces(Scalar4* d_force, const unsigned int *n_dihedrals_list, Scalar4 *d_params, unsigned int n_dihedral_types, - int block_size, - const unsigned int compute_capability); + int block_size); #endif diff --git a/hoomd/md/HarmonicImproperForceComputeGPU.cc b/hoomd/md/HarmonicImproperForceComputeGPU.cc index a27b88cf7a..ef768358e2 100644 --- a/hoomd/md/HarmonicImproperForceComputeGPU.cc +++ b/hoomd/md/HarmonicImproperForceComputeGPU.cc @@ -91,8 +91,7 @@ void HarmonicImproperForceComputeGPU::computeForces(unsigned int timestep) d_n_dihedrals.data, d_params.data, m_improper_data->getNTypes(), - m_tuner->getParam(), - m_exec_conf->getComputeCapability()); + m_tuner->getParam()); if(m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR(); m_tuner->end(); diff --git a/hoomd/md/HarmonicImproperForceGPU.cu b/hoomd/md/HarmonicImproperForceGPU.cu index 969e0c7da8..89c896e706 100644 --- a/hoomd/md/HarmonicImproperForceGPU.cu +++ b/hoomd/md/HarmonicImproperForceGPU.cu @@ -16,9 +16,6 @@ \brief Defines GPU kernel code for calculating the harmonic improper forces. Used by HarmonicImproperForceComputeGPU. */ -//! Texture for reading improper parameters -scalar2_tex_t improper_params_tex; - //! Kernel for calculating harmonic improper forces on the GPU /*! 
\param d_force Device memory to write computed forces \param d_virial Device memory to write computed virials @@ -129,7 +126,7 @@ void gpu_compute_harmonic_improper_forces_kernel(Scalar4* d_force, ddc = box.minImage(ddc); // get the improper parameters (MEM TRANSFER: 12 bytes) - Scalar2 params = texFetchScalar2(d_params, improper_params_tex, cur_improper_type); + Scalar2 params = __ldg(d_params + cur_improper_type); Scalar K = params.x; Scalar chi = params.y; @@ -281,8 +278,7 @@ cudaError_t gpu_compute_harmonic_improper_forces(Scalar4* d_force, const unsigned int *n_dihedrals_list, Scalar2 *d_params, unsigned int n_improper_types, - int block_size, - const unsigned int compute_capability) + int block_size) { assert(d_params); @@ -303,14 +299,6 @@ cudaError_t gpu_compute_harmonic_improper_forces(Scalar4* d_force, dim3 grid( N / run_block_size + 1, 1, 1); dim3 threads(run_block_size, 1, 1); - // bind the texture on pre sm35 devices - if (compute_capability < 350) - { - cudaError_t error = cudaBindTexture(0, improper_params_tex, d_params, sizeof(Scalar2) * n_improper_types); - if (error != cudaSuccess) - return error; - } - // run the kernel gpu_compute_harmonic_improper_forces_kernel<<< grid, threads>>>(d_force, d_virial, virial_pitch, N, d_pos, d_params, box, tlist, dihedral_ABCD, pitch, n_dihedrals_list); diff --git a/hoomd/md/HarmonicImproperForceGPU.cuh b/hoomd/md/HarmonicImproperForceGPU.cuh index 7c4dc4fdb1..2a248bf75a 100644 --- a/hoomd/md/HarmonicImproperForceGPU.cuh +++ b/hoomd/md/HarmonicImproperForceGPU.cuh @@ -28,7 +28,6 @@ cudaError_t gpu_compute_harmonic_improper_forces(Scalar4* d_force, const unsigned int *n_dihedrals_list, Scalar2 *d_params, unsigned int n_improper_types, - int block_size, - const unsigned int compute_capability); + int block_size); #endif diff --git a/hoomd/md/NeighborListGPUBinned.cu b/hoomd/md/NeighborListGPUBinned.cu index f33c5f8cc3..de3f6e799a 100644 --- a/hoomd/md/NeighborListGPUBinned.cu +++ b/hoomd/md/NeighborListGPUBinned.cu @@ -12,9 +12,6 @@ \brief Defines GPU kernel code for O(N) neighbor list generation on the GPU */ -//! Texture for reading d_cell_xyzf -scalar4_tex_t cell_xyzf_1d_tex; - //! Kernel call for generating neighbor list on the GPU (Kepler optimized version) /*! \tparam flags Set bit 1 to enable body filtering. Set bit 2 to enable diameter filtering. \param d_nlist Neighbor list data structure to write @@ -196,7 +193,7 @@ __global__ void gpu_compute_nlist_binned_kernel(unsigned int *d_nlist, unsigned int j; Scalar4 postype_j; if (!use_index) - cur_xyzf = texFetchScalar4(d_cell_xyzf, cell_xyzf_1d_tex, cli(cur_offset, neigh_cell)); + cur_xyzf = __ldg(d_cell_xyzf + cli(cur_offset, neigh_cell)); else { j = d_cell_idx[cli(cur_offset, neigh_cell)+igpu*cli.getNumElements()]; @@ -300,14 +297,6 @@ int get_max_block_size(T func) return max_threads; } -void gpu_nlist_binned_bind_texture(const Scalar4 *d_cell_xyzf, unsigned int n_elements) - { - // bind the position texture - cell_xyzf_1d_tex.normalized = false; - cell_xyzf_1d_tex.filterMode = cudaFilterModePoint; - cudaBindTexture(0, cell_xyzf_1d_tex, d_cell_xyzf, sizeof(Scalar4)*n_elements); - } - //! 
recursive template to launch neighborlist with given template parameters /* \tparam cur_tpp Number of threads per particle (assumed to be power of two) */ template @@ -359,7 +348,6 @@ inline void launcher(unsigned int *d_nlist, static unsigned int max_block_size = UINT_MAX; if (max_block_size == UINT_MAX) max_block_size = get_max_block_size(gpu_compute_nlist_binned_kernel<0,0,cur_tpp>); - if (compute_capability < 35) gpu_nlist_binned_bind_texture(d_cell_xyzf, cli.getNumElements()); block_size = block_size < max_block_size ? block_size : max_block_size; dim3 grid(nwork / (block_size/tpp) + 1); @@ -396,7 +384,6 @@ inline void launcher(unsigned int *d_nlist, static unsigned int max_block_size = UINT_MAX; if (max_block_size == UINT_MAX) max_block_size = get_max_block_size(gpu_compute_nlist_binned_kernel<1,0,cur_tpp>); - if (compute_capability < 35) gpu_nlist_binned_bind_texture(d_cell_xyzf, cli.getNumElements()); block_size = block_size < max_block_size ? block_size : max_block_size; dim3 grid(nwork / (block_size/tpp) + 1); @@ -433,7 +420,6 @@ inline void launcher(unsigned int *d_nlist, static unsigned int max_block_size = UINT_MAX; if (max_block_size == UINT_MAX) max_block_size = get_max_block_size(gpu_compute_nlist_binned_kernel<2,0,cur_tpp>); - if (compute_capability < 35) gpu_nlist_binned_bind_texture(d_cell_xyzf, cli.getNumElements()); block_size = block_size < max_block_size ? block_size : max_block_size; dim3 grid(nwork / (block_size/tpp) + 1); @@ -470,7 +456,6 @@ inline void launcher(unsigned int *d_nlist, static unsigned int max_block_size = UINT_MAX; if (max_block_size == UINT_MAX) max_block_size = get_max_block_size(gpu_compute_nlist_binned_kernel<3,0,cur_tpp>); - if (compute_capability < 35) gpu_nlist_binned_bind_texture(d_cell_xyzf, cli.getNumElements()); block_size = block_size < max_block_size ? block_size : max_block_size; dim3 grid(nwork / (block_size/tpp) + 1); @@ -510,7 +495,6 @@ inline void launcher(unsigned int *d_nlist, static unsigned int max_block_size = UINT_MAX; if (max_block_size == UINT_MAX) max_block_size = get_max_block_size(gpu_compute_nlist_binned_kernel<0,1,cur_tpp>); - if (compute_capability < 35) gpu_nlist_binned_bind_texture(d_cell_xyzf, cli.getNumElements()); block_size = block_size < max_block_size ? block_size : max_block_size; dim3 grid(nwork / (block_size/tpp) + 1); @@ -547,7 +531,6 @@ inline void launcher(unsigned int *d_nlist, static unsigned int max_block_size = UINT_MAX; if (max_block_size == UINT_MAX) max_block_size = get_max_block_size(gpu_compute_nlist_binned_kernel<1,1,cur_tpp>); - if (compute_capability < 35) gpu_nlist_binned_bind_texture(d_cell_xyzf, cli.getNumElements()); block_size = block_size < max_block_size ? block_size : max_block_size; dim3 grid(nwork / (block_size/tpp) + 1); @@ -584,7 +567,6 @@ inline void launcher(unsigned int *d_nlist, static unsigned int max_block_size = UINT_MAX; if (max_block_size == UINT_MAX) max_block_size = get_max_block_size(gpu_compute_nlist_binned_kernel<2,1,cur_tpp>); - if (compute_capability < 35) gpu_nlist_binned_bind_texture(d_cell_xyzf, cli.getNumElements()); block_size = block_size < max_block_size ? 
block_size : max_block_size; dim3 grid(nwork / (block_size/tpp) + 1); @@ -621,7 +603,6 @@ inline void launcher(unsigned int *d_nlist, static unsigned int max_block_size = UINT_MAX; if (max_block_size == UINT_MAX) max_block_size = get_max_block_size(gpu_compute_nlist_binned_kernel<3,1,cur_tpp>); - if (compute_capability < 35) gpu_nlist_binned_bind_texture(d_cell_xyzf, cli.getNumElements()); block_size = block_size < max_block_size ? block_size : max_block_size; dim3 grid(nwork / (block_size/tpp) + 1); diff --git a/hoomd/md/NeighborListGPUStencil.cc b/hoomd/md/NeighborListGPUStencil.cc index c9a06f5ce9..7413326aae 100644 --- a/hoomd/md/NeighborListGPUStencil.cc +++ b/hoomd/md/NeighborListGPUStencil.cc @@ -312,8 +312,7 @@ void NeighborListGPUStencil::buildNlist(unsigned int timestep) m_filter_body, m_diameter_shift, threads_per_particle, - block_size, - m_exec_conf->getComputeCapability()/10); + block_size); if(m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR(); if (tune) this->m_tuner->end(); diff --git a/hoomd/md/NeighborListGPUStencil.cu b/hoomd/md/NeighborListGPUStencil.cu index 1fb9318c74..48e037c655 100644 --- a/hoomd/md/NeighborListGPUStencil.cu +++ b/hoomd/md/NeighborListGPUStencil.cu @@ -13,15 +13,6 @@ \brief Defines GPU kernel code for O(N) neighbor list generation on the GPU with multiple bin stencils */ -//! Texture for reading d_cell_xyzf -scalar4_tex_t cell_xyzf_1d_tex; - -//! Texture for reading d_cell_tdb -scalar4_tex_t cell_tdb_1d_tex; - -//! Texture for reading d_stencil -scalar4_tex_t stencil_1d_tex; - //! Kernel call for generating neighbor list on the GPU using multiple stencils (Kepler optimized version) /*! \tparam flags Set bit 1 to enable body filtering. Set bit 2 to enable diameter filtering. \tparam threads_per_particle Number of threads cooperatively computing the neighbor list @@ -179,7 +170,7 @@ __global__ void gpu_compute_nlist_stencil_kernel(unsigned int *d_nlist, if (cur_adj < n_stencil) { // compute the stenciled cell cartesian coordinates - Scalar4 stencil = texFetchScalar4(d_stencil, stencil_1d_tex, stencil_idx(cur_adj, my_type)); + Scalar4 stencil = __ldg(d_stencil + stencil_idx(cur_adj, my_type)); int sib = ib + __scalar_as_int(stencil.x); int sjb = jb + __scalar_as_int(stencil.y); int skb = kb + __scalar_as_int(stencil.z); @@ -212,7 +203,7 @@ __global__ void gpu_compute_nlist_stencil_kernel(unsigned int *d_nlist, do { // read in the particle type (diameter and body as well while we've got the Scalar4 in) - const Scalar4 neigh_tdb = texFetchScalar4(d_cell_tdb, cell_tdb_1d_tex, cli(cur_offset, neigh_cell)); + const Scalar4 neigh_tdb = __ldg(d_cell_tdb + cli(cur_offset, neigh_cell)); const unsigned int type_j = __scalar_as_int(neigh_tdb.x); const Scalar diam_j = neigh_tdb.y; const unsigned int body_j = __scalar_as_int(neigh_tdb.z); @@ -237,7 +228,7 @@ __global__ void gpu_compute_nlist_stencil_kernel(unsigned int *d_nlist, if (cell_dist2 > r_listsq) break; // only load in the particle position and id if distance check is required - const Scalar4 neigh_xyzf = texFetchScalar4(d_cell_xyzf, cell_xyzf_1d_tex, cli(cur_offset, neigh_cell)); + const Scalar4 neigh_xyzf = __ldg(d_cell_xyzf + cli(cur_offset, neigh_cell)); const Scalar3 neigh_pos = make_scalar3(neigh_xyzf.x, neigh_xyzf.y, neigh_xyzf.z); unsigned int cur_neigh = __scalar_as_int(neigh_xyzf.w); @@ -301,36 +292,6 @@ int get_max_block_size_stencil(T func) return max_threads; } -//! Bind the textures on sm <= 30 -/*! 
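// Illustrative sketch (not part of this patch): the neighbor list launchers touched
// above dispatch on threads per particle through a recursive function template over
// power-of-two tpp values, and this patch only removes the texture-binding calls from
// inside them. example_kernel and example_launcher are placeholder names used solely
// to show the dispatch pattern.
#include <cuda_runtime.h>

template<int cur_tpp>
__global__ void example_kernel(const unsigned int *d_nlist, unsigned int N)
    {
    // a real kernel strides over each particle's neighbors with step cur_tpp
    }

template<int cur_tpp>
inline void example_launcher(const unsigned int *d_nlist,
                             unsigned int N,
                             unsigned int block_size,
                             unsigned int tpp)
    {
    if (tpp == cur_tpp)
        {
        // launch the instantiation matching the requested threads per particle
        dim3 grid(N / (block_size / tpp) + 1);
        example_kernel<cur_tpp><<<grid, block_size>>>(d_nlist, N);
        }
    else
        {
        // otherwise recurse to the next smaller power of two
        example_launcher<cur_tpp / 2>(d_nlist, N, block_size, tpp);
        }
    }

//! terminates the compile-time recursion
template<>
inline void example_launcher<0>(const unsigned int *, unsigned int, unsigned int, unsigned int)
    {
    }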
- * \param d_cell_xyzf Cell list particle array - * \param d_cell_tdb Cell list type-diameter-body array - * \param n_elements Number of elements in the cell list arrays - * \param d_stencil Stencil offset array - * \param n_stencil_elements Number of elements in the stencil offset array - */ -void gpu_nlist_stencil_bind_texture(const Scalar4 *d_cell_xyzf, - const Scalar4 *d_cell_tdb, - unsigned int n_elements, - const Scalar4 *d_stencil, - unsigned int n_stencil_elements) - { - // bind the position texture - cell_xyzf_1d_tex.normalized = false; - cell_xyzf_1d_tex.filterMode = cudaFilterModePoint; - cudaBindTexture(0, cell_xyzf_1d_tex, d_cell_xyzf, sizeof(Scalar4)*n_elements); - - // bind the position texture - cell_tdb_1d_tex.normalized = false; - cell_tdb_1d_tex.filterMode = cudaFilterModePoint; - cudaBindTexture(0, cell_tdb_1d_tex, d_cell_tdb, sizeof(Scalar4)*n_elements); - - // bind the stencil texture - stencil_1d_tex.normalized = false; - stencil_1d_tex.filterMode = cudaFilterModePoint; - cudaBindTexture(0, stencil_1d_tex, d_stencil, sizeof(Scalar4)*n_stencil_elements); - } - //! recursive template to launch neighborlist with given template parameters /* \tparam cur_tpp Number of threads per particle (assumed to be power of two) */ template @@ -361,8 +322,7 @@ inline void stencil_launcher(unsigned int *d_nlist, bool filter_body, bool diameter_shift, const unsigned int threads_per_particle, - const unsigned int block_size, - const unsigned int compute_capability) + const unsigned int block_size) { // shared memory = r_listsq + Nmax + stuff needed for neighborlist (computed below) Index2D typpair_idx(ntypes); @@ -375,11 +335,6 @@ inline void stencil_launcher(unsigned int *d_nlist, static unsigned int max_block_size = UINT_MAX; if (max_block_size == UINT_MAX) max_block_size = get_max_block_size_stencil(gpu_compute_nlist_stencil_kernel<0,cur_tpp>); - if (compute_capability < 35) gpu_nlist_stencil_bind_texture(d_cell_xyzf, - d_cell_tdb, - cli.getNumElements(), - d_stencil, - stencil_idx.getNumElements()); unsigned int run_block_size = (block_size < max_block_size) ? block_size : max_block_size; dim3 grid(N / (block_size/threads_per_particle) + 1); @@ -413,11 +368,6 @@ inline void stencil_launcher(unsigned int *d_nlist, static unsigned int max_block_size = UINT_MAX; if (max_block_size == UINT_MAX) max_block_size = get_max_block_size_stencil(gpu_compute_nlist_stencil_kernel<1,cur_tpp>); - if (compute_capability < 35) gpu_nlist_stencil_bind_texture(d_cell_xyzf, - d_cell_tdb, - cli.getNumElements(), - d_stencil, - stencil_idx.getNumElements()); unsigned int run_block_size = (block_size < max_block_size) ? block_size : max_block_size; dim3 grid(N / (block_size/threads_per_particle) + 1); @@ -451,11 +401,6 @@ inline void stencil_launcher(unsigned int *d_nlist, static unsigned int max_block_size = UINT_MAX; if (max_block_size == UINT_MAX) max_block_size = get_max_block_size_stencil(gpu_compute_nlist_stencil_kernel<2,cur_tpp>); - if (compute_capability < 35) gpu_nlist_stencil_bind_texture(d_cell_xyzf, - d_cell_tdb, - cli.getNumElements(), - d_stencil, - stencil_idx.getNumElements()); unsigned int run_block_size = (block_size < max_block_size) ? 
block_size : max_block_size; dim3 grid(N / (block_size/threads_per_particle) + 1); @@ -489,11 +434,6 @@ inline void stencil_launcher(unsigned int *d_nlist, static unsigned int max_block_size = UINT_MAX; if (max_block_size == UINT_MAX) max_block_size = get_max_block_size_stencil(gpu_compute_nlist_stencil_kernel<3,cur_tpp>); - if (compute_capability < 35) gpu_nlist_stencil_bind_texture(d_cell_xyzf, - d_cell_tdb, - cli.getNumElements(), - d_stencil, - stencil_idx.getNumElements()); unsigned int run_block_size = (block_size < max_block_size) ? block_size : max_block_size; dim3 grid(N / (block_size/threads_per_particle) + 1); @@ -552,8 +492,7 @@ inline void stencil_launcher(unsigned int *d_nlist, filter_body, diameter_shift, threads_per_particle, - block_size, - compute_capability); + block_size); } } @@ -586,8 +525,7 @@ inline void stencil_launcher(unsigned int *d_nlist, bool filter_body, bool diameter_shift, const unsigned int threads_per_particle, - const unsigned int block_size, - const unsigned int compute_capability) + const unsigned int block_size) { } cudaError_t gpu_compute_nlist_stencil(unsigned int *d_nlist, @@ -617,8 +555,7 @@ cudaError_t gpu_compute_nlist_stencil(unsigned int *d_nlist, bool filter_body, bool diameter_shift, const unsigned int threads_per_particle, - const unsigned int block_size, - const unsigned int compute_capability) + const unsigned int block_size) { stencil_launcher(d_nlist, d_n_neigh, @@ -647,8 +584,7 @@ cudaError_t gpu_compute_nlist_stencil(unsigned int *d_nlist, filter_body, diameter_shift, threads_per_particle, - block_size, - compute_capability); + block_size); return cudaSuccess; } diff --git a/hoomd/md/NeighborListGPUStencil.cuh b/hoomd/md/NeighborListGPUStencil.cuh index 8344c3f20f..63b9387973 100644 --- a/hoomd/md/NeighborListGPUStencil.cuh +++ b/hoomd/md/NeighborListGPUStencil.cuh @@ -49,8 +49,7 @@ cudaError_t gpu_compute_nlist_stencil(unsigned int *d_nlist, bool filter_body, bool diameter_shift, const unsigned int threads_per_particle, - const unsigned int block_size, - const unsigned int compute_capability); + const unsigned int block_size); //! Kernel driver for filling the particle types for sorting cudaError_t gpu_compute_nlist_stencil_fill_types(unsigned int *d_pids, diff --git a/hoomd/md/NeighborListGPUTree.cc b/hoomd/md/NeighborListGPUTree.cc index 774b716a10..6c6175f369 100644 --- a/hoomd/md/NeighborListGPUTree.cc +++ b/hoomd/md/NeighborListGPUTree.cc @@ -809,7 +809,6 @@ void NeighborListGPUTree::traverseTree() m_pdata->getNTypes(), m_filter_body, m_diameter_shift, - m_exec_conf->getComputeCapability()/10, m_tuner_traverse->getParam()); if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR(); diff --git a/hoomd/md/NeighborListGPUTree.cu b/hoomd/md/NeighborListGPUTree.cu index 291f8620de..7c244f50ea 100644 --- a/hoomd/md/NeighborListGPUTree.cu +++ b/hoomd/md/NeighborListGPUTree.cu @@ -16,18 +16,6 @@ \brief Defines GPU kernel code for neighbor list tree traversal on the GPU */ -//! Texture for reading particle positions -scalar4_tex_t pdata_pos_tex; -//! Texture for reading leaf data -scalar4_tex_t leaf_xyzf_tex; -//! Texture for the diameter / body -scalar2_tex_t leaf_db_tex; -//! Texture for reading node upper and lower bounds -scalar4_tex_t aabb_node_bounds_tex; -//! Texture for the head list -texture head_list_tex; - - //!< Expands a 10-bit integer into 30 bits by inserting 2 zeros after each bit. /*! 
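* As a minimal illustrative sketch, the standard Morton-code bit expansion that this
* description corresponds to looks like the following (placeholder name
* expand_bits_example; the exact implementation in this file may differ):
* \code
* __device__ inline unsigned int expand_bits_example(unsigned int v)
*     {
*     v = (v * 0x00010001u) & 0xFF0000FFu;
*     v = (v * 0x00000101u) & 0x0F00F00Fu;
*     v = (v * 0x00000011u) & 0xC30C30C3u;
*     v = (v * 0x00000005u) & 0x49249249u;
*     return v;
*     }
* \endcode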
* \param v unsigned integer with 10 bits set @@ -1025,16 +1013,16 @@ __global__ void gpu_nlist_traverse_tree_kernel(unsigned int *d_nlist, if (my_pidx >= N) return; - const Scalar4 postype_i = texFetchScalar4(d_pos, pdata_pos_tex, my_pidx); + const Scalar4 postype_i = __ldg(d_pos + my_pidx); const Scalar3 pos_i = make_scalar3(postype_i.x, postype_i.y, postype_i.z); const unsigned int type_i = __scalar_as_int(postype_i.w); // fetch the diameter and body out of the leaf texture since it's bound anyway - const Scalar2 db_i = texFetchScalar2(d_leaf_db, leaf_db_tex, idx); + const Scalar2 db_i = __ldg(d_leaf_db + idx); const Scalar diam_i = db_i.x; const unsigned int body_i = __scalar_as_int(db_i.y); - const unsigned int nlist_head_i = texFetchUint(d_head_list, head_list_tex, my_pidx); + const unsigned int nlist_head_i = __ldg(d_head_list + my_pidx); unsigned int n_neigh_i = 0; for (unsigned int cur_pair_type=0; cur_pair_type < ntypes; ++cur_pair_type) @@ -1073,8 +1061,8 @@ __global__ void gpu_nlist_traverse_tree_kernel(unsigned int *d_nlist, int cur_node_idx = cur_tree_root; while (cur_node_idx > -1) { - const Scalar4 upper_rope = texFetchScalar4(d_tree_aabbs, aabb_node_bounds_tex, 2*cur_node_idx); - const Scalar4 lower_np = texFetchScalar4(d_tree_aabbs, aabb_node_bounds_tex, 2*cur_node_idx+1); + const Scalar4 upper_rope = __ldg(d_tree_aabbs + 2*cur_node_idx); + const Scalar4 lower_np = __ldg(d_tree_aabbs + 2*cur_node_idx+1); if (!(aabb_upper.x < lower_np.x || aabb_lower.x > upper_rope.x @@ -1094,11 +1082,11 @@ __global__ void gpu_nlist_traverse_tree_kernel(unsigned int *d_nlist, for (unsigned int cur_p = node_head; cur_p < node_head + n_part; ++cur_p) { // neighbor j - const Scalar4 cur_xyzf = texFetchScalar4(d_leaf_xyzf, leaf_xyzf_tex, cur_p); + const Scalar4 cur_xyzf = __ldg(d_leaf_xyzf + cur_p); const Scalar3 pos_j = make_scalar3(cur_xyzf.x, cur_xyzf.y, cur_xyzf.z); const unsigned int j = __scalar_as_int(cur_xyzf.w); - const Scalar2 cur_db = texFetchScalar2(d_leaf_db, leaf_db_tex, cur_p); + const Scalar2 cur_db = __ldg(d_leaf_db + cur_p); const Scalar diam_j = cur_db.x; const unsigned int body_j = __scalar_as_int(cur_db.y); @@ -1223,47 +1211,12 @@ cudaError_t gpu_nlist_traverse_tree(unsigned int *d_nlist, const unsigned int ntypes, bool filter_body, bool diameter_shift, - const unsigned int compute_capability, const unsigned int block_size) { // shared memory = r_list + Nmax Index2D typpair_idx(ntypes); unsigned int shared_size = sizeof(Scalar)*typpair_idx.getNumElements() + 2*sizeof(unsigned int)*ntypes; - // bind the neighborlist texture - if (compute_capability < 35) - { - pdata_pos_tex.normalized = false; - pdata_pos_tex.filterMode = cudaFilterModePoint; - cudaError_t error = cudaBindTexture(0, pdata_pos_tex, d_pos, sizeof(Scalar4)*(N+nghosts)); - if (error != cudaSuccess) - return error; - - leaf_xyzf_tex.normalized = false; - leaf_xyzf_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, leaf_xyzf_tex, d_leaf_xyzf, sizeof(Scalar4)*(N+nghosts)); - if (error != cudaSuccess) - return error; - - leaf_db_tex.normalized = false; - leaf_db_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, leaf_db_tex, d_leaf_db, sizeof(Scalar2)*(N+nghosts)); - if (error != cudaSuccess) - return error; - - aabb_node_bounds_tex.normalized = false; - aabb_node_bounds_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, aabb_node_bounds_tex, d_tree_aabbs, sizeof(Scalar4)*2*nnodes); - if (error != cudaSuccess) - return error; - - head_list_tex.normalized = false; - 
head_list_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, head_list_tex, d_head_list, sizeof(unsigned int)*N); - if (error != cudaSuccess) - return error; - } - if (!filter_body && !diameter_shift) { static unsigned int max_block_size = UINT_MAX; @@ -1405,30 +1358,6 @@ cudaError_t gpu_nlist_traverse_tree(unsigned int *d_nlist, ntypes); } - // unbind the textures - if (compute_capability < 35) - { - cudaError_t error = cudaUnbindTexture(pdata_pos_tex); - if (error != cudaSuccess) - return error; - - error = cudaUnbindTexture(leaf_xyzf_tex); - if (error != cudaSuccess) - return error; - - error = cudaUnbindTexture(leaf_db_tex); - if (error != cudaSuccess) - return error; - - error = cudaUnbindTexture(aabb_node_bounds_tex); - if (error != cudaSuccess) - return error; - - error = cudaUnbindTexture(head_list_tex); - if (error != cudaSuccess) - return error; - } - return cudaSuccess; } diff --git a/hoomd/md/NeighborListGPUTree.cuh b/hoomd/md/NeighborListGPUTree.cuh index 0b2ed903cf..b4c4b011ca 100644 --- a/hoomd/md/NeighborListGPUTree.cuh +++ b/hoomd/md/NeighborListGPUTree.cuh @@ -120,7 +120,6 @@ cudaError_t gpu_nlist_traverse_tree(unsigned int *d_nlist, const unsigned int ntypes, bool filter_body, bool diameter_shift, - const unsigned int compute_capability, const unsigned int block_size); //! Kernel driver to initialize counting for types and nodes diff --git a/hoomd/md/OPLSDihedralForceComputeGPU.cc b/hoomd/md/OPLSDihedralForceComputeGPU.cc index 8fe6994653..33f2668cd8 100644 --- a/hoomd/md/OPLSDihedralForceComputeGPU.cc +++ b/hoomd/md/OPLSDihedralForceComputeGPU.cc @@ -69,8 +69,7 @@ void OPLSDihedralForceComputeGPU::computeForces(unsigned int timestep) d_n_dihedrals.data, d_params.data, m_dihedral_data->getNTypes(), - this->m_tuner->getParam(), - m_exec_conf->getComputeCapability()); + this->m_tuner->getParam()); if(m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR(); this->m_tuner->end(); diff --git a/hoomd/md/OPLSDihedralForceGPU.cu b/hoomd/md/OPLSDihedralForceGPU.cu index c6fc68ec7d..307d47422a 100644 --- a/hoomd/md/OPLSDihedralForceGPU.cu +++ b/hoomd/md/OPLSDihedralForceGPU.cu @@ -13,9 +13,6 @@ \brief Defines GPU kernel code for calculating OPLS dihedral forces. Used by OPLSDihedralForceComputeGPU. */ -//! Texture for reading dihedral parameters -scalar4_tex_t dihedral_params_tex; - //! Kernel for calculating OPLS dihedral forces on the GPU /*! 
\param d_force Device memory to write computed forces \param d_virial Device memory to write computed virials @@ -159,7 +156,7 @@ void gpu_compute_opls_dihedral_forces_kernel(Scalar4* d_force, // get values for k1/2 through k4/2 (MEM TRANSFER: 16 bytes) // ----- The 1/2 factor is already stored in the parameters -------- - Scalar4 params = texFetchScalar4(d_params, dihedral_params_tex, cur_dihedral_type); + Scalar4 params = __ldg(d_params + cur_dihedral_type); Scalar k1 = params.x; Scalar k2 = params.y; Scalar k3 = params.z; @@ -321,8 +318,7 @@ cudaError_t gpu_compute_opls_dihedral_forces(Scalar4* d_force, const unsigned int *n_dihedrals_list, const Scalar4 *d_params, const unsigned int n_dihedral_types, - const int block_size, - const unsigned int compute_capability) + const int block_size) { assert(d_params); @@ -340,14 +336,6 @@ cudaError_t gpu_compute_opls_dihedral_forces(Scalar4* d_force, dim3 grid( N / run_block_size + 1, 1, 1); dim3 threads(run_block_size, 1, 1); - // bind the texture on pre sm35 devices - if (compute_capability < 350) - { - cudaError_t error = cudaBindTexture(0, dihedral_params_tex, d_params, sizeof(Scalar4) * n_dihedral_types); - if (error != cudaSuccess) - return error; - } - // run the kernel gpu_compute_opls_dihedral_forces_kernel<<< grid, threads>>>(d_force, d_virial, virial_pitch, N, d_pos, d_params, box, tlist, dihedral_ABCD, pitch, n_dihedrals_list); diff --git a/hoomd/md/OPLSDihedralForceGPU.cuh b/hoomd/md/OPLSDihedralForceGPU.cuh index bca16470b6..b466768019 100644 --- a/hoomd/md/OPLSDihedralForceGPU.cuh +++ b/hoomd/md/OPLSDihedralForceGPU.cuh @@ -28,7 +28,6 @@ cudaError_t gpu_compute_opls_dihedral_forces(Scalar4* d_force, const unsigned int *n_dihedrals_list, const Scalar4 *d_params, const unsigned int n_dihedral_types, - const int block_size, - const unsigned int compute_capability); + const int block_size); #endif diff --git a/hoomd/md/PPPMForceComputeGPU.cc b/hoomd/md/PPPMForceComputeGPU.cc index df6badcc2b..33b2b7922f 100644 --- a/hoomd/md/PPPMForceComputeGPU.cc +++ b/hoomd/md/PPPMForceComputeGPU.cc @@ -186,7 +186,7 @@ void PPPMForceComputeGPU::initializeFFT() cudaMemPrefetchAsync(m_fourier_mesh_G_z.get(), m_n_inner_cells*sizeof(cufftComplex), gpu_map[0]); cudaMemAdvise(m_fourier_mesh_G_z.get(), m_n_inner_cells*sizeof(cufftComplex), cudaMemAdviseSetPreferredLocation, gpu_map[0]); cudaMemPrefetchAsync(m_fourier_mesh_G_y.get(), m_n_inner_cells*sizeof(cufftComplex), gpu_map[0]); - } + } unsigned int n_blocks = (m_mesh_points.x*m_mesh_points.y*m_mesh_points.z)/m_block_size+1; GlobalArray sum_partial(n_blocks,m_exec_conf); @@ -686,8 +686,7 @@ void PPPMForceComputeGPU::fixExclusions() m_alpha, d_index_array.data, group_size, - m_block_size, - m_exec_conf->getComputeCapability()); + m_block_size); if(m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR(); diff --git a/hoomd/md/PPPMForceComputeGPU.cu b/hoomd/md/PPPMForceComputeGPU.cu index 06f6e22dfc..482383e6c2 100644 --- a/hoomd/md/PPPMForceComputeGPU.cu +++ b/hoomd/md/PPPMForceComputeGPU.cu @@ -16,23 +16,6 @@ //! The developer has chosen not to document this variable __device__ __constant__ Scalar GPU_rho_coeff[CONSTANT_SIZE]; -//! 
Implements workaround atomic float addition on sm_1x hardware -__device__ inline void atomicFloatAdd(float* address, float value) - { -#if (__CUDA_ARCH__ < 200) - float old = value; - float new_old; - do - { - new_old = atomicExch(address, 0.0f); - new_old += old; - } - while ((old = atomicExch(address, new_old))!=0.0f); -#else - atomicAdd(address, value); -#endif - } - //! GPU implementation of sinc(x)==sin(x)/x __device__ Scalar gpu_sinc(Scalar x) { @@ -257,7 +240,7 @@ __global__ void gpu_assign_particles_kernel(const uint3 mesh_dim, // compute fraction of particle density assigned to cell // from particles in this bin - atomicFloatAdd(&d_mesh[cell_idx].x, z0*result); + atomicAdd(&d_mesh[cell_idx].x, z0*result); } ignore_z = false; @@ -333,7 +316,7 @@ void gpu_assign_particles(const uint3 mesh_dim, if (ngpu > 1) { - // zero the temporary mesh array + // zero the temporary mesh array cudaMemsetAsync(d_mesh_scratch + idev*mesh_elements, 0, sizeof(cufftComplex)*mesh_elements); } @@ -369,7 +352,7 @@ void gpu_reduce_meshes(const unsigned int mesh_elements, d_mesh, ngpu); } - + __global__ void gpu_compute_mesh_virial_kernel(const unsigned int n_wave_vectors, cufftComplex *d_fourier_mesh, Scalar *d_inf_f, @@ -1230,13 +1213,6 @@ void gpu_compute_influence_function(const uint3 mesh_dim, #endif } -//! Texture for reading particle positions -scalar4_tex_t pdata_pos_tex; - -//! Texture for reading charge parameters -scalar_tex_t pdata_charge_tex; - - //! The developer has chosen not to document this function __global__ void gpu_fix_exclusions_kernel(Scalar4 *d_force, Scalar *d_virial, @@ -1259,10 +1235,10 @@ __global__ void gpu_fix_exclusions_kernel(Scalar4 *d_force, unsigned int idx = d_group_members[group_idx]; const Scalar sqrtpi = sqrtf(M_PI); unsigned int n_neigh = d_n_neigh[idx]; - Scalar4 postypei = texFetchScalar4(d_pos, pdata_pos_tex, idx); + Scalar4 postypei = __ldg(d_pos + idx); Scalar3 posi = make_scalar3(postypei.x, postypei.y, postypei.z); - Scalar qi = texFetchScalar(d_charge, pdata_charge_tex, idx); + Scalar qi = __ldg(d_charge + idx); // initialize the force to 0 Scalar4 force = make_scalar4(Scalar(0.0), Scalar(0.0), Scalar(0.0), Scalar(0.0)); Scalar virial[6]; @@ -1282,10 +1258,10 @@ __global__ void gpu_fix_exclusions_kernel(Scalar4 *d_force, next_j = d_nlist[nli(idx, neigh_idx+1)]; // get the neighbor's position (MEM TRANSFER: 16 bytes) - Scalar4 postypej = texFetchScalar4(d_pos, pdata_pos_tex, cur_j); + Scalar4 postypej = __ldg(d_pos + cur_j); Scalar3 posj = make_scalar3(postypej.x, postypej.y, postypej.z); - Scalar qj = texFetchScalar(d_charge, pdata_charge_tex, cur_j); + Scalar qj = __ldg(d_charge + cur_j); // calculate dr (with periodic boundary conditions) (FLOPS: 3) Scalar3 dx = posi - posj; @@ -1348,25 +1324,12 @@ cudaError_t gpu_fix_exclusions(Scalar4 *d_force, Scalar alpha, unsigned int *d_group_members, unsigned int group_size, - int block_size, - const unsigned int compute_capability) + int block_size) { dim3 grid( group_size / block_size + 1, 1, 1); dim3 threads(block_size, 1, 1); - // bind the textures on pre sm35 arches - if (compute_capability < 350) - { - cudaError_t error = cudaBindTexture(0, pdata_pos_tex, d_pos, sizeof(Scalar4)*Nmax); - if (error != cudaSuccess) - return error; - - error = cudaBindTexture(0, pdata_charge_tex, d_charge, sizeof(Scalar) * Nmax); - if (error != cudaSuccess) - return error; - } - - gpu_fix_exclusions_kernel <<< grid, threads >>> (d_force, + gpu_fix_exclusions_kernel <<< grid, threads >>> (d_force, d_virial, virial_pitch, d_pos, diff 
--git a/hoomd/md/PPPMForceComputeGPU.cuh b/hoomd/md/PPPMForceComputeGPU.cuh index e67f0c8fe0..79e5332931 100644 --- a/hoomd/md/PPPMForceComputeGPU.cuh +++ b/hoomd/md/PPPMForceComputeGPU.cuh @@ -30,7 +30,7 @@ void gpu_reduce_meshes(const unsigned int mesh_elements, cufftComplex *d_mesh, const unsigned int ngpu, const unsigned int block_size); - + void gpu_compute_mesh_virial(const unsigned int n_wave_vectors, cufftComplex *d_fourier_mesh, Scalar *d_inf_f, @@ -111,8 +111,7 @@ cudaError_t gpu_fix_exclusions(Scalar4 *d_force, Scalar alpha, unsigned int *d_group_members, unsigned int group_size, - int block_size, - const unsigned int compute_capability); + int block_size); void gpu_initialize_coeff( Scalar *CPU_rho_coeff, diff --git a/hoomd/md/PotentialBondGPU.cuh b/hoomd/md/PotentialBondGPU.cuh index 4871b43222..4b154e1e37 100644 --- a/hoomd/md/PotentialBondGPU.cuh +++ b/hoomd/md/PotentialBondGPU.cuh @@ -37,8 +37,7 @@ struct bond_args_t const Index2D & _gpu_table_indexer, const unsigned int *_d_gpu_n_bonds, const unsigned int _n_bond_types, - const unsigned int _block_size, - const unsigned int _compute_capability) + const unsigned int _block_size) : d_force(_d_force), d_virial(_d_virial), virial_pitch(_virial_pitch), @@ -52,8 +51,7 @@ struct bond_args_t gpu_table_indexer(_gpu_table_indexer), d_gpu_n_bonds(_d_gpu_n_bonds), n_bond_types(_n_bond_types), - block_size(_block_size), - compute_capability(_compute_capability) + block_size(_block_size) { }; @@ -71,18 +69,9 @@ struct bond_args_t const unsigned int *d_gpu_n_bonds; //!< List of number of bonds stored on the GPU const unsigned int n_bond_types; //!< Number of bond types in the simulation const unsigned int block_size; //!< Block size to execute - const unsigned int compute_capability; //!< Compute capability of the device }; #ifdef NVCC -//! Texture for reading particle positions -scalar4_tex_t pdata_pos_tex; - -//! Texture for reading particle diameters -scalar_tex_t pdata_diam_tex; - -//! Texture for reading particle charges -scalar_tex_t pdata_charge_tex; //! Kernel for calculating bond forces /*! This kernel is called to calculate the bond forces on all N particles. Actual evaluation of the potentials and @@ -150,14 +139,14 @@ __global__ void gpu_compute_bond_forces_kernel(Scalar4 *d_force, int n_bonds =n_bonds_list[idx]; // read in the position of our particle. 
(MEM TRANSFER: 16 bytes) - Scalar4 postype = texFetchScalar4(d_pos, pdata_pos_tex, idx); + Scalar4 postype = __ldg(d_pos + idx); Scalar3 pos = make_scalar3(postype.x, postype.y, postype.z); // read in the diameter of our particle if needed Scalar diam(0); if (evaluator::needsDiameter()) { - diam = texFetchScalar(d_diameter, pdata_diam_tex, idx); + diam = __ldg(d_diameter + idx); } else diam += 0; // shut up compiler warning @@ -165,7 +154,7 @@ __global__ void gpu_compute_bond_forces_kernel(Scalar4 *d_force, Scalar q(0); if (evaluator::needsCharge()) { - q = texFetchScalar(d_charge, pdata_charge_tex, idx); + q = __ldg(d_charge + idx); } else q += 0; // shut up compiler warning @@ -186,7 +175,7 @@ __global__ void gpu_compute_bond_forces_kernel(Scalar4 *d_force, int cur_bond_type = cur_bond.idx[1]; // get the bonded particle's position (MEM_TRANSFER: 16 bytes) - Scalar4 neigh_postypej = texFetchScalar4(d_pos, pdata_pos_tex, cur_bond_idx); + Scalar4 neigh_postypej = __ldg(d_pos + cur_bond_idx); Scalar3 neigh_pos= make_scalar3(neigh_postypej.x, neigh_postypej.y, neigh_postypej.z); // calculate dr (FLOPS: 3) @@ -209,12 +198,12 @@ __global__ void gpu_compute_bond_forces_kernel(Scalar4 *d_force, // get the bonded particle's diameter if needed if (evaluator::needsDiameter()) { - Scalar neigh_diam = texFetchScalar(d_diameter, pdata_diam_tex, cur_bond_idx); + Scalar neigh_diam = __ldg(d_diameter + cur_bond_idx); eval.setDiameter(diam, neigh_diam); } if (evaluator::needsCharge()) { - Scalar neigh_q = texFetchScalar(d_charge, pdata_charge_tex, cur_bond_idx); + Scalar neigh_q = __ldg(d_charge + cur_bond_idx); eval.setCharge(q, neigh_q); } @@ -286,29 +275,6 @@ cudaError_t gpu_compute_bond_forces(const bond_args_t& bond_args, dim3 grid( bond_args.N / run_block_size + 1, 1, 1); dim3 threads(run_block_size, 1, 1); - // bind the position texture on pre sm35 devices - if (bond_args.compute_capability < 350) - { - pdata_pos_tex.normalized = false; - pdata_pos_tex.filterMode = cudaFilterModePoint; - cudaError_t error = cudaBindTexture(0, pdata_pos_tex, bond_args.d_pos, sizeof(Scalar4)*(bond_args.n_max)); - if (error != cudaSuccess) - return error; - - // bind the diameter texture - pdata_diam_tex.normalized = false; - pdata_diam_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, pdata_diam_tex, bond_args.d_diameter, sizeof(Scalar) *(bond_args.n_max)); - if (error != cudaSuccess) - return error; - - pdata_charge_tex.normalized = false; - pdata_charge_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, pdata_charge_tex, bond_args.d_charge, sizeof(Scalar) * (bond_args.n_max)); - if (error != cudaSuccess) - return error; - } - unsigned int shared_bytes = sizeof(typename evaluator::param_type) * bond_args.n_bond_types; diff --git a/hoomd/md/PotentialBondGPU.h b/hoomd/md/PotentialBondGPU.h index b1258171e2..06e0a7dac3 100644 --- a/hoomd/md/PotentialBondGPU.h +++ b/hoomd/md/PotentialBondGPU.h @@ -139,8 +139,7 @@ void PotentialBondGPU< evaluator, gpu_cgbf >::computeForces(unsigned int timeste gpu_table_indexer, d_gpu_n_bonds.data, this->m_bond_data->getNTypes(), - this->m_tuner->getParam(), - this->m_exec_conf->getComputeCapability()), + this->m_tuner->getParam()), d_params.data, d_flags.data); } diff --git a/hoomd/md/PotentialPairDPDThermoGPU.cuh b/hoomd/md/PotentialPairDPDThermoGPU.cuh index 2b4e6f304f..2818607b3e 100644 --- a/hoomd/md/PotentialPairDPDThermoGPU.cuh +++ b/hoomd/md/PotentialPairDPDThermoGPU.cuh @@ -49,9 +49,7 @@ struct dpd_pair_args_t const Scalar _T, const unsigned int 
_shift_mode, const unsigned int _compute_virial, - const unsigned int _threads_per_particle, - const unsigned int _compute_capability, - const unsigned int _max_tex1d_width) + const unsigned int _threads_per_particle) : d_force(_d_force), d_virial(_d_virial), virial_pitch(_virial_pitch), @@ -74,9 +72,7 @@ struct dpd_pair_args_t T(_T), shift_mode(_shift_mode), compute_virial(_compute_virial), - threads_per_particle(_threads_per_particle), - compute_capability(_compute_capability), - max_tex1d_width(_max_tex1d_width) + threads_per_particle(_threads_per_particle) { }; @@ -103,22 +99,9 @@ struct dpd_pair_args_t const unsigned int shift_mode; //!< The potential energy shift mode const unsigned int compute_virial; //!< Flag to indicate if virials should be computed const unsigned int threads_per_particle; //!< Number of threads per particle (maximum: 32==1 warp) - const unsigned int compute_capability; //!< Compute capability of the device (20, 30, 35, ...) - const unsigned int max_tex1d_width; //!< Maximum width of a 1d linear texture }; #ifdef NVCC -//! Texture for reading particle positions -scalar4_tex_t pdata_dpd_pos_tex; - -//! Texture for reading particle velocities -scalar4_tex_t pdata_dpd_vel_tex; - -//! Texture for reading particle tags -texture pdata_dpd_tag_tex; - -//! Texture for reading neighbor list -texture nlist_tex; //! Kernel for calculating pair forces /*! This kernel is called to calculate the pair forces on all N particles. Actual evaluation of the potentials and @@ -153,8 +136,6 @@ texture nlist_tex; \tparam evaluator EvaluatorPair class to evaluate V(r) and -delta V(r)/r \tparam shift_mode 0: No energy shifting is done. 1: V(r) is shifted to be 0 at rcut. \tparam compute_virial When non-zero, the virial tensor is computed. When zero, the virial tensor is not computed. - \tparam use_gmem_nlist When non-zero, the neighbor list is read out of global memory. When zero, textures or __ldg - is used depending on architecture. Implementation details Each block will calculate the forces on a block of particles. @@ -222,26 +203,19 @@ __global__ void gpu_compute_dpd_forces_kernel(Scalar4 *d_force, // read in the position of our particle. // (MEM TRANSFER: 16 bytes) - Scalar4 postypei = texFetchScalar4(d_pos, pdata_dpd_pos_tex, idx); + Scalar4 postypei = __ldg(d_pos + idx); Scalar3 posi = make_scalar3(postypei.x, postypei.y, postypei.z); // read in the velocity of our particle. // (MEM TRANSFER: 16 bytes) - Scalar4 velmassi = texFetchScalar4(d_vel, pdata_dpd_vel_tex, idx); + Scalar4 velmassi = __ldg(d_vel + idx); Scalar3 veli = make_scalar3(velmassi.x, velmassi.y, velmassi.z); // prefetch neighbor index const unsigned int head_idx = d_head_list[idx]; unsigned int cur_j = 0; unsigned int next_j(0); - if (use_gmem_nlist) - { - next_j = (threadIdx.x%tpp < n_neigh) ? d_nlist[head_idx + threadIdx.x%tpp] : 0; - } - else - { - next_j = (threadIdx.x%tpp < n_neigh) ? texFetchUint(d_nlist, nlist_tex, head_idx + threadIdx.x%tpp) : 0; - } + next_j = (threadIdx.x%tpp < n_neigh) ? 
__ldg(d_nlist + head_idx + threadIdx.x%tpp) : 0; // this particle's tag unsigned int tagi = d_tag[idx]; @@ -255,22 +229,15 @@ __global__ void gpu_compute_dpd_forces_kernel(Scalar4 *d_force, cur_j = next_j; if (neigh_idx+tpp < n_neigh) { - if (use_gmem_nlist) - { - next_j = d_nlist[head_idx + neigh_idx + tpp]; - } - else - { - next_j = texFetchUint(d_nlist, nlist_tex, head_idx + neigh_idx + tpp); - } + next_j = __ldg(d_nlist + head_idx + neigh_idx + tpp); } // get the neighbor's position (MEM TRANSFER: 16 bytes) - Scalar4 postypej = texFetchScalar4(d_pos, pdata_dpd_pos_tex, cur_j); + Scalar4 postypej = __ldg(d_pos + cur_j); Scalar3 posj = make_scalar3(postypej.x, postypej.y, postypej.z); // get the neighbor's position (MEM TRANSFER: 16 bytes) - Scalar4 velmassj = texFetchScalar4(d_vel, pdata_dpd_vel_tex, cur_j); + Scalar4 velmassj = __ldg(d_vel + cur_j); Scalar3 velj = make_scalar3(velmassj.x, velmassj.y, velmassj.z); // calculate dr (with periodic boundary conditions) (FLOPS: 3) @@ -308,7 +275,7 @@ __global__ void gpu_compute_dpd_forces_kernel(Scalar4 *d_force, // Special Potential Pair DPD Requirements // use particle i's and j's tags - unsigned int tagj = texFetchUint(d_tag, pdata_dpd_tag_tex, cur_j); + unsigned int tagj = __ldg(d_tag + cur_j); eval.set_seed_ij_timestep(d_seed,tagi,tagj,d_timestep); eval.setDeltaT(d_deltaT); eval.setRDotV(rdotv); @@ -374,30 +341,6 @@ int dpd_get_max_block_size(T func) return max_threads; } -inline void gpu_dpd_pair_force_bind_textures(const dpd_pair_args_t pair_args) - { - // bind the position texture - pdata_dpd_pos_tex.normalized = false; - pdata_dpd_pos_tex.filterMode = cudaFilterModePoint; - cudaBindTexture(0, pdata_dpd_pos_tex, pair_args.d_pos, sizeof(Scalar4)*pair_args.n_max); - - // bind the diameter texture - pdata_dpd_vel_tex.normalized = false; - pdata_dpd_vel_tex.filterMode = cudaFilterModePoint; - cudaBindTexture(0, pdata_dpd_vel_tex, pair_args.d_vel, sizeof(Scalar4) * pair_args.n_max); - - pdata_dpd_tag_tex.normalized = false; - pdata_dpd_tag_tex.filterMode = cudaFilterModePoint; - cudaBindTexture(0, pdata_dpd_tag_tex, pair_args.d_tag, sizeof(unsigned int) * pair_args.n_max); - - if (pair_args.size_nlist <= pair_args.max_tex1d_width) - { - nlist_tex.normalized = false; - nlist_tex.filterMode = cudaFilterModePoint; - cudaBindTexture(0, nlist_tex, pair_args.d_nlist, sizeof(unsigned int) * pair_args.size_nlist); - } - } - //! DPD force compute kernel launcher /*! * \tparam evaluator EvaluatorPair class to evualuate V(r) and -delta V(r)/r @@ -434,8 +377,6 @@ struct DPDForceComputeKernel if (max_block_size == UINT_MAX) max_block_size = dpd_get_max_block_size(gpu_compute_dpd_forces_kernel); - if (args.compute_capability < 35) gpu_dpd_pair_force_bind_textures(args); - block_size = block_size < max_block_size ? 
block_size : max_block_size; dim3 grid(args.N / (block_size/tpp) + 1, 1, 1); @@ -492,82 +433,40 @@ cudaError_t gpu_compute_dpd_forces(const dpd_pair_args_t& args, assert(args.ntypes > 0); // run the kernel - if (args.compute_capability < 35 && args.size_nlist > args.max_tex1d_width) + if (args.compute_virial) { - if (args.compute_virial) + switch (args.shift_mode) { - switch (args.shift_mode) + case 0: { - case 0: - { - DPDForceComputeKernel::launch(args, d_params); - break; - } - case 1: - { - DPDForceComputeKernel::launch(args, d_params); - break; - } - default: - return cudaErrorUnknown; + DPDForceComputeKernel::launch(args, d_params); + break; } - } - else - { - switch (args.shift_mode) + case 1: { - case 0: - { - DPDForceComputeKernel::launch(args, d_params); - break; - } - case 1: - { - DPDForceComputeKernel::launch(args, d_params); - break; - } - default: - return cudaErrorUnknown; + DPDForceComputeKernel::launch(args, d_params); + break; } + default: + return cudaErrorUnknown; } } else { - if (args.compute_virial) + switch (args.shift_mode) { - switch (args.shift_mode) + case 0: { - case 0: - { - DPDForceComputeKernel::launch(args, d_params); - break; - } - case 1: - { - DPDForceComputeKernel::launch(args, d_params); - break; - } - default: - return cudaErrorUnknown; + DPDForceComputeKernel::launch(args, d_params); + break; } - } - else - { - switch (args.shift_mode) + case 1: { - case 0: - { - DPDForceComputeKernel::launch(args, d_params); - break; - } - case 1: - { - DPDForceComputeKernel::launch(args, d_params); - break; - } - default: - return cudaErrorUnknown; + DPDForceComputeKernel::launch(args, d_params); + break; } + default: + return cudaErrorUnknown; } } diff --git a/hoomd/md/PotentialPairDPDThermoGPU.h b/hoomd/md/PotentialPairDPDThermoGPU.h index dcba7d7d70..fab74fed83 100644 --- a/hoomd/md/PotentialPairDPDThermoGPU.h +++ b/hoomd/md/PotentialPairDPDThermoGPU.h @@ -179,9 +179,7 @@ void PotentialPairDPDThermoGPU< evaluator, gpu_cpdf >::computeForces(unsigned in this->m_T->getValue(timestep), this->m_shift_mode, flags[pdata_flag::pressure_tensor] || flags[pdata_flag::isotropic_virial], - threads_per_particle, - this->m_exec_conf->getComputeCapability()/10, - this->m_exec_conf->dev_prop.maxTexture1DLinear), + threads_per_particle), d_params.data); if (this->m_exec_conf->isCUDAErrorCheckingEnabled()) diff --git a/hoomd/md/PotentialPairGPU.cuh b/hoomd/md/PotentialPairGPU.cuh index ab9c1c2e45..357c9b127d 100644 --- a/hoomd/md/PotentialPairGPU.cuh +++ b/hoomd/md/PotentialPairGPU.cuh @@ -53,8 +53,6 @@ struct pair_args_t const unsigned int _shift_mode, const unsigned int _compute_virial, const unsigned int _threads_per_particle, - const unsigned int _compute_capability, - const unsigned int _max_tex1d_width, const GPUPartition& _gpu_partition) : d_force(_d_force), d_virial(_d_virial), @@ -76,8 +74,6 @@ struct pair_args_t shift_mode(_shift_mode), compute_virial(_compute_virial), threads_per_particle(_threads_per_particle), - compute_capability(_compute_capability), - max_tex1d_width(_max_tex1d_width), gpu_partition(_gpu_partition) { }; @@ -102,25 +98,10 @@ struct pair_args_t const unsigned int shift_mode; //!< The potential energy shift mode const unsigned int compute_virial; //!< Flag to indicate if virials should be computed const unsigned int threads_per_particle; //!< Number of threads per particle (maximum: 1 warp) - const unsigned int compute_capability; //!< Compute capability (20 30 35, ...) 
- const unsigned int max_tex1d_width; //!< Maximum width of a linear 1D texture const GPUPartition& gpu_partition; //!< The load balancing partition of particles between GPUs }; #ifdef NVCC -//! Texture for reading particle positions -scalar4_tex_t pdata_pos_tex; - -//! Texture for reading particle diameters -scalar_tex_t pdata_diam_tex; - -//! Texture for reading particle charges -scalar_tex_t pdata_charge_tex; - -// there is some naming conflict between the DPD pair force and PotentialPair because -// the DPD does not extend PotentialPair, and so we need to choose a different name for this texture -//! Texture for reading neighbor list -texture pair_nlist_tex; //! Kernel for calculating pair forces /*! This kernel is called to calculate the pair forces on all N particles. Actual evaluation of the potentials and @@ -153,8 +134,6 @@ texture pair_nlist_tex; \tparam shift_mode 0: No energy shifting is done. 1: V(r) is shifted to be 0 at rcut. 2: XPLOR switching is enabled (See PotentialPair for a discussion on what that entails) \tparam compute_virial When non-zero, the virial tensor is computed. When zero, the virial tensor is not computed. - \tparam use_gmem_nlist When non-zero, the neighbor list is read out of global memory. When zero, textures or __ldg - is used depending on architecture. \tparam tpp Number of threads to use per particle, must be power of 2 and smaller than warp size Implementation details @@ -162,7 +141,7 @@ texture pair_nlist_tex; Each group of \a tpp threads will calculate the total force on one particle. The neighborlist is arranged in columns so that reads are fully coalesced when doing this. */ -template< class evaluator, unsigned int shift_mode, unsigned int compute_virial, unsigned int use_gmem_nlist, int tpp> +template< class evaluator, unsigned int shift_mode, unsigned int compute_virial, int tpp> __global__ void gpu_compute_pair_forces_shared_kernel(Scalar4 *d_force, Scalar *d_virial, const unsigned int virial_pitch, @@ -231,17 +210,17 @@ __global__ void gpu_compute_pair_forces_shared_kernel(Scalar4 *d_force, // read in the position of our particle. // (MEM TRANSFER: 16 bytes) - Scalar4 postypei = texFetchScalar4(d_pos, pdata_pos_tex, idx); + Scalar4 postypei = __ldg(d_pos + idx); Scalar3 posi = make_scalar3(postypei.x, postypei.y, postypei.z); Scalar di; if (evaluator::needsDiameter()) - di = texFetchScalar(d_diameter, pdata_diam_tex, idx); + di = __ldg(d_diameter + idx); else di += Scalar(1.0); // shut up compiler warning Scalar qi; if (evaluator::needsCharge()) - qi = texFetchScalar(d_charge, pdata_charge_tex, idx); + qi = __ldg(d_charge + idx); else qi += Scalar(1.0); // shut up compiler warning @@ -249,14 +228,7 @@ __global__ void gpu_compute_pair_forces_shared_kernel(Scalar4 *d_force, unsigned int cur_j = 0; unsigned int next_j(0); - if (use_gmem_nlist) - { - next_j = (threadIdx.x%tpp < n_neigh) ? d_nlist[my_head + threadIdx.x%tpp] : 0; - } - else - { - next_j = threadIdx.x%tpp < n_neigh ? texFetchUint(d_nlist, pair_nlist_tex, my_head + threadIdx.x%tpp) : 0; - } + next_j = threadIdx.x%tpp < n_neigh ? 
__ldg(d_nlist + my_head + threadIdx.x%tpp) : 0; // loop over neighbors for (int neigh_idx = threadIdx.x%tpp; neigh_idx < n_neigh; neigh_idx+=tpp) @@ -266,28 +238,21 @@ __global__ void gpu_compute_pair_forces_shared_kernel(Scalar4 *d_force, cur_j = next_j; if (neigh_idx+tpp < n_neigh) { - if (use_gmem_nlist) - { - next_j = d_nlist[my_head + neigh_idx + tpp]; - } - else - { - next_j = texFetchUint(d_nlist, pair_nlist_tex, my_head + neigh_idx+tpp); - } + next_j = __ldg(d_nlist + my_head + neigh_idx+tpp); } // get the neighbor's position (MEM TRANSFER: 16 bytes) - Scalar4 postypej = texFetchScalar4(d_pos, pdata_pos_tex, cur_j); + Scalar4 postypej = __ldg(d_pos + cur_j); Scalar3 posj = make_scalar3(postypej.x, postypej.y, postypej.z); Scalar dj = Scalar(0.0); if (evaluator::needsDiameter()) - dj = texFetchScalar(d_diameter, pdata_diam_tex, cur_j); + dj = __ldg(d_diameter + cur_j); else dj += Scalar(1.0); // shut up compiler warning Scalar qj = Scalar(0.0); if (evaluator::needsCharge()) - qj = texFetchScalar(d_charge, pdata_charge_tex, cur_j); + qj = __ldg(d_charge + cur_j); else qj += Scalar(1.0); // shut up compiler warning @@ -423,57 +388,18 @@ int get_max_block_size(T func) return max_threads; } -inline void gpu_pair_force_bind_textures(const pair_args_t pair_args) - { - // bind the position texture - pdata_pos_tex.normalized = false; - pdata_pos_tex.filterMode = cudaFilterModePoint; - cudaBindTexture(0, pdata_pos_tex, pair_args.d_pos, sizeof(Scalar4)*pair_args.n_max); - - // bind the diameter texture - pdata_diam_tex.normalized = false; - pdata_diam_tex.filterMode = cudaFilterModePoint; - cudaBindTexture(0, pdata_diam_tex, pair_args.d_diameter, sizeof(Scalar) * pair_args.n_max); - - pdata_charge_tex.normalized = false; - pdata_charge_tex.filterMode = cudaFilterModePoint; - cudaBindTexture(0, pdata_charge_tex, pair_args.d_charge, sizeof(Scalar) * pair_args.n_max); - - // bind the neighborlist texture if it will fit - if (pair_args.size_neigh_list <= pair_args.max_tex1d_width) - { - pair_nlist_tex.normalized = false; - pair_nlist_tex.filterMode = cudaFilterModePoint; - cudaBindTexture(0, pair_nlist_tex, pair_args.d_nlist, sizeof(unsigned int) * pair_args.size_neigh_list); - } - } - -inline void gpu_pair_force_unbind_textures(const pair_args_t pair_args) - { - cudaUnbindTexture(pdata_pos_tex); - cudaUnbindTexture(pdata_diam_tex); - cudaUnbindTexture(pdata_charge_tex); - - if (pair_args.size_neigh_list <= pair_args.max_tex1d_width) - { - cudaUnbindTexture(pair_nlist_tex); - } - } - //! Pair force compute kernel launcher /*! * \tparam evaluator EvaluatorPair class to evaluate V(r) and -delta V(r)/r * \tparam shift_mode 0: No energy shifting is done. 1: V(r) is shifted to be 0 at rcut. 2: XPLOR switching is enabled * (See PotentialPair for a discussion on what that entails) * \tparam compute_virial When non-zero, the virial tensor is computed. When zero, the virial tensor is not computed. - * \tparam use_gmem_nlist When non-zero, the neighbor list is read out of global memory. When zero, textures or __ldg - * is used depending on architecture. * \tparam tpp Number of threads to use per particle, must be power of 2 and smaller than warp size * * Partial function template specialization is not allowed in C++, so instead we have to wrap this with a struct that * we are allowed to partially specialize. */ -template +template struct PairForceComputeKernel { //! 
Launcher for the pair force kernel @@ -501,31 +427,27 @@ struct PairForceComputeKernel static unsigned int max_block_size = UINT_MAX; if (max_block_size == UINT_MAX) - max_block_size = get_max_block_size(gpu_compute_pair_forces_shared_kernel); - - if (pair_args.compute_capability < 35) gpu_pair_force_bind_textures(pair_args); + max_block_size = get_max_block_size(gpu_compute_pair_forces_shared_kernel); block_size = block_size < max_block_size ? block_size : max_block_size; dim3 grid(N / (block_size/tpp) + 1, 1, 1); - gpu_compute_pair_forces_shared_kernel + gpu_compute_pair_forces_shared_kernel <<>>(pair_args.d_force, pair_args.d_virial, pair_args.virial_pitch, N, pair_args.d_pos, pair_args.d_diameter, pair_args.d_charge, pair_args.box, pair_args.d_n_neigh, pair_args.d_nlist, pair_args.d_head_list, d_params, pair_args.d_rcutsq, pair_args.d_ronsq, pair_args.ntypes, offset); - - if (pair_args.compute_capability < 35) gpu_pair_force_unbind_textures(pair_args); } else { - PairForceComputeKernel::launch(pair_args, range, d_params); + PairForceComputeKernel::launch(pair_args, range, d_params); } } }; //! Template specialization to do nothing for the tpp = 0 case -template -struct PairForceComputeKernel +template +struct PairForceComputeKernel { static void launch(const pair_args_t& pair_args, std::pair range, const typename evaluator::param_type *d_params) { @@ -554,102 +476,50 @@ cudaError_t gpu_compute_pair_forces(const pair_args_t& pair_args, auto range = pair_args.gpu_partition.getRangeAndSetGPU(idev); // Launch kernel - if (pair_args.compute_capability < 35 && pair_args.size_neigh_list > pair_args.max_tex1d_width) - { // fall back to slow global loads when the neighbor list is too big for texture memory - if (pair_args.compute_virial) + if (pair_args.compute_virial) + { + switch (pair_args.shift_mode) { - switch (pair_args.shift_mode) + case 0: { - case 0: - { - PairForceComputeKernel::launch(pair_args, range, d_params); - break; - } - case 1: - { - PairForceComputeKernel::launch(pair_args, range, d_params); - break; - } - case 2: - { - PairForceComputeKernel::launch(pair_args, range, d_params); - break; - } - default: - break; + PairForceComputeKernel::launch(pair_args, range, d_params); + break; } - } - else - { - switch (pair_args.shift_mode) + case 1: { - case 0: - { - PairForceComputeKernel::launch(pair_args, range, d_params); - break; - } - case 1: - { - PairForceComputeKernel::launch(pair_args, range, d_params); - break; - } - case 2: - { - PairForceComputeKernel::launch(pair_args, range, d_params); - break; - } - default: - break; + PairForceComputeKernel::launch(pair_args, range, d_params); + break; } + case 2: + { + PairForceComputeKernel::launch(pair_args, range, d_params); + break; + } + default: + break; } } else { - if (pair_args.compute_virial) + switch (pair_args.shift_mode) { - switch (pair_args.shift_mode) + case 0: { - case 0: - { - PairForceComputeKernel::launch(pair_args, range, d_params); - break; - } - case 1: - { - PairForceComputeKernel::launch(pair_args, range, d_params); - break; - } - case 2: - { - PairForceComputeKernel::launch(pair_args, range, d_params); - break; - } - default: - break; + PairForceComputeKernel::launch(pair_args, range, d_params); + break; } - } - else - { - switch (pair_args.shift_mode) + case 1: { - case 0: - { - PairForceComputeKernel::launch(pair_args, range, d_params); - break; - } - case 1: - { - PairForceComputeKernel::launch(pair_args, range, d_params); - break; - } - case 2: - { - PairForceComputeKernel::launch(pair_args, range, 
d_params); - break; - } - default: - break; + PairForceComputeKernel::launch(pair_args, range, d_params); + break; + } + case 2: + { + PairForceComputeKernel::launch(pair_args, range, d_params); + break; } + default: + break; } } } diff --git a/hoomd/md/PotentialPairGPU.h b/hoomd/md/PotentialPairGPU.h index 57cd3773ab..519a825e4e 100644 --- a/hoomd/md/PotentialPairGPU.h +++ b/hoomd/md/PotentialPairGPU.h @@ -188,8 +188,6 @@ void PotentialPairGPU< evaluator, gpu_cgpf >::computeForces(unsigned int timeste this->m_shift_mode, flags[pdata_flag::pressure_tensor] || flags[pdata_flag::isotropic_virial], threads_per_particle, - this->m_exec_conf->getComputeCapability()/10, - this->m_exec_conf->dev_prop.maxTexture1DLinear, this->m_pdata->getGPUPartition()), d_params.data); diff --git a/hoomd/md/PotentialSpecialPairGPU.h b/hoomd/md/PotentialSpecialPairGPU.h index 75f1c03b94..fddc6012a2 100644 --- a/hoomd/md/PotentialSpecialPairGPU.h +++ b/hoomd/md/PotentialSpecialPairGPU.h @@ -140,8 +140,7 @@ void PotentialSpecialPairGPU< evaluator, gpu_cgbf >::computeForces(unsigned int gpu_table_indexer, d_gpu_n_bonds.data, this->m_pair_data->getNTypes(), - this->m_tuner->getParam(), - this->m_exec_conf->getComputeCapability()), + this->m_tuner->getParam()), d_params.data, d_flags.data); } diff --git a/hoomd/md/PotentialTersoffGPU.cuh b/hoomd/md/PotentialTersoffGPU.cuh index 5af4067b16..f86dd68e67 100644 --- a/hoomd/md/PotentialTersoffGPU.cuh +++ b/hoomd/md/PotentialTersoffGPU.cuh @@ -42,7 +42,6 @@ struct tersoff_args_t const unsigned int _ntypes, const unsigned int _block_size, const unsigned int _tpp, - const unsigned int _compute_capability, const cudaDeviceProp& _devprop) : d_force(_d_force), N(_N), @@ -61,7 +60,6 @@ struct tersoff_args_t ntypes(_ntypes), block_size(_block_size), tpp(_tpp), - compute_capability(_compute_capability), devprop(_devprop) { }; @@ -83,17 +81,11 @@ struct tersoff_args_t const unsigned int ntypes; //!< Number of particle types in the simulation const unsigned int block_size; //!< Block size to execute const unsigned int tpp; //!< Threads per particle - const unsigned int compute_capability; //!< GPU compute capability (20, 30, 35, ...) const cudaDeviceProp& devprop; //!< CUDA device properties }; #ifdef NVCC -//! Texture for reading particle positions -scalar4_tex_t pdata_pos_tex; - -//! Texture for reading neighbor list -texture nlist_tex; #if !defined(SINGLE_PRECISION) @@ -154,15 +146,13 @@ __device__ float myAtomicAdd(float* address, float val) Certain options are controlled via template parameters to avoid the performance hit when they are not enabled. \tparam evaluator EvaluatorPair class to evaluate V(r) and -delta V(r)/r - \tparam use_gmem_nlist When non-zero, the neighbor list is read out of global memory. When zero, textures or __ldg - is used depending on architecture. Implementation details Each block will calculate the forces on a block of particles. Each thread will calculate the total force on one particle. The neighborlist is arranged in columns so that reads are fully coalesced when doing this. 
*/ -template< class evaluator , unsigned char use_gmem_nlist, unsigned char compute_virial, int tpp> +template< class evaluator, unsigned char compute_virial, int tpp> __global__ void gpu_compute_triplet_forces_kernel(Scalar4 *d_force, const unsigned int N, Scalar *d_virial, @@ -216,7 +206,7 @@ __global__ void gpu_compute_triplet_forces_kernel(Scalar4 *d_force, unsigned int n_neigh = d_n_neigh[idx]; // read in the position of the particle - Scalar4 postypei = texFetchScalar4(d_pos, pdata_pos_tex, idx); + Scalar4 postypei = __ldg(d_pos + idx); Scalar3 posi = make_scalar3(postypei.x, postypei.y, postypei.z); // initialize the force to 0 @@ -238,14 +228,7 @@ __global__ void gpu_compute_triplet_forces_kernel(Scalar4 *d_force, unsigned int next_j(0); unsigned int my_head = d_head_list[idx]; - if (use_gmem_nlist) - { - next_j = (threadIdx.x%tpp < n_neigh) ? d_nlist[my_head + threadIdx.x%tpp] : 0; - } - else - { - next_j = threadIdx.x%tpp < n_neigh ? texFetchUint(d_nlist, nlist_tex, my_head + threadIdx.x%tpp) : 0; - } + next_j = threadIdx.x%tpp < n_neigh ? __ldg(d_nlist + my_head + threadIdx.x%tpp) : 0; // loop over neighbors in strided way for (int neigh_idx = threadIdx.x%tpp; neigh_idx < n_neigh; neigh_idx+=tpp) @@ -255,18 +238,11 @@ __global__ void gpu_compute_triplet_forces_kernel(Scalar4 *d_force, cur_j = next_j; if (neigh_idx+tpp < n_neigh) { - if (use_gmem_nlist) - { - next_j = d_nlist[head_idx + neigh_idx + tpp]; - } - else - { - next_j = texFetchUint(d_nlist, nlist_tex, head_idx + neigh_idx + tpp); - } + next_j = __ldg(d_nlist + head_idx + neigh_idx + tpp); } // read the position of j (MEM TRANSFER: 16 bytes) - Scalar4 postypej = texFetchScalar4(d_pos, pdata_pos_tex, cur_j); + Scalar4 postypej = __ldg(d_pos + cur_j); Scalar3 posj = make_scalar3(postypej.x, postypej.y, postypej.z); // initialize the force on j @@ -295,13 +271,11 @@ __global__ void gpu_compute_triplet_forces_kernel(Scalar4 *d_force, { Scalar phi = s_phi_ab[threadIdx.x*ntypes+typ_b]; - #if (__CUDA_ARCH__ >= 300) // reduce in warp phi = hoomd::detail::WarpReduce().Sum(phi); // broadcast into shared mem s_phi_ab[threadIdx.x*ntypes+typ_b] = hoomd::detail::WarpScan().Broadcast(phi, 0); - #endif if (threadIdx.x % tpp == 0) { @@ -324,14 +298,7 @@ __global__ void gpu_compute_triplet_forces_kernel(Scalar4 *d_force, unsigned int next_j(0); unsigned int my_head = d_head_list[idx]; - if (use_gmem_nlist) - { - next_j = (threadIdx.x%tpp < n_neigh) ? d_nlist[my_head + threadIdx.x%tpp] : 0; - } - else - { - next_j = threadIdx.x%tpp < n_neigh ? texFetchUint(d_nlist, nlist_tex, my_head + threadIdx.x%tpp) : 0; - } + next_j = threadIdx.x%tpp < n_neigh ? 
__ldg(d_nlist + my_head + threadIdx.x%tpp) : 0; // loop over neighbors in strided way for (int neigh_idx = threadIdx.x%tpp; neigh_idx < n_neigh; neigh_idx+=tpp) @@ -341,18 +308,11 @@ __global__ void gpu_compute_triplet_forces_kernel(Scalar4 *d_force, cur_j = next_j; if (neigh_idx+tpp < n_neigh) { - if (use_gmem_nlist) - { - next_j = d_nlist[head_idx + neigh_idx + tpp]; - } - else - { - next_j = texFetchUint(d_nlist, nlist_tex, head_idx + neigh_idx + tpp); - } + next_j = __ldg(d_nlist + head_idx + neigh_idx + tpp); } // read the position of j (MEM TRANSFER: 16 bytes) - Scalar4 postypej = texFetchScalar4(d_pos, pdata_pos_tex, cur_j); + Scalar4 postypej = __ldg(d_pos + cur_j); Scalar3 posj = make_scalar3(postypej.x, postypej.y, postypej.z); // initialize the force on j @@ -394,31 +354,17 @@ __global__ void gpu_compute_triplet_forces_kernel(Scalar4 *d_force, // compute chi unsigned int cur_k = 0; unsigned int next_k(0); - if (use_gmem_nlist) - { - next_k = d_nlist[head_idx]; - } - else - { - next_k = texFetchUint(d_nlist, nlist_tex, head_idx); - } + next_k = __ldg(d_nlist + head_idx); // loop over neighbors one by one for (int neigh_idy = 0; neigh_idy < n_neigh; neigh_idy++) { // read the current index of k and prefetch the next one cur_k = next_k; - if (use_gmem_nlist) - { - next_k = d_nlist[head_idx + neigh_idy + 1]; - } - else - { - next_k = texFetchUint(d_nlist, nlist_tex, head_idx + neigh_idy+1); - } + next_k = __ldg(d_nlist + head_idx + neigh_idy+1); // get the position of neighbor k - Scalar4 postypek = texFetchScalar4(d_pos, pdata_pos_tex, cur_k); + Scalar4 postypek = __ldg(d_pos + cur_k); Scalar3 posk = make_scalar3(postypek.x, postypek.y, postypek.z); // get the type pair parameters for i and k @@ -506,31 +452,17 @@ __global__ void gpu_compute_triplet_forces_kernel(Scalar4 *d_force, // now evaluate the force from the ik interactions unsigned int cur_k = 0; unsigned int next_k(0); - if (use_gmem_nlist) - { - next_k = d_nlist[head_idx]; - } - else - { - next_k = texFetchUint(d_nlist, nlist_tex, head_idx); - } + next_k = __ldg(d_nlist + head_idx); // loop over neighbors one by one for (int neigh_idy = 0; neigh_idy < n_neigh; neigh_idy++) { // read the current neighbor index and prefetch the next one cur_k = next_k; - if (use_gmem_nlist) - { - next_k = d_nlist[head_idx + neigh_idy + 1]; - } - else - { - next_k = texFetchUint(d_nlist, nlist_tex, head_idx + neigh_idy+1); - } + next_k = __ldg(d_nlist + head_idx + neigh_idy+1); // get the position of neighbor k - Scalar4 postypek = texFetchScalar4(d_pos, pdata_pos_tex, cur_k); + Scalar4 postypek = __ldg(d_pos + cur_k); Scalar3 posk = make_scalar3(postypek.x, postypek.y, postypek.z); // get the type pair parameters for i and k @@ -716,7 +648,7 @@ void get_max_block_size(T func, const tersoff_args_t& pair_args, unsigned int& m * Partial function template specialization is not allowed in C++, so instead we have to wrap this with a struct that * we are allowed to partially specialize. */ -template +template struct TersoffComputeKernel { //! 
Launcher for the tersoff triplet kernel @@ -731,24 +663,9 @@ struct TersoffComputeKernel static unsigned int max_block_size = UINT_MAX; static unsigned int kernel_shared_bytes = 0; if (max_block_size == UINT_MAX) - get_max_block_size(gpu_compute_triplet_forces_kernel, pair_args, max_block_size, kernel_shared_bytes); + get_max_block_size(gpu_compute_triplet_forces_kernel, pair_args, max_block_size, kernel_shared_bytes); int run_block_size = min(pair_args.block_size, max_block_size); - // bind to texture - if (pair_args.compute_capability < 35) - { - pdata_pos_tex.normalized = false; - pdata_pos_tex.filterMode = cudaFilterModePoint; - cudaBindTexture(0, pdata_pos_tex, pair_args.d_pos, sizeof(Scalar4) * (pair_args.N+pair_args.Nghosts)); - - if (pair_args.size_nlist <= (unsigned int) pair_args.devprop.maxTexture1DLinear) - { - nlist_tex.normalized = false; - nlist_tex.filterMode = cudaFilterModePoint; - cudaBindTexture(0, nlist_tex, pair_args.d_nlist, sizeof(unsigned int) * pair_args.size_nlist); - } - } - // size shared bytes Index2D typpair_idx(pair_args.ntypes); unsigned int shared_bytes = (sizeof(Scalar) + sizeof(typename evaluator::param_type)) @@ -772,7 +689,7 @@ struct TersoffComputeKernel dim3 grid( pair_args.N / (run_block_size/pair_args.tpp) + 1, 1, 1); dim3 threads(run_block_size, 1, 1); - gpu_compute_triplet_forces_kernel + gpu_compute_triplet_forces_kernel <<>>(pair_args.d_force, pair_args.N, pair_args.d_virial, @@ -789,14 +706,14 @@ struct TersoffComputeKernel } else { - TersoffComputeKernel::launch(pair_args, d_params); + TersoffComputeKernel::launch(pair_args, d_params); } } }; //! Template specialization to do nothing for the tpp = 0 case -template -struct TersoffComputeKernel +template +struct TersoffComputeKernel { static void launch(const tersoff_args_t& pair_args, const typename evaluator::param_type *d_params) { @@ -822,25 +739,11 @@ cudaError_t gpu_compute_triplet_forces(const tersoff_args_t& pair_args, // compute the new forces if (!pair_args.compute_virial) { - if (pair_args.compute_capability < 35 && pair_args.size_nlist > (unsigned int) pair_args.devprop.maxTexture1DLinear) - { - TersoffComputeKernel::launch(pair_args, d_params); - } - else - { - TersoffComputeKernel::launch(pair_args, d_params); - } + TersoffComputeKernel::launch(pair_args, d_params); } else { - if (pair_args.compute_capability < 35 && pair_args.size_nlist > (unsigned int) pair_args.devprop.maxTexture1DLinear) - { - TersoffComputeKernel::launch(pair_args, d_params); - } - else - { - TersoffComputeKernel::launch(pair_args, d_params); - } + TersoffComputeKernel::launch(pair_args, d_params); } return cudaSuccess; } diff --git a/hoomd/md/PotentialTersoffGPU.h b/hoomd/md/PotentialTersoffGPU.h index f9fd2eab5d..74e753a779 100644 --- a/hoomd/md/PotentialTersoffGPU.h +++ b/hoomd/md/PotentialTersoffGPU.h @@ -169,7 +169,6 @@ void PotentialTersoffGPU< evaluator, gpu_cgpf >::computeForces(unsigned int time this->m_pdata->getNTypes(), block_size, threads_per_particle, - this->m_exec_conf->getComputeCapability()/10, this->m_exec_conf->dev_prop), d_params.data); diff --git a/hoomd/md/TableAngleForceComputeGPU.cc b/hoomd/md/TableAngleForceComputeGPU.cc index c9f544f72a..617c369a06 100644 --- a/hoomd/md/TableAngleForceComputeGPU.cc +++ b/hoomd/md/TableAngleForceComputeGPU.cc @@ -82,8 +82,7 @@ void TableAngleForceComputeGPU::computeForces(unsigned int timestep) d_tables.data, m_table_width, m_table_value, - m_tuner->getParam(), - m_exec_conf->getComputeCapability()); + m_tuner->getParam()); } diff --git 
a/hoomd/md/TableAngleForceGPU.cu b/hoomd/md/TableAngleForceGPU.cu index 131010d9f4..b03fb9db55 100644 --- a/hoomd/md/TableAngleForceGPU.cu +++ b/hoomd/md/TableAngleForceGPU.cu @@ -16,10 +16,6 @@ \brief Defines GPU kernel code for calculating the table angle forces. Used by TableAngleForceComputeGPU. */ - -//! Texture for reading table values -scalar2_tex_t tables_tex; - /*! This kernel is called to calculate the table angle forces on all triples this is defined or \param d_force Device memory to write computed forces @@ -38,10 +34,6 @@ scalar2_tex_t tables_tex; \param delta_th angle delta of the table See TableAngleForceCompute for information on the memory layout. - - \b Details: - * Table entries are read from tables_tex. Note that currently this is bound to a 1D memory region. Performance tests - at a later date may result in this changing. */ __global__ void gpu_compute_table_angle_forces_kernel(Scalar4* d_force, Scalar* d_virial, @@ -153,8 +145,8 @@ __global__ void gpu_compute_table_angle_forces_kernel(Scalar4* d_force, // compute index into the table and read in values unsigned int value_i = value_f; - Scalar2 VT0 = texFetchScalar2(d_tables, tables_tex, table_value(value_i, cur_angle_type)); - Scalar2 VT1 = texFetchScalar2(d_tables, tables_tex, table_value(value_i+1, cur_angle_type)); + Scalar2 VT0 = __ldg(d_tables + table_value(value_i, cur_angle_type)); + Scalar2 VT1 = __ldg(d_tables + table_value(value_i+1, cur_angle_type)); // unpack the data Scalar V0 = VT0.x; Scalar V1 = VT1.x; @@ -257,8 +249,7 @@ cudaError_t gpu_compute_table_angle_forces(Scalar4* d_force, const Scalar2 *d_tables, const unsigned int table_width, const Index2D &table_value, - const unsigned int block_size, - const unsigned int compute_capability) + const unsigned int block_size) { assert(d_tables); assert(table_width > 1); @@ -280,16 +271,6 @@ cudaError_t gpu_compute_table_angle_forces(Scalar4* d_force, dim3 grid( N / run_block_size + 1, 1, 1); dim3 threads(run_block_size, 1, 1); - // bind the tables texture on pre sm35 arches - if (compute_capability < 350) - { - tables_tex.normalized = false; - tables_tex.filterMode = cudaFilterModePoint; - cudaError_t error = cudaBindTexture(0, tables_tex, d_tables, sizeof(Scalar2) * table_value.getNumElements()); - if (error != cudaSuccess) - return error; - } - Scalar delta_th = Scalar(M_PI)/(Scalar)(table_width - 1); gpu_compute_table_angle_forces_kernel<<< grid, threads >>> diff --git a/hoomd/md/TableAngleForceGPU.cuh b/hoomd/md/TableAngleForceGPU.cuh index f75cda7a1c..8f107e3953 100644 --- a/hoomd/md/TableAngleForceGPU.cuh +++ b/hoomd/md/TableAngleForceGPU.cuh @@ -30,7 +30,6 @@ cudaError_t gpu_compute_table_angle_forces(Scalar4* d_force, const Scalar2 *d_tables, const unsigned int table_width, const Index2D &table_value, - const unsigned int block_size, - const unsigned int compute_capability); + const unsigned int block_size); #endif diff --git a/hoomd/md/TableDihedralForceComputeGPU.cc b/hoomd/md/TableDihedralForceComputeGPU.cc index c0c04b5479..0b18218414 100644 --- a/hoomd/md/TableDihedralForceComputeGPU.cc +++ b/hoomd/md/TableDihedralForceComputeGPU.cc @@ -83,8 +83,7 @@ void TableDihedralForceComputeGPU::computeForces(unsigned int timestep) d_tables.data, m_table_width, m_table_value, - m_tuner->getParam(), - m_exec_conf->getComputeCapability()); + m_tuner->getParam()); } diff --git a/hoomd/md/TableDihedralForceGPU.cu b/hoomd/md/TableDihedralForceGPU.cu index 90a8356bd6..f121371eaf 100644 --- a/hoomd/md/TableDihedralForceGPU.cu +++ 
b/hoomd/md/TableDihedralForceGPU.cu @@ -18,10 +18,6 @@ \brief Defines GPU kernel code for calculating the table dihedral forces. Used by TableDihedralForceComputeGPU. */ - -//! Texture for reading table values -scalar2_tex_t tables_tex; - /*! This kernel is called to calculate the table dihedral forces on all triples this is defined or \param d_force Device memory to write computed forces @@ -39,10 +35,6 @@ scalar2_tex_t tables_tex; \param delta_phi dihedral delta of the table See TableDihedralForceCompute for information on the memory layout. - - \b Details: - * Table entries are read from tables_tex. Note that currently this is bound to a 1D memory region. Performance tests - at a later date may result in this changing. */ __global__ void gpu_compute_table_dihedral_forces_kernel(Scalar4* d_force, Scalar* d_virial, @@ -206,8 +198,8 @@ __global__ void gpu_compute_table_dihedral_forces_kernel(Scalar4* d_force, // compute index into the table and read in values unsigned int value_i = value_f; - Scalar2 VT0 = texFetchScalar2(d_tables, tables_tex, table_value(value_i, cur_dihedral_type)); - Scalar2 VT1 = texFetchScalar2(d_tables, tables_tex, table_value(value_i+1, cur_dihedral_type)); + Scalar2 VT0 = __ldg(d_tables + table_value(value_i, cur_dihedral_type)); + Scalar2 VT1 = __ldg(d_tables + table_value(value_i+1, cur_dihedral_type)); // unpack the data Scalar V0 = VT0.x; Scalar V1 = VT1.x; @@ -318,8 +310,7 @@ cudaError_t gpu_compute_table_dihedral_forces(Scalar4* d_force, const Scalar2 *d_tables, const unsigned int table_width, const Index2D &table_value, - const unsigned int block_size, - const unsigned int compute_capability) + const unsigned int block_size) { assert(d_tables); assert(table_width > 1); @@ -341,16 +332,6 @@ cudaError_t gpu_compute_table_dihedral_forces(Scalar4* d_force, dim3 grid( N / run_block_size + 1, 1, 1); dim3 threads(run_block_size, 1, 1); - // bind the tables texture on pre sm35 devices - if (compute_capability < 350) - { - tables_tex.normalized = false; - tables_tex.filterMode = cudaFilterModePoint; - cudaError_t error = cudaBindTexture(0, tables_tex, d_tables, sizeof(Scalar2) * table_value.getNumElements()); - if (error != cudaSuccess) - return error; - } - Scalar delta_phi = Scalar(2.0*M_PI)/(Scalar)(table_width - 1); gpu_compute_table_dihedral_forces_kernel<<< grid, threads>>> diff --git a/hoomd/md/TableDihedralForceGPU.cuh b/hoomd/md/TableDihedralForceGPU.cuh index 8bf50161e7..bcab1e48db 100644 --- a/hoomd/md/TableDihedralForceGPU.cuh +++ b/hoomd/md/TableDihedralForceGPU.cuh @@ -30,7 +30,6 @@ cudaError_t gpu_compute_table_dihedral_forces(Scalar4* d_force, const Scalar2 *d_tables, const unsigned int table_width, const Index2D &table_value, - const unsigned int block_size, - const unsigned int compute_capability); + const unsigned int block_size); #endif diff --git a/hoomd/md/TablePotentialGPU.cc b/hoomd/md/TablePotentialGPU.cc index 459b8bd8bc..4da517e897 100644 --- a/hoomd/md/TablePotentialGPU.cc +++ b/hoomd/md/TablePotentialGPU.cc @@ -96,8 +96,6 @@ void TablePotentialGPU::computeForces(unsigned int timestep) m_ntypes, m_table_width, m_tuner->getParam(), - m_exec_conf->getComputeCapability(), - m_exec_conf->dev_prop.maxTexture1DLinear, m_pdata->getGPUPartition()); if(m_exec_conf->isCUDAErrorCheckingEnabled()) diff --git a/hoomd/md/TablePotentialGPU.cu b/hoomd/md/TablePotentialGPU.cu index 75dbed0e81..33bc2697a3 100644 --- a/hoomd/md/TablePotentialGPU.cu +++ b/hoomd/md/TablePotentialGPU.cu @@ -15,15 +15,6 @@ \brief Defines GPU kernel code for calculating the table 
pair forces. Used by TablePotentialGPU. */ -//! Texture for reading particle positions -scalar4_tex_t pdata_pos_tex; - -//! Texture for reading the neighborlist -texture nlist_tex; - -//! Texture for reading table values -scalar2_tex_t tables_tex; - /*! This kernel is called to calculate the table pair forces on all N particles \param d_force Device memory to write computed forces @@ -44,12 +35,7 @@ scalar2_tex_t tables_tex; \tparam use_gmem_nlist When non-zero, the neighbor list is read out of global memory. When zero, textures or __ldg is used depending on architecture. - - \b Details: - * Table entries are read from tables_tex. Note that currently this is bound to a 1D memory region. Performance tests - at a later date may result in this changing. */ -template __global__ void gpu_compute_table_forces_kernel(Scalar4* d_force, Scalar* d_virial, const unsigned virial_pitch, @@ -92,7 +78,7 @@ __global__ void gpu_compute_table_forces_kernel(Scalar4* d_force, const unsigned int head_idx = d_head_list[idx]; // read in the position of our particle. Texture reads of Scalar4's are faster than global reads on compute 1.0 hardware - Scalar4 postype = texFetchScalar4(d_pos, pdata_pos_tex, idx); + Scalar4 postype = __ldg(d_pos + idx); Scalar3 pos = make_scalar3(postype.x, postype.y, postype.z); unsigned int typei = __scalar_as_int(postype.w); @@ -108,14 +94,7 @@ __global__ void gpu_compute_table_forces_kernel(Scalar4* d_force, // prefetch neighbor index unsigned int cur_neigh = 0; unsigned int next_neigh(0); - if (use_gmem_nlist) - { - next_neigh = d_nlist[head_idx]; - } - else - { - next_neigh = texFetchUint(d_nlist, nlist_tex, head_idx); - } + next_neigh = __ldg(d_nlist + head_idx); // loop over neighbors for (int neigh_idx = 0; neigh_idx < n_neigh; neigh_idx++) @@ -123,17 +102,10 @@ __global__ void gpu_compute_table_forces_kernel(Scalar4* d_force, // read the current neighbor index // prefetch the next value and set the current one cur_neigh = next_neigh; - if (use_gmem_nlist) - { - next_neigh = d_nlist[head_idx + neigh_idx + 1]; - } - else - { - next_neigh = texFetchUint(d_nlist, nlist_tex, head_idx + neigh_idx+1); - } + next_neigh = __ldg(d_nlist + head_idx + neigh_idx+1); // get the neighbor's position - Scalar4 neigh_postype = texFetchScalar4(d_pos, pdata_pos_tex, cur_neigh); + Scalar4 neigh_postype = __ldg(d_pos + cur_neigh); Scalar3 neigh_pos = make_scalar3(neigh_postype.x, neigh_postype.y, neigh_postype.z); // calculate dr (with periodic boundary conditions) @@ -161,8 +133,8 @@ __global__ void gpu_compute_table_forces_kernel(Scalar4* d_force, // compute index into the table and read in values unsigned int value_i = floor(value_f); - Scalar2 VF0 = texFetchScalar2(d_tables, tables_tex, table_value(value_i, cur_table_index)); - Scalar2 VF1 = texFetchScalar2(d_tables, tables_tex, table_value(value_i+1, cur_table_index)); + Scalar2 VF0 = __ldg(d_tables + table_value(value_i, cur_table_index)); + Scalar2 VF1 = __ldg(d_tables + table_value(value_i+1, cur_table_index)); // unpack the data Scalar V0 = VF0.x; @@ -227,8 +199,6 @@ __global__ void gpu_compute_table_forces_kernel(Scalar4* d_force, \param ntypes Number of particle types in the system \param table_width Number of points in each table \param block_size Block size at which to run the kernel - \param compute_capability Compute capability of the device (200, 300, 350) - \param max_tex1d_width Maximum width of a linear 1d texture \note This is just a kernel driver. See gpu_compute_table_forces_kernel for full documentation. 
*/ @@ -248,8 +218,6 @@ cudaError_t gpu_compute_table_forces(Scalar4* d_force, const unsigned int ntypes, const unsigned int table_width, const unsigned int block_size, - const unsigned int compute_capability, - const unsigned int max_tex1d_width, const GPUPartition& gpu_partition) { assert(d_params); @@ -265,99 +233,37 @@ cudaError_t gpu_compute_table_forces(Scalar4* d_force, { auto range = gpu_partition.getRangeAndSetGPU(idev); - // texture bind - if (compute_capability < 350) + static unsigned int max_block_size = UINT_MAX; + if (max_block_size == UINT_MAX) { - // bind the pdata position texture - pdata_pos_tex.normalized = false; - pdata_pos_tex.filterMode = cudaFilterModePoint; - cudaError_t error = cudaBindTexture(0, pdata_pos_tex, d_pos, sizeof(Scalar4) * (N+n_ghost)); - if (error != cudaSuccess) - return error; - - if (size_nlist <= max_tex1d_width) - { - nlist_tex.normalized = false; - nlist_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, nlist_tex, d_nlist, sizeof(unsigned int)*size_nlist); - if (error != cudaSuccess) - return error; - } - - // bind the tables texture - tables_tex.normalized = false; - tables_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, tables_tex, d_tables, sizeof(Scalar2) * table_width * table_index.getNumElements()); - if (error != cudaSuccess) - return error; + cudaFuncAttributes attr; + cudaFuncGetAttributes(&attr, gpu_compute_table_forces_kernel); + max_block_size = attr.maxThreadsPerBlock; } - if (compute_capability < 350 && size_nlist > max_tex1d_width) - { // use global memory when the neighbor list must be texture bound, but exceeds the max size of a texture - static unsigned int max_block_size = UINT_MAX; - if (max_block_size == UINT_MAX) - { - cudaFuncAttributes attr; - cudaFuncGetAttributes(&attr, gpu_compute_table_forces_kernel<1>); - max_block_size = attr.maxThreadsPerBlock; - } - - unsigned int run_block_size = min(block_size, max_block_size); - - // setup the grid to run the kernel - dim3 grid( (range.second-range.first) / run_block_size + 1, 1, 1); - dim3 threads(run_block_size, 1, 1); - - gpu_compute_table_forces_kernel<1><<< grid, threads, sizeof(Scalar4)*table_index.getNumElements() >>>(d_force, - d_virial, - virial_pitch, - range.second-range.first, - d_pos, - box, - d_n_neigh, - d_nlist, - d_head_list, - d_tables, - d_params, - ntypes, - table_width, - range.first - ); - } - else - { - static unsigned int max_block_size = UINT_MAX; - if (max_block_size == UINT_MAX) - { - cudaFuncAttributes attr; - cudaFuncGetAttributes(&attr, gpu_compute_table_forces_kernel<0>); - max_block_size = attr.maxThreadsPerBlock; - } - - unsigned int run_block_size = min(block_size, max_block_size); - - // index calculation helper - Index2DUpperTriangular table_index(ntypes); - - // setup the grid to run the kernel - dim3 grid( (range.second-range.first) / run_block_size + 1, 1, 1); - dim3 threads(run_block_size, 1, 1); - - gpu_compute_table_forces_kernel<0><<< grid, threads, sizeof(Scalar4)*table_index.getNumElements() >>>(d_force, - d_virial, - virial_pitch, - range.second-range.first, - d_pos, - box, - d_n_neigh, - d_nlist, - d_head_list, - d_tables, - d_params, - ntypes, - table_width, - range.first); - } + unsigned int run_block_size = min(block_size, max_block_size); + + // index calculation helper + Index2DUpperTriangular table_index(ntypes); + + // setup the grid to run the kernel + dim3 grid( (range.second-range.first) / run_block_size + 1, 1, 1); + dim3 threads(run_block_size, 1, 1); + + 
gpu_compute_table_forces_kernel<<< grid, threads, sizeof(Scalar4)*table_index.getNumElements() >>>(d_force, + d_virial, + virial_pitch, + range.second-range.first, + d_pos, + box, + d_n_neigh, + d_nlist, + d_head_list, + d_tables, + d_params, + ntypes, + table_width, + range.first); } return cudaSuccess; } diff --git a/hoomd/md/TablePotentialGPU.cuh b/hoomd/md/TablePotentialGPU.cuh index 28beeda291..f33fa1bb74 100644 --- a/hoomd/md/TablePotentialGPU.cuh +++ b/hoomd/md/TablePotentialGPU.cuh @@ -33,8 +33,6 @@ cudaError_t gpu_compute_table_forces(Scalar4* d_force, const unsigned int ntypes, const unsigned int table_width, const unsigned int block_size, - const unsigned int compute_capability, - const unsigned int max_tex1d_width, const GPUPartition& gpu_partition); #endif diff --git a/hoomd/metal/EAMForceComputeGPU.cc b/hoomd/metal/EAMForceComputeGPU.cc index 059210fe78..cc6824788a 100644 --- a/hoomd/metal/EAMForceComputeGPU.cc +++ b/hoomd/metal/EAMForceComputeGPU.cc @@ -102,8 +102,7 @@ void EAMForceComputeGPU::computeForces(unsigned int timestep) eam_data.block_size = m_tuner->getParam(); gpu_compute_eam_tex_inter_forces(d_force.data, d_virial.data, m_virial.getPitch(), m_pdata->getN(), d_pos.data, box, d_n_neigh.data, d_nlist.data, d_head_list.data, this->m_nlist->getNListArray().getPitch(), eam_data, - d_dFdP.data, d_F.data, d_rho.data, d_rphi.data, d_dF.data, d_drho.data, d_drphi.data, - m_exec_conf->getComputeCapability() / 10, m_exec_conf->dev_prop.maxTexture1DLinear); + d_dFdP.data, d_F.data, d_rho.data, d_rphi.data, d_dF.data, d_drho.data, d_drphi.data); if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR(); diff --git a/hoomd/metal/EAMForceGPU.cu b/hoomd/metal/EAMForceGPU.cu index ddfaca2e9d..b4cd7c31e0 100644 --- a/hoomd/metal/EAMForceGPU.cu +++ b/hoomd/metal/EAMForceGPU.cu @@ -13,25 +13,10 @@ \brief Defines GPU kernel code for calculating the EAM forces. Used by EAMForceComputeGPU. */ -//! Texture for reading particle positions -scalar4_tex_t pdata_pos_tex; -//! Texture for reading the neighbor list -texture nlist_tex; -//! Texture for reading potential -scalar4_tex_t tex_F; -scalar4_tex_t tex_rho; -scalar4_tex_t tex_rphi; -scalar4_tex_t tex_dF; -scalar4_tex_t tex_drho; -scalar4_tex_t tex_drphi; -//! Texture for dF/dP -scalar_tex_t tex_dFdP; - //! Storage space for EAM parameters on the GPU __constant__ EAMTexInterData eam_data_ti; //! Kernel for computing EAM forces on the GPU -template __global__ void gpu_kernel_1(Scalar4 *d_force, Scalar *d_virial, const unsigned int virial_pitch, const unsigned int N, const Scalar4 *d_pos, BoxDim box, const unsigned int *d_n_neigh, const unsigned int *d_nlist, const unsigned int *d_head_list, const Scalar4 *d_F, const Scalar4 *d_rho, const Scalar4 *d_rphi, @@ -49,7 +34,7 @@ __global__ void gpu_kernel_1(Scalar4 *d_force, Scalar *d_virial, const unsigned const unsigned int head_idx = d_head_list[idx]; // read in the position of our particle. 
- Scalar4 postype = texFetchScalar4(d_pos, pdata_pos_tex, idx); + Scalar4 postype = __ldg(d_pos + idx); Scalar3 pos = make_scalar3(postype.x, postype.y, postype.z); // index and remainder @@ -65,14 +50,8 @@ __global__ void gpu_kernel_1(Scalar4 *d_force, Scalar *d_virial, const unsigned // prefetch neighbor index int cur_neigh = 0; int next_neigh(0); - if (use_gmem_nlist) - { - next_neigh = d_nlist[head_idx]; - } - else - { - next_neigh = texFetchUint(d_nlist, nlist_tex, head_idx); - } + next_neigh = __ldg(d_nlist + head_idx); + int typei = __scalar_as_int(postype.w); // loop over neighbors @@ -89,17 +68,10 @@ __global__ void gpu_kernel_1(Scalar4 *d_force, Scalar *d_virial, const unsigned // read the current neighbor index // prefetch the next value and set the current one cur_neigh = next_neigh; - if (use_gmem_nlist) - { - next_neigh = d_nlist[head_idx + neigh_idx + 1]; - } - else - { - next_neigh = texFetchUint(d_nlist, nlist_tex, head_idx + neigh_idx + 1); - } + next_neigh = __ldg(d_nlist + head_idx + neigh_idx + 1); // get the neighbor's position - Scalar4 neigh_postype = texFetchScalar4(d_pos, pdata_pos_tex, cur_neigh); + Scalar4 neigh_postype = __ldg(d_pos + cur_neigh); Scalar3 neigh_pos = make_scalar3(neigh_postype.x, neigh_postype.y, neigh_postype.z); // calculate dr (with periodic boundary conditions) @@ -120,7 +92,7 @@ __global__ void gpu_kernel_1(Scalar4 *d_force, Scalar *d_virial, const unsigned remainder = position - int_position; // calculate P = sum{rho} idxs = int_position + nr * (typej * ntypes + typei); - v = texFetchScalar4(d_rho, tex_rho, idxs); + v = __ldg(d_rho + idxs); atomElectronDensity += v.w + v.z * remainder + v.y * remainder * remainder + v.x * remainder * remainder * remainder; } @@ -133,8 +105,8 @@ __global__ void gpu_kernel_1(Scalar4 *d_force, Scalar *d_virial, const unsigned remainder = position - int_position; idxs = int_position + typei * nrho; - dv = texFetchScalar4(d_dF, tex_dF, idxs); - v = texFetchScalar4(d_F, tex_F, idxs); + dv = __ldg(d_dF + idxs); + v = __ldg(d_F + idxs); // compute dF / dP d_dFdP[idx] = dv.z + dv.y * remainder + dv.x * remainder * remainder; // compute embedded energy F(P), sum up each particle @@ -145,7 +117,6 @@ __global__ void gpu_kernel_1(Scalar4 *d_force, Scalar *d_virial, const unsigned } //! Second stage kernel for computing EAM forces on the GPU -template __global__ void gpu_kernel_2(Scalar4 *d_force, Scalar *d_virial, const unsigned int virial_pitch, const unsigned int N, const Scalar4 *d_pos, BoxDim box, const unsigned int *d_n_neigh, const unsigned int *d_nlist, const unsigned int *d_head_list, const Scalar4 *d_F, const Scalar4 *d_rho, const Scalar4 *d_rphi, @@ -163,7 +134,7 @@ __global__ void gpu_kernel_2(Scalar4 *d_force, Scalar *d_virial, const unsigned const unsigned int head_idx = d_head_list[idx]; // read in the position of our particle. 
Texture reads of Scalar4's are faster than global reads - Scalar4 postype = texFetchScalar4(d_pos, pdata_pos_tex, idx); + Scalar4 postype = __ldg(d_pos + idx); Scalar3 pos = make_scalar3(postype.x, postype.y, postype.z); int typei = __scalar_as_int(postype.w); @@ -177,14 +148,8 @@ __global__ void gpu_kernel_2(Scalar4 *d_force, Scalar *d_virial, const unsigned // prefetch neighbor index int cur_neigh = 0; int next_neigh(0); - if (use_gmem_nlist) - { - next_neigh = d_nlist[head_idx]; - } - else - { - next_neigh = texFetchUint(d_nlist, nlist_tex, head_idx); - } + next_neigh = __ldg(d_nlist + head_idx); + //Scalar4 force = force_data.force[idx]; Scalar4 force = make_scalar4(Scalar(0.0), Scalar(0.0), Scalar(0.0), Scalar(0.0)); //force.w = force_data.force[idx].w; @@ -202,21 +167,14 @@ __global__ void gpu_kernel_2(Scalar4 *d_force, Scalar *d_virial, const unsigned int nr = eam_data_ti.nr; Scalar rdr = eam_data_ti.rdr; Scalar r_cutsq = eam_data_ti.r_cutsq; - Scalar d_dFdPidx = texFetchScalar(d_dFdP, tex_dFdP, idx); + Scalar d_dFdPidx = __ldg(d_dFdP + idx); for (int neigh_idx = 0; neigh_idx < n_neigh; neigh_idx++) { cur_neigh = next_neigh; - if (use_gmem_nlist) - { - next_neigh = d_nlist[head_idx + neigh_idx + 1]; - } - else - { - next_neigh = texFetchUint(d_nlist, nlist_tex, head_idx + neigh_idx + 1); - } + next_neigh = __ldg(d_nlist + head_idx + neigh_idx + 1); // get the neighbor's position - Scalar4 neigh_postype = texFetchScalar4(d_pos, pdata_pos_tex, cur_neigh); + Scalar4 neigh_postype = __ldg(d_pos + cur_neigh); Scalar3 neigh_pos = make_scalar3(neigh_postype.x, neigh_postype.y, neigh_postype.z); // calculate dr (with periodic boundary conditions) @@ -245,8 +203,8 @@ __global__ void gpu_kernel_2(Scalar4 *d_force, Scalar *d_virial, const unsigned (int) (0.5 * (2 * ntypes - typei - 1) * typei + typej) * nr; idxs = int_position + shift; - v = texFetchScalar4(d_rphi, tex_rphi, idxs); - dv = texFetchScalar4(d_drphi, tex_drphi, idxs); + v = __ldg(d_rphi + idxs); + dv = __ldg(d_drphi + idxs); // aspair_potential = r * phi Scalar aspair_potential = v.w + v.z * remainder + v.y * remainder * remainder + v.x * remainder * remainder * remainder; @@ -258,14 +216,14 @@ __global__ void gpu_kernel_2(Scalar4 *d_force, Scalar *d_virial, const unsigned Scalar derivativePhi = (derivative_pair_potential - pair_eng) * inverseR; // derivativeRhoI = drho / dr of i idxs = int_position + typei * ntypes * nr + typej * nr; - dv = texFetchScalar4(d_drho, tex_drho, idxs); + dv = __ldg(d_drho + idxs); Scalar derivativeRhoI = dv.z + dv.y * remainder + dv.x * remainder * remainder; // derivativeRhoJ = drho / dr of j idxs = int_position + typej * ntypes * nr + typei * nr; - dv = texFetchScalar4(d_drho, tex_drho, idxs); + dv = __ldg(d_drho + idxs); Scalar derivativeRhoJ = dv.z + dv.y * remainder + dv.x * remainder * remainder; // fullDerivativePhi = dF/dP * drho / dr for j + dF/dP * drho / dr for j + phi - Scalar d_dFdPcur = texFetchScalar(d_dFdP, tex_dFdP, cur_neigh); + Scalar d_dFdPcur = __ldg(d_dFdP + cur_neigh); Scalar fullDerivativePhi = d_dFdPidx * derivativeRhoJ + d_dFdPcur * derivativeRhoI + derivativePhi; // compute forces pairForce = -fullDerivativePhi * inverseR; @@ -301,141 +259,39 @@ cudaError_t gpu_compute_eam_tex_inter_forces(Scalar4 *d_force, Scalar *d_virial, const unsigned int N, const Scalar4 *d_pos, const BoxDim &box, const unsigned int *d_n_neigh, const unsigned int *d_nlist, const unsigned int *d_head_list, const unsigned int size_nlist, const EAMTexInterData &eam_data, Scalar *d_dFdP, const Scalar4 
*d_F, const Scalar4 *d_rho, - const Scalar4 *d_rphi, const Scalar4 *d_dF, const Scalar4 *d_drho, const Scalar4 *d_drphi, - const unsigned int compute_capability, const unsigned int max_tex1d_width) + const Scalar4 *d_rphi, const Scalar4 *d_dF, const Scalar4 *d_drho, const Scalar4 *d_drphi) { - cudaError_t error; - - // bind the texture - if (compute_capability < 350 && size_nlist <= max_tex1d_width) - { - nlist_tex.normalized = false; - nlist_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, nlist_tex, d_nlist, sizeof(unsigned int) * size_nlist); - if (error != cudaSuccess) - return error; - } - - if (compute_capability < 350) - { - tex_F.normalized = false; - tex_F.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, tex_F, d_F, sizeof(Scalar4) * eam_data.nrho * eam_data.ntypes); - if (error != cudaSuccess) - return error; - - tex_dF.normalized = false; - tex_dF.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, tex_dF, d_dF, sizeof(Scalar4) * eam_data.nrho * eam_data.ntypes); - if (error != cudaSuccess) - return error; - - tex_rho.normalized = false; - tex_rho.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, tex_rho, d_rho, sizeof(Scalar4) * eam_data.nrho * eam_data.ntypes * eam_data.ntypes); - if (error != cudaSuccess) - return error; - - tex_drho.normalized = false; - tex_drho.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, tex_drho, d_drho, - sizeof(Scalar4) * eam_data.nrho * eam_data.ntypes * eam_data.ntypes); - if (error != cudaSuccess) - return error; - - tex_rphi.normalized = false; - tex_rphi.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, tex_rphi, d_rphi, - sizeof(Scalar4) * (int) (0.5 * eam_data.nr * (eam_data.ntypes + 1) * eam_data.ntypes)); - if (error != cudaSuccess) - return error; - - tex_drphi.normalized = false; - tex_drphi.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, tex_drphi, d_drphi, - sizeof(Scalar4) * (int) (0.5 * eam_data.nr * (eam_data.ntypes + 1) * eam_data.ntypes)); - if (error != cudaSuccess) - return error; - } - - pdata_pos_tex.normalized = false; - pdata_pos_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, pdata_pos_tex, d_pos, sizeof(Scalar4) * N); - if (error != cudaSuccess) - return error; - - tex_dFdP.normalized = false; - tex_dFdP.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, tex_dFdP, d_dFdP, sizeof(Scalar) * N); - if (error != cudaSuccess) - return error; - // run the kernel cudaMemcpyToSymbol(eam_data_ti, &eam_data, sizeof(EAMTexInterData)); - if (compute_capability < 350 && size_nlist > max_tex1d_width) - { - - static unsigned int max_block_size_1 = UINT_MAX; - static unsigned int max_block_size_2 = UINT_MAX; + static unsigned int max_block_size_1 = UINT_MAX; + static unsigned int max_block_size_2 = UINT_MAX; - cudaFuncAttributes attr1; - cudaFuncGetAttributes(&attr1, gpu_kernel_1<1>); + cudaFuncAttributes attr1; + cudaFuncGetAttributes(&attr1, gpu_kernel_1); - cudaFuncAttributes attr2; - cudaFuncGetAttributes(&attr2, gpu_kernel_2<1>); + cudaFuncAttributes attr2; + cudaFuncGetAttributes(&attr2, gpu_kernel_2); - max_block_size_1 = attr1.maxThreadsPerBlock; - max_block_size_2 = attr2.maxThreadsPerBlock; + max_block_size_1 = attr1.maxThreadsPerBlock; + max_block_size_2 = attr2.maxThreadsPerBlock; - unsigned int run_block_size_1 = min(eam_data.block_size, max_block_size_1); - unsigned int run_block_size_2 = min(eam_data.block_size, max_block_size_2); + unsigned int run_block_size_1 = min(eam_data.block_size, 
max_block_size_1); + unsigned int run_block_size_2 = min(eam_data.block_size, max_block_size_2); - // setup the grid to run the kernel + // setup the grid to run the kernel - dim3 grid_1((int) ceil((double) N / (double) run_block_size_1), 1, 1); - dim3 threads_1(run_block_size_1, 1, 1); + dim3 grid_1((int) ceil((double) N / (double) run_block_size_1), 1, 1); + dim3 threads_1(run_block_size_1, 1, 1); - dim3 grid_2((int) ceil((double) N / (double) run_block_size_2), 1, 1); - dim3 threads_2(run_block_size_2, 1, 1); + dim3 grid_2((int) ceil((double) N / (double) run_block_size_2), 1, 1); + dim3 threads_2(run_block_size_2, 1, 1); - gpu_kernel_1<1> <<>>(d_force, d_virial, virial_pitch, N, d_pos, box, d_n_neigh, d_nlist, - d_head_list, d_F, d_rho, d_rphi, d_dF, d_drho, d_drphi, d_dFdP); - gpu_kernel_2<1> <<>>(d_force, d_virial, virial_pitch, N, d_pos, box, d_n_neigh, d_nlist, - d_head_list, d_F, d_rho, d_rphi, d_dF, d_drho, d_drphi, d_dFdP); - } - else - { - - static unsigned int max_block_size_1 = UINT_MAX; - static unsigned int max_block_size_2 = UINT_MAX; - - cudaFuncAttributes attr1; - cudaFuncGetAttributes(&attr1, gpu_kernel_1<0>); - - cudaFuncAttributes attr2; - cudaFuncGetAttributes(&attr2, gpu_kernel_2<0>); - - max_block_size_1 = attr1.maxThreadsPerBlock; - max_block_size_2 = attr2.maxThreadsPerBlock; - - unsigned int run_block_size_1 = min(eam_data.block_size, max_block_size_1); - unsigned int run_block_size_2 = min(eam_data.block_size, max_block_size_2); - - // setup the grid to run the kernel - - dim3 grid_1((int) ceil((double) N / (double) run_block_size_1), 1, 1); - dim3 threads_1(run_block_size_1, 1, 1); - - dim3 grid_2((int) ceil((double) N / (double) run_block_size_2), 1, 1); - dim3 threads_2(run_block_size_2, 1, 1); - - gpu_kernel_1<0> <<>>(d_force, d_virial, virial_pitch, N, d_pos, box, d_n_neigh, d_nlist, - d_head_list, d_F, d_rho, d_rphi, d_dF, d_drho, d_drphi, d_dFdP); - gpu_kernel_2<0> <<>>(d_force, d_virial, virial_pitch, N, d_pos, box, d_n_neigh, d_nlist, - d_head_list, d_F, d_rho, d_rphi, d_dF, d_drho, d_drphi, d_dFdP); - } + gpu_kernel_1<<>>(d_force, d_virial, virial_pitch, N, d_pos, box, d_n_neigh, d_nlist, + d_head_list, d_F, d_rho, d_rphi, d_dF, d_drho, d_drphi, d_dFdP); + gpu_kernel_2<<>>(d_force, d_virial, virial_pitch, N, d_pos, box, d_n_neigh, d_nlist, + d_head_list, d_F, d_rho, d_rphi, d_dF, d_drho, d_drphi, d_dFdP); return cudaSuccess; } diff --git a/hoomd/metal/EAMForceGPU.cuh b/hoomd/metal/EAMForceGPU.cuh index 6303691fcf..d76744b1a3 100644 --- a/hoomd/metal/EAMForceGPU.cuh +++ b/hoomd/metal/EAMForceGPU.cuh @@ -35,7 +35,6 @@ cudaError_t gpu_compute_eam_tex_inter_forces(Scalar4* d_force, Scalar* d_virial, const unsigned int N, const Scalar4 *d_pos, const BoxDim& box, const unsigned int *d_n_neigh, const unsigned int *d_nlist, const unsigned int *d_head_list, const unsigned int size_nlist, const EAMTexInterData& eam_data, Scalar *d_dFdP, const Scalar4 *d_F, const Scalar4 *d_rho, - const Scalar4 *d_rphi, const Scalar4 *d_dF, const Scalar4 *d_drho, const Scalar4 *d_drphi, - const unsigned int compute_capability, const unsigned int max_tex1d_width); + const Scalar4 *d_rphi, const Scalar4 *d_dF, const Scalar4 *d_drho, const Scalar4 *d_drphi); #endif
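Note (not part of the patch itself): every hunk above follows the same conversion — drop the cudaBindTexture()/tex1Dfetch()/texFetch*() plumbing and the compute_capability/max_tex1d_width arguments that supported it, and read the device arrays directly through __ldg(), which is always available now that sm_35 is the minimum supported architecture. Below is a minimal, self-contained sketch of that read pattern; the kernel and array names (toy_gather_kernel, d_out, ...) are illustrative only and are not taken from HOOMD.

// Minimal sketch of the __ldg() read pattern this patch standardizes on.
// Compile for sm_35 or newer, e.g. nvcc -arch=sm_35 -c toy_gather.cu
#include <cuda_runtime.h>

__global__ void toy_gather_kernel(float4* d_out,
                                  const float4* d_pos,           // per-particle positions (read-only)
                                  const unsigned int* d_nlist,   // flattened neighbor list (read-only)
                                  const unsigned int* d_n_neigh, // number of neighbors per particle
                                  const unsigned int* d_head_list, // start index of each particle's neighbors
                                  unsigned int N)
    {
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= N)
        return;

    // __ldg() issues the load through the read-only data cache on sm_35+,
    // with no host-side texture binding or unbinding required
    float4 pos_i = __ldg(d_pos + idx);
    unsigned int n_neigh = __ldg(d_n_neigh + idx);
    unsigned int head = __ldg(d_head_list + idx);

    float4 acc = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
    for (unsigned int k = 0; k < n_neigh; ++k)
        {
        unsigned int j = __ldg(d_nlist + head + k); // neighbor index
        float4 pos_j = __ldg(d_pos + j);            // neighbor position
        acc.x += pos_j.x - pos_i.x;
        acc.y += pos_j.y - pos_i.y;
        acc.z += pos_j.z - pos_i.z;
        }
    d_out[idx] = acc;
    }

Because the kernels consume the raw device pointers directly, the host-side launchers no longer need to bind or unbind anything, which is what lets the compute_capability and max_tex1d_width parameters disappear from the driver signatures changed above.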