diff --git a/CMake/hoomd/HOOMDCUDASetup.cmake b/CMake/hoomd/HOOMDCUDASetup.cmake index 73a5fe43b6..e7f5ed0196 100644 --- a/CMake/hoomd/HOOMDCUDASetup.cmake +++ b/CMake/hoomd/HOOMDCUDASetup.cmake @@ -53,9 +53,9 @@ if (ENABLE_CUDA) # setup nvcc to build for all CUDA architectures. Allow user to modify the list if desired if (CUDA_VERSION VERSION_GREATER 8.99) - set(CUDA_ARCH_LIST 30 35 50 60 70 CACHE STRING "List of target sm_ architectures to compile CUDA code for. Separate with semicolons.") + set(CUDA_ARCH_LIST 35 50 60 70 CACHE STRING "List of target sm_ architectures to compile CUDA code for. Separate with semicolons.") elseif (CUDA_VERSION VERSION_GREATER 7.99) - set(CUDA_ARCH_LIST 30 35 50 60 CACHE STRING "List of target sm_ architectures to compile CUDA code for. Separate with semicolons.") + set(CUDA_ARCH_LIST 35 50 60 CACHE STRING "List of target sm_ architectures to compile CUDA code for. Separate with semicolons.") endif() foreach(_cuda_arch ${CUDA_ARCH_LIST}) @@ -69,8 +69,8 @@ if (ENABLE_CUDA) list(GET _cuda_arch_list_sorted -1 _cuda_max_arch) add_definitions(-DCUDA_ARCH=${_cuda_min_arch}) - if (_cuda_min_arch LESS 30) - message(SEND_ERROR "HOOMD requires compute 3.0 or newer") + if (_cuda_min_arch LESS 35) + message(SEND_ERROR "HOOMD requires compute 3.5 or newer") endif () # only generate ptx code for the maximum supported CUDA_ARCH (saves on file size) diff --git a/hoomd/TextureTools.h b/hoomd/TextureTools.h index e82aeaf03a..bc35bf779e 100644 --- a/hoomd/TextureTools.h +++ b/hoomd/TextureTools.h @@ -10,161 +10,30 @@ /*! \file TextureTools.h \brief Utilities for working with textures - TextureTools.h exists to aid in defining Scalar textures which may be either float or double. It aims to simplify - code that reads from these textures so that the amount of conditional code is simplified to be entirely within - this header. + TextureTools.h previously existed to aid in defining Scalar textures which may be either float or double. - Planning for the future (__ldg), the fetch methods will also take in a pointer to the memory. That way, the initial - work done to convert the texture loads over to the single/double will also make it easy to change over to __ldg - in a single spot. + Now, it only provides a __ldg() overload for double4. */ #include "HOOMDMath.h" #ifdef NVCC -//! Fetch an unsigned int from texture memory. +//! Fetch a double4 value from texture memory. /*! This function should be called whenever a CUDA kernel wants to retrieve a - unsigned int value from texture memory. + double4 value from read only memory. - \param ptr Pointer to bound memory - \param tex_ref Texture in which the desired values are stored. - \param ii Index at which to look. + \param ptr Pointer to read */ -__device__ inline unsigned int texFetchUint(const unsigned int *ptr, texture tex_ref, unsigned int ii) +__device__ inline double4 __ldg(const double4 *ptr) { - #if __CUDA_ARCH__ >= 350 - return __ldg(ptr+ii); - #else - return tex1Dfetch(tex_ref, ii); - #endif - } - -#ifdef SINGLE_PRECISION - -typedef texture scalar_tex_t; -typedef texture scalar2_tex_t; -typedef texture scalar4_tex_t; - -//! Fetch a Scalar value from texture memory. -/*! This function should be called whenever a CUDA kernel wants to retrieve a - Scalar value from texture memory. - - \param ptr Pointer to bound memory - \param tex_ref Texture in which the desired values are stored. - \param ii Index at which to look. 
-*/ -__device__ inline Scalar texFetchScalar(const Scalar *ptr, texture tex_ref, unsigned int ii) - { - #if __CUDA_ARCH__ >= 350 - return __ldg(ptr+ii); - #else - return tex1Dfetch(tex_ref, ii); - #endif - } - -//! Fetch a Scalar2 value from texture memory. -/*! This function should be called whenever a CUDA kernel wants to retrieve a - Scalar2 value from texture memory. - - \param ptr Pointer to bound memory - \param tex_ref Texture in which the desired values are stored. - \param ii Index at which to look. -*/ -__device__ inline Scalar2 texFetchScalar2(const Scalar2 *ptr, texture tex_ref, unsigned int ii) - { - #if __CUDA_ARCH__ >= 350 - return __ldg(ptr+ii); - #else - return tex1Dfetch(tex_ref, ii); - #endif - } - -//! Fetch a Scalar4 value from texture memory. -/*! This function should called whenever a CUDA kernel wants to retrieve a - Scalar4 value from texture memory. - - \param ptr Pointer to bound memory - \param tex_ref Texture in which the desired values are stored. - \param ii Index at which to look. -*/ -__device__ inline Scalar4 texFetchScalar4(const Scalar4 *ptr, texture tex_ref, unsigned int ii) - { - #if __CUDA_ARCH__ >= 350 - return __ldg(ptr+ii); - #else - return tex1Dfetch(tex_ref, ii); - #endif - } - -#else -typedef texture scalar_tex_t; -typedef texture scalar2_tex_t; -typedef texture scalar4_tex_t; - -//! Fetch a Scalar value from texture memory. -/*! This function should be called whenever a CUDA kernel wants to retrieve a - Scalar value from texture memory. - - \param ptr Pointer to bound memory - \param tex_ref Texture in which the desired values are stored. - \param ii Index at which to look. -*/ -__device__ inline Scalar texFetchScalar(const Scalar *ptr, texture tex_ref, unsigned int ii) - { - #if __CUDA_ARCH__ >= 350 - return __ldg(ptr+ii); - #else - int2 val = tex1Dfetch(tex_ref, ii); - return Scalar(__hiloint2double(val.y, val.x)); - #endif - } - -//! Fetch a Scalar2 value from texture memory. -/*! This function should be called whenever a CUDA kernel wants to retrieve a - Scalar2 value from texture memory. - - \param ptr Pointer to bound memory - \param tex_ref Texture in which the desired values are stored. - \param ii Index at which to look. -*/ -__device__ inline Scalar2 texFetchScalar2(const Scalar2* ptr, texture tex_ref, unsigned int ii) - { - #if __CUDA_ARCH__ >= 350 - return __ldg(ptr+ii); - #else - int4 val = tex1Dfetch(tex_ref, ii); - return make_scalar2(__hiloint2double(val.y, val.x), - __hiloint2double(val.w, val.z)); - #endif - } - -//! Fetch a Scalar4 value from texture memory. -/*! This function should be called whenever a CUDA kernel wants to retrieve a - Scalar4 value from texture memory. - - \param ptr Pointer to bound memory - \param tex_ref Texture in which the desired values are stored. - \param ii Index at which to look. 
-*/ -__device__ inline Scalar4 texFetchScalar4(const Scalar4 *ptr, texture tex_ref, unsigned int ii) - { - unsigned int idx = 2*ii; - #if __CUDA_ARCH__ >= 350 - int4 part1 = __ldg(((int4 *)ptr)+idx);; - int4 part2 = __ldg(((int4 *)ptr)+idx+1);; - #else - int4 part1 = tex1Dfetch(tex_ref, idx); - int4 part2 = tex1Dfetch(tex_ref, idx+1); - #endif - return make_scalar4(__hiloint2double(part1.y, part1.x), + int4 part1 = __ldg(((int4 *)ptr));; + int4 part2 = __ldg(((int4 *)ptr)+1);; + return make_double4(__hiloint2double(part1.y, part1.x), __hiloint2double(part1.w, part1.z), __hiloint2double(part2.y, part2.x), __hiloint2double(part2.w, part2.z)); } #endif -#endif - - #endif // __HOOMD_MATH_H__ diff --git a/hoomd/cgcmm/CGCMMAngleForceComputeGPU.cc b/hoomd/cgcmm/CGCMMAngleForceComputeGPU.cc index bbda521137..03e49c01a5 100644 --- a/hoomd/cgcmm/CGCMMAngleForceComputeGPU.cc +++ b/hoomd/cgcmm/CGCMMAngleForceComputeGPU.cc @@ -133,8 +133,7 @@ void CGCMMAngleForceComputeGPU::computeForces(unsigned int timestep) d_CGCMMsr.data, d_CGCMMepow.data, m_CGCMMAngle_data->getNTypes(), - m_tuner->getParam(), - m_exec_conf->getComputeCapability()); + m_tuner->getParam()); if(m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR(); diff --git a/hoomd/cgcmm/CGCMMAngleForceGPU.cu b/hoomd/cgcmm/CGCMMAngleForceGPU.cu index 5d4740580d..1842267da5 100644 --- a/hoomd/cgcmm/CGCMMAngleForceGPU.cu +++ b/hoomd/cgcmm/CGCMMAngleForceGPU.cu @@ -16,15 +16,6 @@ \brief Defines GPU kernel code for calculating the CGCMM angle forces. Used by CGCMMAngleForceComputeGPU. */ -//! Texture for reading angle parameters -scalar2_tex_t angle_params_tex; - -//! Texture for reading angle CGCMM S-R parameters -scalar2_tex_t angle_CGCMMsr_tex; // MISSING EPSILON!!! sigma=.x, rcut=.y - -//! Texture for reading angle CGCMM Epsilon-pow/pref parameters -scalar4_tex_t angle_CGCMMepow_tex; // now with EPSILON=.x, pow1=.y, pow2=.z, pref=.w - //! Kernel for calculating CGCMM angle forces on the GPU /*! 
\param d_force Device memory to write computed forces \param d_virial Device memory to write computed virials @@ -124,7 +115,7 @@ extern "C" __global__ void gpu_compute_CGCMM_angle_forces_kernel(Scalar4* d_forc dac = box.minImage(dac); // get the angle parameters (MEM TRANSFER: 8 bytes) - Scalar2 params = texFetchScalar2(d_params, angle_params_tex, cur_angle_type); + Scalar2 params = __ldg(d_params + cur_angle_type); Scalar K = params.x; Scalar t_0 = params.y; @@ -154,14 +145,14 @@ extern "C" __global__ void gpu_compute_CGCMM_angle_forces_kernel(Scalar4* d_forc vac[i] = Scalar(0.0); // get the angle E-S-R parameters (MEM TRANSFER: 12 bytes) - const Scalar2 cgSR = texFetchScalar2(d_CGCMMsr, angle_CGCMMsr_tex, cur_angle_type); + const Scalar2 cgSR = __ldg(d_CGCMMsr + cur_angle_type); Scalar cgsigma = cgSR.x; Scalar cgrcut = cgSR.y; if (rac < cgrcut) { - const Scalar4 cgEPOW = texFetchScalar4(d_CGCMMepow, angle_CGCMMepow_tex, cur_angle_type); + const Scalar4 cgEPOW = __ldg(d_CGCMMepow + cur_angle_type); // get the angle pow/pref parameters (MEM TRANSFER: 12 bytes) Scalar cgeps = cgEPOW.x; @@ -282,8 +273,7 @@ cudaError_t gpu_compute_CGCMM_angle_forces(Scalar4* d_force, Scalar2 *d_CGCMMsr, Scalar4 *d_CGCMMepow, unsigned int n_angle_types, - int block_size, - const unsigned int compute_capability) + int block_size) { assert(d_params); assert(d_CGCMMsr); @@ -306,22 +296,6 @@ cudaError_t gpu_compute_CGCMM_angle_forces(Scalar4* d_force, dim3 grid( (int)ceil((double)N / (double)run_block_size), 1, 1); dim3 threads(run_block_size, 1, 1); - // bind the textures on pre sm 35 arches - if (compute_capability < 350) - { - cudaError_t error = cudaBindTexture(0, angle_params_tex, d_params, sizeof(Scalar2) * n_angle_types); - if (error != cudaSuccess) - return error; - - error = cudaBindTexture(0, angle_CGCMMsr_tex, d_CGCMMsr, sizeof(Scalar2) * n_angle_types); - if (error != cudaSuccess) - return error; - - error = cudaBindTexture(0, angle_CGCMMepow_tex, d_CGCMMepow, sizeof(Scalar4) * n_angle_types); - if (error != cudaSuccess) - return error; - } - // run the kernel gpu_compute_CGCMM_angle_forces_kernel<<< grid, threads>>>(d_force, d_virial, diff --git a/hoomd/cgcmm/CGCMMAngleForceGPU.cuh b/hoomd/cgcmm/CGCMMAngleForceGPU.cuh index f1a8f2942a..e2bdc874d3 100644 --- a/hoomd/cgcmm/CGCMMAngleForceGPU.cuh +++ b/hoomd/cgcmm/CGCMMAngleForceGPU.cuh @@ -30,7 +30,6 @@ cudaError_t gpu_compute_CGCMM_angle_forces(Scalar4* d_force, Scalar2 *d_CGCMMsr, Scalar4 *d_CGCMMepow, unsigned int n_angle_types, - int block_size, - const unsigned int compute_capability); + int block_size); #endif diff --git a/hoomd/cgcmm/CGCMMForceComputeGPU.cc b/hoomd/cgcmm/CGCMMForceComputeGPU.cc index bde66b2037..d039b2c356 100644 --- a/hoomd/cgcmm/CGCMMForceComputeGPU.cc +++ b/hoomd/cgcmm/CGCMMForceComputeGPU.cc @@ -165,9 +165,7 @@ void CGCMMForceComputeGPU::computeForces(unsigned int timestep) this->m_nlist->getNListArray().getPitch(), m_pdata->getNTypes(), m_r_cut * m_r_cut, - m_block_size, - m_exec_conf->getComputeCapability()/10, - m_exec_conf->dev_prop.maxTexture1DLinear); + m_block_size); if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR(); diff --git a/hoomd/cgcmm/CGCMMForceGPU.cu b/hoomd/cgcmm/CGCMMForceGPU.cu index 16714d2120..066f17022c 100644 --- a/hoomd/cgcmm/CGCMMForceGPU.cu +++ b/hoomd/cgcmm/CGCMMForceGPU.cu @@ -13,11 +13,6 @@ \brief Defines GPU kernel code for calculating the Lennard-Jones pair forces. Used by CGCMMForceComputeGPU. */ -//! Texture for reading particle positions -scalar4_tex_t pdata_pos_tex; -//! 
Texture for reading the neighbor list -texture nlist_tex; - //! Kernel for calculating CG-CMM Lennard-Jones forces /*! This kernel is called to calculate the Lennard-Jones forces on all N particles for the CG-CMM model potential. @@ -45,7 +40,6 @@ texture nlist_tex; Each thread will calculate the total force on one particle. The neighborlist is arranged in columns so that reads are fully coalesced when doing this. */ -template __global__ void gpu_compute_cgcmm_forces_kernel(Scalar4* d_force, Scalar* d_virial, const unsigned int virial_pitch, @@ -80,7 +74,7 @@ __global__ void gpu_compute_cgcmm_forces_kernel(Scalar4* d_force, // read in the position of our particle. // (MEM TRANSFER: 16 bytes) - Scalar4 postype = texFetchScalar4(d_pos, pdata_pos_tex, idx); + Scalar4 postype = __ldg(d_pos + idx); Scalar3 pos = make_scalar3(postype.x, postype.y, postype.z); // initialize the force to 0 @@ -92,14 +86,7 @@ __global__ void gpu_compute_cgcmm_forces_kernel(Scalar4* d_force, // prefetch neighbor index unsigned int cur_neigh = 0; unsigned int next_neigh(0); - if (use_gmem_nlist) - { - next_neigh = d_nlist[head_idx]; - } - else - { - next_neigh = texFetchUint(d_nlist, nlist_tex, head_idx); - } + next_neigh = __ldg(d_nlist + head_idx); // loop over neighbors for (int neigh_idx = 0; neigh_idx < n_neigh; neigh_idx++) @@ -107,17 +94,10 @@ __global__ void gpu_compute_cgcmm_forces_kernel(Scalar4* d_force, // read the current neighbor index (MEM TRANSFER: 4 bytes) // prefetch the next value and set the current one cur_neigh = next_neigh; - if (use_gmem_nlist) - { - next_neigh = d_nlist[head_idx + neigh_idx + 1]; - } - else - { - next_neigh = texFetchUint(d_nlist, nlist_tex, head_idx + neigh_idx+1); - } + next_neigh = __ldg(d_nlist + head_idx + neigh_idx+1); // get the neighbor's position (MEM TRANSFER: 16 bytes) - Scalar4 neigh_postype = texFetchScalar4(d_pos, pdata_pos_tex, cur_neigh); + Scalar4 neigh_postype = __ldg(d_pos + cur_neigh); Scalar3 neigh_pos = make_scalar3(neigh_postype.x, neigh_postype.y, neigh_postype.z); // calculate dr (with periodic boundary conditions) @@ -212,9 +192,7 @@ cudaError_t gpu_compute_cgcmm_forces(Scalar4* d_force, const unsigned int size_nlist, const unsigned int coeff_width, const Scalar r_cutsq, - const unsigned int block_size, - const unsigned int compute_capability, - const unsigned int max_tex1d_width) + const unsigned int block_size) { assert(d_coeffs); assert(coeff_width > 0); @@ -223,56 +201,18 @@ cudaError_t gpu_compute_cgcmm_forces(Scalar4* d_force, dim3 grid( (int)ceil((double)N / (double)block_size), 1, 1); dim3 threads(block_size, 1, 1); - // bind the texture - if (compute_capability < 35) - { - pdata_pos_tex.normalized = false; - pdata_pos_tex.filterMode = cudaFilterModePoint; - cudaError_t error = cudaBindTexture(0, pdata_pos_tex, d_pos, sizeof(Scalar4)*N); - if (error != cudaSuccess) - return error; - - if (size_nlist <= max_tex1d_width) - { - nlist_tex.normalized = false; - nlist_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, nlist_tex, d_nlist, sizeof(unsigned int)*size_nlist); - if (error != cudaSuccess) - return error; - } - } - - // run the kernel - if (compute_capability < 35 && size_nlist > max_tex1d_width) - { // fall back to slow global loads when the neighbor list is too big for texture memory - gpu_compute_cgcmm_forces_kernel<1><<< grid, threads, sizeof(Scalar4)*coeff_width*coeff_width >>>(d_force, - d_virial, - virial_pitch, - N, - d_pos, - box, - d_n_neigh, - d_nlist, - d_head_list, - d_coeffs, - coeff_width, - r_cutsq); - } - 
else - { - gpu_compute_cgcmm_forces_kernel<0><<< grid, threads, sizeof(Scalar4)*coeff_width*coeff_width >>>(d_force, - d_virial, - virial_pitch, - N, - d_pos, - box, - d_n_neigh, - d_nlist, - d_head_list, - d_coeffs, - coeff_width, - r_cutsq); - } + gpu_compute_cgcmm_forces_kernel<<< grid, threads, sizeof(Scalar4)*coeff_width*coeff_width >>>(d_force, + d_virial, + virial_pitch, + N, + d_pos, + box, + d_n_neigh, + d_nlist, + d_head_list, + d_coeffs, + coeff_width, + r_cutsq); return cudaSuccess; } diff --git a/hoomd/cgcmm/CGCMMForceGPU.cuh b/hoomd/cgcmm/CGCMMForceGPU.cuh index eac20d6ef7..cd48bbd7e3 100644 --- a/hoomd/cgcmm/CGCMMForceGPU.cuh +++ b/hoomd/cgcmm/CGCMMForceGPU.cuh @@ -29,8 +29,6 @@ cudaError_t gpu_compute_cgcmm_forces(Scalar4* d_force, const unsigned int size_nlist, const unsigned int coeff_width, const Scalar r_cutsq, - const unsigned int block_size, - const unsigned int compute_capability, - const unsigned int max_tex1d_width); + const unsigned int block_size); #endif diff --git a/hoomd/dem/DEM2DForceGPU.cu b/hoomd/dem/DEM2DForceGPU.cu index c455c503f4..ab43f2d79b 100644 --- a/hoomd/dem/DEM2DForceGPU.cu +++ b/hoomd/dem/DEM2DForceGPU.cu @@ -26,12 +26,6 @@ \brief Defines GPU kernel code for calculating conservative DEM pair forces. Used by DEM2DForceComputeGPU. */ -//! Texture for reading particle positions -scalar4_tex_t pdata_pos_tex; -scalar4_tex_t pdata_quat_tex; -scalar_tex_t pdata_diam_tex; -scalar4_tex_t pdata_velocity_tex; - //! Kernel for calculating 2D DEM forces /*! This kernel is called to calculate the DEM forces for all N particles. @@ -171,22 +165,22 @@ __global__ void gpu_compute_dem2d_forces_kernel(const Scalar4 *d_pos, const unsigned int myHead(d_head_list[partIdx]); // fetch position and orientation of this particle - const Scalar4 postype(texFetchScalar4(d_pos,pdata_pos_tex, partIdx)); + const Scalar4 postype(__ldg(d_pos + partIdx)); const vec3 pos_i(postype.x, postype.y, 0); const unsigned int type_i(__scalar_as_int(postype.w)); - const Scalar4 quati(texFetchScalar4(d_quat, pdata_quat_tex, partIdx)); + const Scalar4 quati(__ldg(d_quat + partIdx)); const quat quat_i(quati.x, vec3(quati.y, quati.z, quati.w)); Scalar di = 0.0f; if (Evaluator::needsDiameter()) - di = texFetchScalar(d_diam, pdata_diam_tex, partIdx); + di = __ldg(d_diam + partIdx); else di += 1.0f; //shut up compiler warning. 
Vestigial from HOOMD vec3 vi; if (Evaluator::needsVelocity()) vi = vec3( - texFetchScalar4(d_velocity, pdata_velocity_tex, partIdx)); + __ldg(d_velocity + partIdx)); for(unsigned int featureEpoch(0); featureEpoch < (numShapeVerts[type_i] + blockDim.x - 1)/blockDim.x; ++featureEpoch) @@ -214,7 +208,7 @@ __global__ void gpu_compute_dem2d_forces_kernel(const Scalar4 *d_pos, next_neigh = d_nlist[myHead + neigh_idx + 1]; // grab the position and type of the neighbor - const Scalar4 neigh_postype(texFetchScalar4(d_pos, pdata_pos_tex, cur_neigh)); + const Scalar4 neigh_postype(__ldg(d_pos + cur_neigh)); const unsigned int neigh_type(__scalar_as_int(neigh_postype.w)); const vec3 neigh_pos(neigh_postype.x, neigh_postype.y, 0); @@ -230,20 +224,20 @@ __global__ void gpu_compute_dem2d_forces_kernel(const Scalar4 *d_pos, if (Evaluator::needsDiameter()) { Scalar dj(0); - dj = texFetchScalar(d_diam, pdata_diam_tex, cur_neigh); + dj = __ldg(d_diam + cur_neigh); evaluator.setDiameter(di, dj); } if(evaluator.withinCutoff(rsq,r_cutsq)) { // fetch neighbor's orientation - const Scalar4 neighQuatF(texFetchScalar4(d_quat, pdata_quat_tex, cur_neigh)); + const Scalar4 neighQuatF(__ldg(d_quat + cur_neigh)); const quat neighQuat( neighQuatF.x, vec3(neighQuatF.y, neighQuatF.z, neighQuatF.w)); if (Evaluator::needsVelocity()) { - Scalar4 vj(texFetchScalar4(d_velocity, pdata_velocity_tex, cur_neigh)); + Scalar4 vj(__ldg(d_velocity + cur_neigh)); evaluator.setVelocity(vi - vec3(vj)); } @@ -417,28 +411,6 @@ cudaError_t gpu_compute_dem2d_forces(Scalar4* d_force, dim3 threads(numFeatures, particlesPerBlock, 2); - // bind the textures for position and orientation - pdata_pos_tex.normalized = false; - pdata_pos_tex.filterMode = cudaFilterModePoint; - cudaError_t error = cudaBindTexture(0, pdata_pos_tex, d_pos, sizeof(Scalar4)*(N+n_ghosts)); - if (error != cudaSuccess) - return error; - pdata_quat_tex.normalized = false; - pdata_quat_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, pdata_quat_tex, d_quat, sizeof(Scalar4)*(N+n_ghosts)); - if (error != cudaSuccess) - return error; - pdata_diam_tex.normalized = false; - pdata_diam_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, pdata_diam_tex, d_diam, sizeof(Scalar)*(N+n_ghosts)); - if (error != cudaSuccess) - return error; - pdata_velocity_tex.normalized = false; - pdata_velocity_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, pdata_velocity_tex, d_velocity, sizeof(Scalar4)*(N+n_ghosts)); - if (error != cudaSuccess) - return error; - // Calculate the amount of shared memory required size_t shmSize(vertexCount*sizeof(Real2) + n_shapes*2*sizeof(unsigned int) + particlesPerBlock*(sizeof(Real4) + 6*sizeof(Real))); diff --git a/hoomd/dem/DEM3DForceGPU.cu b/hoomd/dem/DEM3DForceGPU.cu index f905e6d0f4..2d77fa95af 100644 --- a/hoomd/dem/DEM3DForceGPU.cu +++ b/hoomd/dem/DEM3DForceGPU.cu @@ -27,12 +27,6 @@ \brief Defines GPU kernel code for calculating conservative DEM pair forces. Used by DEM3DForceComputeGPU. */ -//! Texture for reading particle positions -scalar4_tex_t pdata_pos_tex; -scalar4_tex_t pdata_quat_tex; -scalar_tex_t pdata_diam_tex; -scalar4_tex_t pdata_velocity_tex; - //! Kernel for calculating 3D DEM forces /*! This kernel is called to calculate the DEM forces for all N particles. 
@@ -234,22 +228,22 @@ __global__ void gpu_compute_dem3d_forces_kernel( const unsigned int myHead(d_head_list[partIdx]); // fetch position and orientation of this particle - const Scalar4 postype(texFetchScalar4(d_pos,pdata_pos_tex, partIdx)); + const Scalar4 postype(__ldg(d_pos + partIdx)); const vec3 pos_i(postype.x, postype.y, postype.z); const unsigned int type_i(__scalar_as_int(postype.w)); - const Scalar4 quati(texFetchScalar4(d_quat, pdata_quat_tex, partIdx)); + const Scalar4 quati(__ldg(d_quat + partIdx)); const quat quat_i(quati.x, vec3(quati.y, quati.z, quati.w)); Scalar di = 0.0f; if (Evaluator::needsDiameter()) - di = texFetchScalar(d_diam, pdata_diam_tex, partIdx); + di = __ldg(d_diam + partIdx); else di += 1.0f; //shut up compiler warning. Vestigial from HOOMD vec3 vi; if (Evaluator::needsVelocity()) vi = vec3( - texFetchScalar4(d_velocity, pdata_velocity_tex, partIdx)); + __ldg(d_velocity + partIdx)); for(unsigned int featureEpoch(0); featureEpoch < (maxFeatures + blockDim.y - 1)/blockDim.y; ++featureEpoch) @@ -283,7 +277,7 @@ __global__ void gpu_compute_dem3d_forces_kernel( next_neigh = d_nlist[myHead + neigh_idx + 1]; // grab the position and type of the neighbor - const Scalar4 neigh_postype(texFetchScalar4(d_pos, pdata_pos_tex, cur_neigh)); + const Scalar4 neigh_postype(__ldg(d_pos + cur_neigh)); const unsigned int type_j(__scalar_as_int(neigh_postype.w)); const vec3 neigh_pos(neigh_postype.x, neigh_postype.y, neigh_postype.z); @@ -299,7 +293,7 @@ __global__ void gpu_compute_dem3d_forces_kernel( Scalar dj(0); if (Evaluator::needsDiameter()) { - dj = texFetchScalar(d_diam, pdata_diam_tex, cur_neigh); + dj = __ldg(d_diam + cur_neigh); evaluator.setDiameter(di, dj); } else @@ -308,13 +302,13 @@ __global__ void gpu_compute_dem3d_forces_kernel( if(evaluator.withinCutoff(rsq, r_cutsq)) { // fetch neighbor's orientation - const Scalar4 neighQuatF(texFetchScalar4(d_quat, pdata_quat_tex, cur_neigh)); + const Scalar4 neighQuatF(__ldg(d_quat + cur_neigh)); const quat neighQuat( neighQuatF.x, vec3(neighQuatF.y, neighQuatF.z, neighQuatF.w)); if (Evaluator::needsVelocity()) { - Scalar4 vj(texFetchScalar4(d_velocity, pdata_velocity_tex, cur_neigh)); + Scalar4 vj(__ldg(d_velocity + cur_neigh)); evaluator.setVelocity(vi - vec3(vj)); } @@ -515,28 +509,6 @@ cudaError_t gpu_compute_dem3d_forces( dim3 threads(particlesPerBlock, numFeatures, 1); - // bind the textures for position and orientation - pdata_pos_tex.normalized = false; - pdata_pos_tex.filterMode = cudaFilterModePoint; - cudaError_t error = cudaBindTexture(0, pdata_pos_tex, d_pos, sizeof(Scalar4)*(N+n_ghosts)); - if (error != cudaSuccess) - return error; - pdata_quat_tex.normalized = false; - pdata_quat_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, pdata_quat_tex, d_quat, sizeof(Scalar4)*(N+n_ghosts)); - if (error != cudaSuccess) - return error; - pdata_diam_tex.normalized = false; - pdata_diam_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, pdata_diam_tex, d_diam, sizeof(Scalar)*(N+n_ghosts)); - if (error != cudaSuccess) - return error; - pdata_velocity_tex.normalized = false; - pdata_velocity_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, pdata_velocity_tex, d_velocity, sizeof(Scalar4)*(N+n_ghosts)); - if (error != cudaSuccess) - return error; - // Calculate the amount of shared memory required const size_t shmSize(2*particlesPerBlock*sizeof(Real4) + 6*particlesPerBlock*sizeof(Real) + // forces, torques, virials per-particle 2*numFaces*sizeof(unsigned int) + // 
face->next face, face->first vertex in face diff --git a/hoomd/hpmc/ComputeFreeVolumeGPU.cuh b/hoomd/hpmc/ComputeFreeVolumeGPU.cuh index db7a6126bc..0be95ecb6a 100644 --- a/hoomd/hpmc/ComputeFreeVolumeGPU.cuh +++ b/hoomd/hpmc/ComputeFreeVolumeGPU.cuh @@ -136,10 +136,6 @@ template< class Shape > cudaError_t gpu_hpmc_free_volume(const hpmc_free_volume_args_t &args, const typename Shape::param_type *d_params); #ifdef NVCC -//! Texture for reading postype -scalar4_tex_t free_volume_postype_tex; -//! Texture for reading orientation -scalar4_tex_t free_volume_orientation_tex; //! Compute the cell that a particle sits in __device__ inline unsigned int compute_cell_idx(const Scalar3 p, @@ -329,18 +325,14 @@ __global__ void gpu_hpmc_free_volume_kernel(unsigned int n_sample, if (local_k < excell_size) { // read in position, and orientation of neighboring particle - #if ( __CUDA_ARCH__ > 300) unsigned int j = __ldg(&d_excell_idx[excli(local_k, my_cell)]); - #else - unsigned int j = d_excell_idx[excli(local_k, my_cell)]; - #endif - Scalar4 postype_j = texFetchScalar4(d_postype, free_volume_postype_tex, j); + Scalar4 postype_j = __ldg(d_postype + j); Scalar4 orientation_j = make_scalar4(1,0,0,0); unsigned int typ_j = __scalar_as_int(postype_j.w); Shape shape_j(quat(orientation_j), s_params[typ_j]); if (shape_j.hasOrientation()) - shape_j.orientation = quat(texFetchScalar4(d_orientation, free_volume_orientation_tex, j)); + shape_j.orientation = quat(__ldg(d_orientation + j)); // put particle j into the coordinate system of particle i vec3 r_ij = vec3(postype_j) - pos_i; @@ -406,20 +398,6 @@ cudaError_t gpu_hpmc_free_volume(const hpmc_free_volume_args_t& args, const type assert(args.group_size <= 32); // note, really should be warp size of the device assert(args.block_size%(args.stride*args.group_size)==0); - - // bind the textures - free_volume_postype_tex.normalized = false; - free_volume_postype_tex.filterMode = cudaFilterModePoint; - cudaError_t error = cudaBindTexture(0, free_volume_postype_tex, args.d_postype, sizeof(Scalar4)*args.max_n); - if (error != cudaSuccess) - return error; - - free_volume_orientation_tex.normalized = false; - free_volume_orientation_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, free_volume_orientation_tex, args.d_orientation, sizeof(Scalar4)*args.max_n); - if (error != cudaSuccess) - return error; - // reset counters cudaMemsetAsync(args.d_n_overlap_all,0, sizeof(unsigned int), args.stream); diff --git a/hoomd/hpmc/IntegratorHPMCMonoGPU.cu b/hoomd/hpmc/IntegratorHPMCMonoGPU.cu index 61baf204d5..132876b31a 100644 --- a/hoomd/hpmc/IntegratorHPMCMonoGPU.cu +++ b/hoomd/hpmc/IntegratorHPMCMonoGPU.cu @@ -59,7 +59,7 @@ __global__ void gpu_hpmc_excell_kernel(unsigned int *d_excell_idx, for (unsigned int k = 0; k < neigh_cell_size; k++) { // read in the index of the new particle to add to our cell - unsigned int new_idx = tex1Dfetch(cell_idx_tex, cli(k, neigh_cell)); + unsigned int new_idx = __ldg(d_cell_idx + cli(k, neigh_cell)); d_excell_idx[excli(my_cell_size, my_cell)] = new_idx; my_cell_size++; } @@ -100,13 +100,6 @@ cudaError_t gpu_hpmc_excell(unsigned int *d_excell_idx, dim3 threads(min(block_size, (unsigned int)max_block_size), 1, 1); dim3 grid(ci.getNumElements() / block_size + 1, 1, 1); - // bind the textures - cell_idx_tex.normalized = false; - cell_idx_tex.filterMode = cudaFilterModePoint; - cudaError_t error = cudaBindTexture(0, cell_idx_tex, d_cell_idx, sizeof(unsigned int)*cli.getNumElements()); - if (error != cudaSuccess) - return error; - 
gpu_hpmc_excell_kernel<<>>(d_excell_idx, d_excell_size, excli, diff --git a/hoomd/hpmc/IntegratorHPMCMonoGPU.cuh b/hoomd/hpmc/IntegratorHPMCMonoGPU.cuh index 8762ec109b..d6bc445182 100644 --- a/hoomd/hpmc/IntegratorHPMCMonoGPU.cuh +++ b/hoomd/hpmc/IntegratorHPMCMonoGPU.cuh @@ -184,13 +184,6 @@ cudaError_t gpu_hpmc_shift(Scalar4 *d_postype, * Definition of function templates and templated GPU kernels */ -//! Texture for reading postype -scalar4_tex_t postype_tex; -//! Texture for reading orientation -scalar4_tex_t orientation_tex; -//! Texture for reading cell index data -texture cell_idx_tex; - //! Device function to compute the cell that a particle sits in __device__ inline unsigned int computeParticleCell(const Scalar3& p, const BoxDim& box, @@ -279,9 +272,6 @@ __device__ inline unsigned int computeParticleCell(const Scalar3& p, - threadIdx.x is the offset within the current group - blockIdx.x runs enough blocks so that all active cells are covered - **Possible enhancements** - - Use __ldg and not tex1Dfetch on sm35 - \ingroup hpmc_kernels */ template< class Shape > @@ -471,17 +461,17 @@ __global__ void gpu_hpmc_mpmc_kernel(Scalar4 *d_postype, // select one of the particles randomly from the cell unsigned int my_cell_offset = hoomd::UniformIntDistribution(my_cell_size-1)(rng); - i = tex1Dfetch(cell_idx_tex, cli(my_cell_offset, my_cell)); + i = __ldg(d_cell_idx + cli(my_cell_offset, my_cell)); // read in the position and orientation of our particle. - Scalar4 postype_i = texFetchScalar4(d_postype, postype_tex, i); + Scalar4 postype_i = __ldg(d_postype + i); Scalar4 orientation_i = make_scalar4(1,0,0,0); unsigned int typ_i = __scalar_as_int(postype_i.w); Shape shape_i(quat(orientation_i), s_params[typ_i]); if (shape_i.hasOrientation()) - orientation_i = texFetchScalar4(d_orientation, orientation_tex, i); + orientation_i = __ldg(d_orientation + i); shape_i.orientation = quat(orientation_i); @@ -549,11 +539,7 @@ __global__ void gpu_hpmc_mpmc_kernel(Scalar4 *d_postype, unsigned int j, next_j = 0; if (k < excell_size) { - #if (__CUDA_ARCH__ > 300) next_j = __ldg(&d_excell_idx[excli(k, my_cell)]); - #else - next_j = d_excell_idx[excli(k, my_cell)]; - #endif } // add to the queue as long as the queue is not full, and we have not yet reached the end of our own list @@ -577,15 +563,11 @@ __global__ void gpu_hpmc_mpmc_kernel(Scalar4 *d_postype, if (k < excell_size) { - #if (__CUDA_ARCH__ > 300) next_j = __ldg(&d_excell_idx[excli(k, my_cell)]); - #else - next_j = d_excell_idx[excli(k, my_cell)]; - #endif } // read in position, and orientation of neighboring particle - postype_j = texFetchScalar4(d_postype, postype_tex, j); + postype_j = __ldg(d_postype + j); Shape shape_j(quat(orientation_j), s_params[__scalar_as_int(postype_j.w)]); // put particle j into the coordinate system of particle i @@ -645,12 +627,12 @@ __global__ void gpu_hpmc_mpmc_kernel(Scalar4 *d_postype, Shape shape_i(quat(s_orientation_group[check_group]), s_params[type_i]); // build shape j from global memory - postype_j = texFetchScalar4(d_postype, postype_tex, check_j); + postype_j = __ldg(d_postype + check_j); orientation_j = make_scalar4(1,0,0,0); unsigned int type_j = __scalar_as_int(postype_j.w); Shape shape_j(quat(orientation_j), s_params[type_j]); if (shape_j.hasOrientation()) - shape_j.orientation = quat(texFetchScalar4(d_orientation, orientation_tex, check_j)); + shape_j.orientation = quat(__ldg(d_orientation + check_j)); // put particle j into the coordinate system of particle i r_ij = vec3(postype_j) - vec3(pos_i); @@ 
-881,28 +863,6 @@ cudaError_t gpu_hpmc_update(const hpmc_args_t& args, const typename Shape::param dim3 grid( args.n_active_cells / n_groups + 1, 1, 1); - // bind the textures - postype_tex.normalized = false; - postype_tex.filterMode = cudaFilterModePoint; - cudaError_t error = cudaBindTexture(0, postype_tex, args.d_postype, sizeof(Scalar4)*args.max_n); - if (error != cudaSuccess) - return error; - - if (args.has_orientation) - { - orientation_tex.normalized = false; - orientation_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, orientation_tex, args.d_orientation, sizeof(Scalar4)*args.max_n); - if (error != cudaSuccess) - return error; - } - - cell_idx_tex.normalized = false; - cell_idx_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, cell_idx_tex, args.d_cell_idx, sizeof(Scalar4)*args.cli.getNumElements()); - if (error != cudaSuccess) - return error; - gpu_hpmc_mpmc_kernel<<>>(args.d_postype, args.d_orientation, args.d_counters, diff --git a/hoomd/hpmc/IntegratorHPMCMonoImplicitGPU.cuh b/hoomd/hpmc/IntegratorHPMCMonoImplicitGPU.cuh index 77c492de43..9deb61359e 100644 --- a/hoomd/hpmc/IntegratorHPMCMonoImplicitGPU.cuh +++ b/hoomd/hpmc/IntegratorHPMCMonoImplicitGPU.cuh @@ -231,18 +231,6 @@ cudaError_t gpu_hpmc_implicit_accept_reject(const hpmc_implicit_args_t &args, co * Definition of function templates and templated GPU kernels */ -//! Texture for reading postype -scalar4_tex_t implicit_postype_tex; -//! Texture for reading orientation -scalar4_tex_t implicit_orientation_tex; -//! Texture for reading postype -scalar4_tex_t implicit_postype_old_tex; -//! Texture for reading orientation -scalar4_tex_t implicit_orientation_old_tex; - -//! Texture for reading cell index data -texture implicit_cell_idx_tex; - //! HPMC implicit count overlaps kernel /*! 
\param d_postype Particle positions and types by index \param d_orientation Particle orientation @@ -425,7 +413,7 @@ __global__ void gpu_hpmc_implicit_count_overlaps_kernel(Scalar4 *d_postype, if (active) { // load updated particle position - postype_i = texFetchScalar4(d_postype, implicit_postype_tex, idx_i); + postype_i = __ldg(d_postype + idx_i); type_i = __scalar_as_int(postype_i.w); pos_i = vec3(postype_i); } @@ -458,7 +446,7 @@ __global__ void gpu_hpmc_implicit_count_overlaps_kernel(Scalar4 *d_postype, Shape shape_i(quat(orientation_i), s_params[__scalar_as_int(postype_i.w)]); if (shape_i.hasOrientation()) { - orientation_i = texFetchScalar4(d_orientation, implicit_orientation_tex, idx_i); + orientation_i = __ldg(d_orientation + idx_i); shape_i.orientation = quat(orientation_i); } @@ -528,18 +516,14 @@ __global__ void gpu_hpmc_implicit_count_overlaps_kernel(Scalar4 *d_postype, Scalar4 postype_j; do { // read in position, and orientation of neighboring particle - #if (__CUDA_ARCH__ > 300) j = __ldg(&d_excell_idx[excli(local_k, my_cell)]); - #else - j = d_excell_idx[excli(local_k, my_cell)]; - #endif // check against neighbor - postype_j = texFetchScalar4(d_postype_old, implicit_postype_old_tex, j); + postype_j = __ldg(d_postype_old + j); Shape shape_j(quat(), s_params[__scalar_as_int(postype_j.w)]); if (shape_j.hasOrientation()) { - shape_j.orientation = quat(texFetchScalar4(d_orientation_old, implicit_orientation_old_tex, j)); + shape_j.orientation = quat(__ldg(d_orientation_old + j)); } // test depletant in sphere around new particle position @@ -565,7 +549,7 @@ __global__ void gpu_hpmc_implicit_count_overlaps_kernel(Scalar4 *d_postype, Shape shape_j(quat(), s_params[typ_j]); if (shape_j.hasOrientation()) { - shape_j.orientation = quat(texFetchScalar4(d_orientation_old, implicit_orientation_old_tex, j)); + shape_j.orientation = quat(__ldg(d_orientation_old + j)); } if (s_check_overlaps[overlap_idx(depletant_type, typ_j)] @@ -833,12 +817,12 @@ __global__ void gpu_hpmc_implicit_reinsert_kernel(Scalar4 *d_postype, if (active) { // load updated particle position - postype_i = texFetchScalar4(d_postype, implicit_postype_tex, idx_i); + postype_i = __ldg(d_postype + idx_i); Shape shape_i(quat(), s_params[__scalar_as_int(postype_i.w)]); if (shape_i.hasOrientation()) { - orientation_i = texFetchScalar4(d_orientation, implicit_orientation_tex, idx_i); + orientation_i = __ldg(d_orientation + idx_i); } } @@ -851,7 +835,7 @@ __global__ void gpu_hpmc_implicit_reinsert_kernel(Scalar4 *d_postype, vec3 pos_i_old; if (active) { - pos_i_old = vec3(texFetchScalar4(d_postype_old, implicit_postype_old_tex, idx_i)); + pos_i_old = vec3(__ldg(d_postype_old + idx_i)); } unsigned int excell_size; @@ -984,7 +968,7 @@ __global__ void gpu_hpmc_implicit_reinsert_kernel(Scalar4 *d_postype, vec3 r_ij = pos_i_old - pos_test; if (shape_i.hasOrientation()) { - shape_i.orientation = quat(texFetchScalar4(d_orientation_old, implicit_orientation_old_tex, idx_i)); + shape_i.orientation = quat(__ldg(d_orientation_old + idx_i)); } // if depletant can be inserted in excluded volume at old (new) position, success @@ -1056,11 +1040,7 @@ __global__ void gpu_hpmc_implicit_reinsert_kernel(Scalar4 *d_postype, unsigned int j, next_j = 0; if (k < excell_size) { - #if (__CUDA_ARCH__ > 300) next_j = __ldg(&d_excell_idx[excli(k, my_cell)]); - #else - next_j = d_excell_idx[excli(k, my_cell)]; - #endif } // add to the queue as long as the queue is not full, and we have not yet reached the end of our own list @@ -1082,15 +1062,11 @@ 
__global__ void gpu_hpmc_implicit_reinsert_kernel(Scalar4 *d_postype, if (k < excell_size) { - #if (__CUDA_ARCH__ > 300) next_j = __ldg(&d_excell_idx[excli(k, my_cell)]); - #else - next_j = d_excell_idx[excli(k, my_cell)]; - #endif } // read in position, and orientation of neighboring particle - postype_j = texFetchScalar4(d_postype_old, implicit_postype_old_tex, j); + postype_j = __ldg(d_postype_old + j); Shape shape_j(quat(), s_params[__scalar_as_int(postype_j.w)]); // put particle j into the coordinate system of depletant @@ -1148,12 +1124,12 @@ __global__ void gpu_hpmc_implicit_reinsert_kernel(Scalar4 *d_postype, Shape shape_i(quat(s_orientation_group[check_group]), s_params[depletant_type]); // build shape j from global memory - postype_j = texFetchScalar4(d_postype_old, implicit_postype_old_tex, check_j); + postype_j = __ldg(d_postype_old + check_j); orientation_j = make_scalar4(1,0,0,0); unsigned int typ_j = __scalar_as_int(postype_j.w); Shape shape_j(quat(orientation_j), s_params[typ_j]); if (shape_j.hasOrientation()) - shape_j.orientation = quat(texFetchScalar4(d_orientation_old, implicit_orientation_old_tex, check_j)); + shape_j.orientation = quat(__ldg(d_orientation_old + check_j)); // put particle j into the coordinate system of particle i r_ij = vec3(postype_j) - vec3(pos_i); @@ -1409,40 +1385,6 @@ struct CountOverlapsKernelLauncher n_active_cells = args.n_active_cells; } - // bind the textures - implicit_postype_tex.normalized = false; - implicit_postype_tex.filterMode = cudaFilterModePoint; - cudaError_t error = cudaBindTexture(0, implicit_postype_tex, args.d_postype, sizeof(Scalar4)*args.max_n); - if (error != cudaSuccess) - return error; - - implicit_postype_old_tex.normalized = false; - implicit_postype_old_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, implicit_postype_old_tex, args.d_postype_old, sizeof(Scalar4)*args.max_n); - if (error != cudaSuccess) - return error; - - if (args.has_orientation) - { - implicit_orientation_tex.normalized = false; - implicit_orientation_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, implicit_orientation_tex, args.d_orientation, sizeof(Scalar4)*args.max_n); - if (error != cudaSuccess) - return error; - - implicit_orientation_old_tex.normalized = false; - implicit_orientation_old_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, implicit_orientation_old_tex, args.d_orientation_old, sizeof(Scalar4)*args.max_n); - if (error != cudaSuccess) - return error; - } - - implicit_cell_idx_tex.normalized = false; - implicit_cell_idx_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, implicit_cell_idx_tex, args.d_cell_idx, sizeof(unsigned int)*args.cli.getNumElements()); - if (error != cudaSuccess) - return error; - unsigned int shared_bytes = args.num_types * (sizeof(typename Shape::param_type)) + args.overlap_idx.getNumElements() * sizeof(unsigned int); diff --git a/hoomd/hpmc/IntegratorHPMCMonoImplicitNewGPU.cuh b/hoomd/hpmc/IntegratorHPMCMonoImplicitNewGPU.cuh index 78666a505d..c75c49452a 100644 --- a/hoomd/hpmc/IntegratorHPMCMonoImplicitNewGPU.cuh +++ b/hoomd/hpmc/IntegratorHPMCMonoImplicitNewGPU.cuh @@ -188,15 +188,6 @@ cudaError_t gpu_hpmc_implicit_accept_reject_new(const hpmc_implicit_args_new_t & * Definition of function templates and templated GPU kernels */ -//! Texture for reading postype -scalar4_tex_t depletants_postype_tex; -//! Texture for reading orientation -scalar4_tex_t depletants_orientation_tex; -//! 
Texture for reading postype -scalar4_tex_t depletants_postype_old_tex; -//! Texture for reading orientation -scalar4_tex_t depletants_orientation_old_tex; - template< class Shape > __global__ void gpu_hpmc_insert_depletants_queue_kernel(Scalar4 *d_postype, Scalar4 *d_orientation, @@ -347,7 +338,7 @@ __global__ void gpu_hpmc_insert_depletants_queue_kernel(Scalar4 *d_postype, if (i == UINT_MAX || !d_active_cell_accept[active_cell_idx]) return; // load updated particle position - Scalar4 postype_i = texFetchScalar4(d_postype, depletants_postype_tex, i); + Scalar4 postype_i = __ldg(d_postype + i); unsigned int type_i = __scalar_as_int(postype_i.w); curandState_t local_state; @@ -403,7 +394,7 @@ __global__ void gpu_hpmc_insert_depletants_queue_kernel(Scalar4 *d_postype, Scalar r = Scalar(0.5)*d_max*fast::pow(r3,Scalar(1.0/3.0)); // test depletant position around old configuration - Scalar4 postype_i_old = texFetchScalar4(d_postype_old, depletants_postype_old_tex, i); + Scalar4 postype_i_old = __ldg(d_postype_old + i); vec3 pos_test = vec3(postype_i_old)+r*n; if (shape_test.hasOrientation()) @@ -418,7 +409,7 @@ __global__ void gpu_hpmc_insert_depletants_queue_kernel(Scalar4 *d_postype, Shape shape_i(quat(orientation_i), s_params[type_i]); if (shape_i.hasOrientation()) { - orientation_i = texFetchScalar4(d_orientation, depletants_orientation_tex, i); + orientation_i = __ldg(d_orientation + i); shape_i.orientation = quat(orientation_i); } @@ -450,7 +441,7 @@ __global__ void gpu_hpmc_insert_depletants_queue_kernel(Scalar4 *d_postype, Shape shape_i_old(quat(quat(orientation_i_old)), d_params[type_i]); if (shape_i_old.hasOrientation()) { - orientation_i_old = texFetchScalar4(d_orientation_old, depletants_orientation_old_tex, i); + orientation_i_old = __ldg(d_orientation_old + i); shape_i_old.orientation = quat(orientation_i_old); } @@ -508,11 +499,7 @@ __global__ void gpu_hpmc_insert_depletants_queue_kernel(Scalar4 *d_postype, unsigned int j, next_j = 0; if (k < excell_size) { - #if (__CUDA_ARCH__ > 300) next_j = __ldg(&d_excell_idx[excli(k, my_cell)]); - #else - next_j = d_excell_idx[excli(k, my_cell)]; - #endif } // add to the queue as long as the queue is not full, and we have not yet reached the end of our own list @@ -536,15 +523,11 @@ __global__ void gpu_hpmc_insert_depletants_queue_kernel(Scalar4 *d_postype, if (k < excell_size) { - #if (__CUDA_ARCH__ > 300) next_j = __ldg(&d_excell_idx[excli(k, my_cell)]); - #else - next_j = d_excell_idx[excli(k, my_cell)]; - #endif } // read in position, and orientation of neighboring particle - postype_j = texFetchScalar4(d_postype, depletants_postype_tex, j); + postype_j = __ldg(d_postype + j); unsigned int type_j = __scalar_as_int(postype_j.w); Shape shape_j(quat(orientation_j), s_params[type_j]); @@ -603,12 +586,12 @@ __global__ void gpu_hpmc_insert_depletants_queue_kernel(Scalar4 *d_postype, Shape shape_test(quat(s_orientation_group[check_group]), s_params[depletant_type]); // build shape j from global memory - postype_j = texFetchScalar4(d_postype, depletants_postype_tex, check_j); + postype_j = __ldg(d_postype + check_j); orientation_j = make_scalar4(1,0,0,0); unsigned int type_j = __scalar_as_int(postype_j.w); Shape shape_j(quat(orientation_j), s_params[type_j]); if (shape_j.hasOrientation()) - shape_j.orientation = quat(texFetchScalar4(d_orientation, depletants_orientation_tex, check_j)); + shape_j.orientation = quat(__ldg(d_orientation + check_j)); // put particle j into the coordinate system of particle i r_ij = vec3(postype_j) - vec3(pos_test); 
@@ -813,34 +796,6 @@ cudaError_t gpu_hpmc_insert_depletants_queue(const hpmc_implicit_args_new_t& arg // 1 block per active cell dim3 grid( args.n_active_cells, 1, 1); - // bind the textures - depletants_postype_tex.normalized = false; - depletants_postype_tex.filterMode = cudaFilterModePoint; - cudaError_t error = cudaBindTexture(0, depletants_postype_tex, args.d_postype, sizeof(Scalar4)*args.max_n); - if (error != cudaSuccess) - return error; - - depletants_postype_old_tex.normalized = false; - depletants_postype_old_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, depletants_postype_old_tex, args.d_postype_old, sizeof(Scalar4)*args.max_n); - if (error != cudaSuccess) - return error; - - if (args.has_orientation) - { - depletants_orientation_tex.normalized = false; - depletants_orientation_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, depletants_orientation_tex, args.d_orientation, sizeof(Scalar4)*args.max_n); - if (error != cudaSuccess) - return error; - - depletants_orientation_old_tex.normalized = false; - depletants_orientation_old_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, depletants_orientation_old_tex, args.d_orientation_old, sizeof(Scalar4)*args.max_n); - if (error != cudaSuccess) - return error; - } - // reset counters cudaMemsetAsync(args.d_overlap_cell,0, sizeof(unsigned int)*args.n_active_cells, args.stream); diff --git a/hoomd/md/AnisoPotentialPairGPU.cuh b/hoomd/md/AnisoPotentialPairGPU.cuh index d726f0e2b6..b0a8c0c738 100644 --- a/hoomd/md/AnisoPotentialPairGPU.cuh +++ b/hoomd/md/AnisoPotentialPairGPU.cuh @@ -118,17 +118,6 @@ struct a_pair_args_t }; #ifdef NVCC -//! Texture for reading particle positions -scalar4_tex_t aniso_pdata_pos_tex; - -//! Texture for reading particle quaternions -scalar4_tex_t aniso_pdata_quat_tex; - -//! Texture for reading particle diameters -scalar_tex_t aniso_pdata_diam_tex; - -//! Texture for reading particle charges -scalar_tex_t aniso_pdata_charge_tex; //! Kernel for calculating pair forces /*! This kernel is called to calculate the pair forces on all N particles. Actual evaluation of the potentials and @@ -223,18 +212,18 @@ __global__ void gpu_compute_pair_aniso_forces_kernel(Scalar4 *d_force, // read in the position of our particle. 
Texture reads of Scalar4's are faster than global reads on compute 1.0 hardware // (MEM TRANSFER: 16 bytes) - Scalar4 postypei = texFetchScalar4(d_pos, aniso_pdata_pos_tex, idx); + Scalar4 postypei = __ldg(d_pos + idx); Scalar3 posi = make_scalar3(postypei.x, postypei.y, postypei.z); - Scalar4 quati = texFetchScalar4(d_orientation,aniso_pdata_quat_tex, idx); + Scalar4 quati = __ldg(d_orientation + idx); Scalar di; if (evaluator::needsDiameter()) - di = texFetchScalar(d_diameter, aniso_pdata_diam_tex, idx); + di = __ldg(d_diameter + idx); else di += 1.0f; // shut up compiler warning Scalar qi; if (evaluator::needsCharge()) - qi = texFetchScalar(d_charge, aniso_pdata_charge_tex, idx); + qi = __ldg(d_charge + idx); else qi += 1.0f; // shut up compiler warning @@ -265,19 +254,19 @@ __global__ void gpu_compute_pair_aniso_forces_kernel(Scalar4 *d_force, next_j = d_nlist[myHead + neigh_idx + tpp]; // get the neighbor's position (MEM TRANSFER: 16 bytes) - Scalar4 postypej = texFetchScalar4(d_pos, aniso_pdata_pos_tex, cur_j); + Scalar4 postypej = __ldg(d_pos + cur_j); Scalar3 posj = make_scalar3(postypej.x, postypej.y, postypej.z); - Scalar4 quatj = texFetchScalar4(d_orientation, aniso_pdata_quat_tex, cur_j); + Scalar4 quatj = __ldg(d_orientation + cur_j); Scalar dj = 0.0f; if (evaluator::needsDiameter()) - dj = texFetchScalar(d_diameter, aniso_pdata_diam_tex, cur_j); + dj = __ldg(d_diameter + cur_j); else dj += 1.0f; // shut up compiler warning Scalar qj = 0.0f; if (evaluator::needsCharge()) - qj = texFetchScalar(d_charge, aniso_pdata_charge_tex, cur_j); + qj = __ldg(d_charge + cur_j); else qj += 1.0f; // shut up compiler warning @@ -413,28 +402,6 @@ int aniso_get_compute_capability(T func) return attr.binaryVersion; } -void gpu_pair_aniso_force_bind_textures(const a_pair_args_t pair_args) - { - // bind the position texture - aniso_pdata_pos_tex.normalized = false; - aniso_pdata_pos_tex.filterMode = cudaFilterModePoint; - cudaBindTexture(0, aniso_pdata_pos_tex, pair_args.d_pos, sizeof(Scalar4)*pair_args.n_max); - - // bind the position texture - aniso_pdata_quat_tex.normalized = false; - aniso_pdata_quat_tex.filterMode = cudaFilterModePoint; - cudaBindTexture(0, aniso_pdata_quat_tex, pair_args.d_orientation, sizeof(Scalar4)*pair_args.n_max); - - // bind the diameter texture - aniso_pdata_diam_tex.normalized = false; - aniso_pdata_diam_tex.filterMode = cudaFilterModePoint; - cudaBindTexture(0, aniso_pdata_diam_tex, pair_args.d_diameter, sizeof(Scalar) * pair_args.n_max); - - aniso_pdata_charge_tex.normalized = false; - aniso_pdata_charge_tex.filterMode = cudaFilterModePoint; - cudaBindTexture(0, aniso_pdata_charge_tex, pair_args.d_charge, sizeof(Scalar) * pair_args.n_max); - } - //! Kernel driver that computes lj forces on the GPU for LJForceComputeGPU /*! \param pair_args Other arguments to pass onto the kernel \param d_params Parameters for the potential, stored per type pair @@ -470,13 +437,8 @@ cudaError_t gpu_compute_pair_aniso_forces(const a_pair_args_t& pair_args, case 0: { static unsigned int max_block_size = UINT_MAX; - static unsigned int sm = UINT_MAX; if (max_block_size == UINT_MAX) max_block_size = aniso_get_max_block_size(gpu_compute_pair_aniso_forces_kernel); - if (sm == UINT_MAX) - sm = aniso_get_compute_capability(gpu_compute_pair_aniso_forces_kernel); - - if (sm < 35) gpu_pair_aniso_force_bind_textures(pair_args); block_size = block_size < max_block_size ? 
block_size : max_block_size; dim3 grid(nwork / (block_size/tpp) + 1, 1, 1); @@ -507,13 +469,8 @@ cudaError_t gpu_compute_pair_aniso_forces(const a_pair_args_t& pair_args, case 1: { static unsigned int max_block_size = UINT_MAX; - static unsigned int sm = UINT_MAX; if (max_block_size == UINT_MAX) max_block_size = aniso_get_max_block_size(gpu_compute_pair_aniso_forces_kernel); - if (sm == UINT_MAX) - sm = aniso_get_compute_capability(gpu_compute_pair_aniso_forces_kernel); - - if (sm < 35) gpu_pair_aniso_force_bind_textures(pair_args); block_size = block_size < max_block_size ? block_size : max_block_size; dim3 grid(nwork / (block_size/tpp) + 1, 1, 1); @@ -552,13 +509,9 @@ cudaError_t gpu_compute_pair_aniso_forces(const a_pair_args_t& pair_args, case 0: { static unsigned int max_block_size = UINT_MAX; - static unsigned int sm = UINT_MAX; if (max_block_size == UINT_MAX) max_block_size = aniso_get_max_block_size(gpu_compute_pair_aniso_forces_kernel); - if (sm == UINT_MAX) - sm = aniso_get_compute_capability(gpu_compute_pair_aniso_forces_kernel); - if (sm < 35) gpu_pair_aniso_force_bind_textures(pair_args); block_size = block_size < max_block_size ? block_size : max_block_size; dim3 grid(nwork / (block_size/tpp) + 1, 1, 1); @@ -589,21 +542,11 @@ cudaError_t gpu_compute_pair_aniso_forces(const a_pair_args_t& pair_args, case 1: { static unsigned int max_block_size = UINT_MAX; - static unsigned int sm = UINT_MAX; if (max_block_size == UINT_MAX) max_block_size = aniso_get_max_block_size(gpu_compute_pair_aniso_forces_kernel); - if (sm == UINT_MAX) - sm = aniso_get_compute_capability(gpu_compute_pair_aniso_forces_kernel); - - if (sm < 35) gpu_pair_aniso_force_bind_textures(pair_args); block_size = block_size < max_block_size ? block_size : max_block_size; dim3 grid(nwork / (block_size/tpp) + 1, 1, 1); - if (sm < 30 && grid.x > 65535) - { - grid.y = grid.x/65535 + 1; - grid.x = 65535; - } shared_bytes += sizeof(Scalar)*block_size; diff --git a/hoomd/md/BondTablePotentialGPU.cc b/hoomd/md/BondTablePotentialGPU.cc index e56d1f560c..cc3e25ba3e 100644 --- a/hoomd/md/BondTablePotentialGPU.cc +++ b/hoomd/md/BondTablePotentialGPU.cc @@ -93,8 +93,7 @@ void BondTablePotentialGPU::computeForces(unsigned int timestep) m_table_width, m_table_value, d_flags.data, - m_tuner->getParam(), - m_exec_conf->getComputeCapability()); + m_tuner->getParam()); } diff --git a/hoomd/md/BondTablePotentialGPU.cu b/hoomd/md/BondTablePotentialGPU.cu index 538f209e72..dcade60489 100644 --- a/hoomd/md/BondTablePotentialGPU.cu +++ b/hoomd/md/BondTablePotentialGPU.cu @@ -14,10 +14,6 @@ \brief Defines GPU kernel code for calculating the table bond forces. Used by BondTablePotentialGPU. */ - -//! Texture for reading table values -scalar2_tex_t tables_tex; - /*! This kernel is called to calculate the table pair forces on all N particles \param d_force Device memory to write computed forces @@ -35,10 +31,6 @@ scalar2_tex_t tables_tex; \param d_flags Flag allocated on the device for use in checking for bonds that cannot be evaluated See BondTablePotential for information on the memory layout. - - \b Details: - * Table entries are read from tables_tex. Note that currently this is bound to a 1D memory region. Performance tests - at a later date may result in this changing. 
*/ __global__ void gpu_compute_bondtable_forces_kernel(Scalar4* d_force, Scalar* d_virial, @@ -124,8 +116,8 @@ __global__ void gpu_compute_bondtable_forces_kernel(Scalar4* d_force, // compute index into the table and read in values unsigned int value_i = floor(value_f); - Scalar2 VF0 = texFetchScalar2(d_tables, tables_tex, table_value(value_i, cur_bond_type)); - Scalar2 VF1 = texFetchScalar2(d_tables, tables_tex, table_value(value_i+1, cur_bond_type)); + Scalar2 VF0 = __ldg(d_tables + table_value(value_i, cur_bond_type)); + Scalar2 VF1 = __ldg(d_tables + table_value(value_i+1, cur_bond_type)); // unpack the data Scalar V0 = VF0.x; Scalar V1 = VF1.x; @@ -190,7 +182,6 @@ __global__ void gpu_compute_bondtable_forces_kernel(Scalar4* d_force, \param d_flags flags on the device - a 1 will be written if evaluation of forces failed for any bond \param block_size Block size at which to run the kernel - \param compute_capability Compute capability of the execution device (200, 3000, 350, ...) \note This is just a kernel driver. See gpu_compute_bondtable_forces_kernel for full documentation. */ @@ -209,8 +200,7 @@ cudaError_t gpu_compute_bondtable_forces(Scalar4* d_force, const unsigned int table_width, const Index2D &table_value, unsigned int *d_flags, - const unsigned int block_size, - const unsigned int compute_capability) + const unsigned int block_size) { assert(d_params); assert(d_tables); @@ -231,16 +221,6 @@ cudaError_t gpu_compute_bondtable_forces(Scalar4* d_force, dim3 grid( N / run_block_size + 1, 1, 1); dim3 threads(run_block_size, 1, 1); - // bind the tables texture only on pre sm 35 arches - if (compute_capability < 350) - { - tables_tex.normalized = false; - tables_tex.filterMode = cudaFilterModePoint; - cudaError_t error = cudaBindTexture(0, tables_tex, d_tables, sizeof(Scalar2) * table_value.getNumElements()); - if (error != cudaSuccess) - return error; - } - gpu_compute_bondtable_forces_kernel<<< grid, threads, sizeof(Scalar4)*n_bond_type >>> (d_force, d_virial, diff --git a/hoomd/md/BondTablePotentialGPU.cuh b/hoomd/md/BondTablePotentialGPU.cuh index cb238c5729..b85b5bb4b2 100644 --- a/hoomd/md/BondTablePotentialGPU.cuh +++ b/hoomd/md/BondTablePotentialGPU.cuh @@ -32,7 +32,6 @@ cudaError_t gpu_compute_bondtable_forces(Scalar4* d_force, const unsigned int table_width, const Index2D &table_value, unsigned int *d_flags, - const unsigned int block_size, - const unsigned int compute_capability); + const unsigned int block_size); #endif diff --git a/hoomd/md/CosineSqAngleForceComputeGPU.cc b/hoomd/md/CosineSqAngleForceComputeGPU.cc index 7213e05ec0..0d5b3f0672 100644 --- a/hoomd/md/CosineSqAngleForceComputeGPU.cc +++ b/hoomd/md/CosineSqAngleForceComputeGPU.cc @@ -92,8 +92,7 @@ void CosineSqAngleForceComputeGPU::computeForces(unsigned int timestep) d_gpu_n_angles.data, d_params.data, m_angle_data->getNTypes(), - m_tuner->getParam(), - m_exec_conf->getComputeCapability()); + m_tuner->getParam()); if(m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR(); diff --git a/hoomd/md/CosineSqAngleForceGPU.cu b/hoomd/md/CosineSqAngleForceGPU.cu index 4e010140b8..1931d6b73d 100644 --- a/hoomd/md/CosineSqAngleForceGPU.cu +++ b/hoomd/md/CosineSqAngleForceGPU.cu @@ -16,9 +16,6 @@ CosineSqAngleForceComputeGPU. */ -//! Texture for reading angle parameters -scalar2_tex_t angle_params_tex; - //! Kernel for calculating cosine squared angle forces on the GPU /*! 
\param d_force Device memory to write computed forces \param d_virial Device memory to write computed virials @@ -115,7 +112,7 @@ extern "C" __global__ void gpu_compute_cosinesq_angle_forces_kernel(Scalar4* d_f dac = box.minImage(dac); // get the angle parameters (MEM TRANSFER: 8 bytes) - Scalar2 params = texFetchScalar2(d_params, angle_params_tex, cur_angle_type); + Scalar2 params = __ldg(d_params + cur_angle_type); Scalar K = params.x; Scalar t_0 = params.y; @@ -205,7 +202,6 @@ extern "C" __global__ void gpu_compute_cosinesq_angle_forces_kernel(Scalar4* d_f \param d_params K and t_0 params packed as Scalar2 variables \param n_angle_types Number of angle types in d_params \param block_size Block size to use when performing calculations - \param compute_capability Device compute capability (200, 300, 350, ...) \returns Any error code resulting from the kernel launch \note Always returns cudaSuccess in release builds to avoid the cudaThreadSynchronize() @@ -225,8 +221,7 @@ cudaError_t gpu_compute_cosinesq_angle_forces(Scalar4* d_force, const unsigned int *n_angles_list, Scalar2 *d_params, unsigned int n_angle_types, - int block_size, - const unsigned int compute_capability) + int block_size) { assert(d_params); @@ -244,14 +239,6 @@ cudaError_t gpu_compute_cosinesq_angle_forces(Scalar4* d_force, dim3 grid( N / run_block_size + 1, 1, 1); dim3 threads(run_block_size, 1, 1); - // bind the texture on pre sm 35 arches - if (compute_capability < 350) - { - cudaError_t error = cudaBindTexture(0, angle_params_tex, d_params, sizeof(Scalar2) * n_angle_types); - if (error != cudaSuccess) - return error; - } - // run the kernel gpu_compute_cosinesq_angle_forces_kernel<<< grid, threads>>>( d_force, d_virial, virial_pitch, N, d_pos, d_params, box, diff --git a/hoomd/md/CosineSqAngleForceGPU.cuh b/hoomd/md/CosineSqAngleForceGPU.cuh index 05cfd03e4d..48d4895728 100644 --- a/hoomd/md/CosineSqAngleForceGPU.cuh +++ b/hoomd/md/CosineSqAngleForceGPU.cuh @@ -28,7 +28,6 @@ cudaError_t gpu_compute_cosinesq_angle_forces(Scalar4* d_force, const unsigned int *n_angles_list, Scalar2 *d_params, unsigned int n_angle_types, - int block_size, - const unsigned int compute_capability); + int block_size); #endif diff --git a/hoomd/md/HarmonicAngleForceComputeGPU.cc b/hoomd/md/HarmonicAngleForceComputeGPU.cc index 79070ae556..891316b0f0 100644 --- a/hoomd/md/HarmonicAngleForceComputeGPU.cc +++ b/hoomd/md/HarmonicAngleForceComputeGPU.cc @@ -94,8 +94,7 @@ void HarmonicAngleForceComputeGPU::computeForces(unsigned int timestep) d_gpu_n_angles.data, d_params.data, m_angle_data->getNTypes(), - m_tuner->getParam(), - m_exec_conf->getComputeCapability()); + m_tuner->getParam()); if(m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR(); diff --git a/hoomd/md/HarmonicAngleForceGPU.cu b/hoomd/md/HarmonicAngleForceGPU.cu index 7f1925721a..abca8c4eb0 100644 --- a/hoomd/md/HarmonicAngleForceGPU.cu +++ b/hoomd/md/HarmonicAngleForceGPU.cu @@ -16,9 +16,6 @@ \brief Defines GPU kernel code for calculating the harmonic angle forces. Used by HarmonicAngleForceComputeGPU. */ -//! Texture for reading angle parameters -scalar2_tex_t angle_params_tex; - //! Kernel for calculating harmonic angle forces on the GPU /*! 
\param d_force Device memory to write computed forces \param d_virial Device memory to write computed virials @@ -115,7 +112,7 @@ extern "C" __global__ void gpu_compute_harmonic_angle_forces_kernel(Scalar4* d_f dac = box.minImage(dac); // get the angle parameters (MEM TRANSFER: 8 bytes) - Scalar2 params = texFetchScalar2(d_params, angle_params_tex, cur_angle_type); + Scalar2 params = __ldg(d_params + cur_angle_type); Scalar K = params.x; Scalar t_0 = params.y; @@ -227,8 +224,7 @@ cudaError_t gpu_compute_harmonic_angle_forces(Scalar4* d_force, const unsigned int *n_angles_list, Scalar2 *d_params, unsigned int n_angle_types, - int block_size, - const unsigned int compute_capability) + int block_size) { assert(d_params); @@ -246,14 +242,6 @@ cudaError_t gpu_compute_harmonic_angle_forces(Scalar4* d_force, dim3 grid( N / run_block_size + 1, 1, 1); dim3 threads(run_block_size, 1, 1); - // bind the texture on pre sm 35 arches - if (compute_capability < 350) - { - cudaError_t error = cudaBindTexture(0, angle_params_tex, d_params, sizeof(Scalar2) * n_angle_types); - if (error != cudaSuccess) - return error; - } - // run the kernel gpu_compute_harmonic_angle_forces_kernel<<< grid, threads>>>(d_force, d_virial, virial_pitch, N, d_pos, d_params, box, atable, apos_list, pitch, n_angles_list); diff --git a/hoomd/md/HarmonicAngleForceGPU.cuh b/hoomd/md/HarmonicAngleForceGPU.cuh index 49e3f4758f..a18b1eedd9 100644 --- a/hoomd/md/HarmonicAngleForceGPU.cuh +++ b/hoomd/md/HarmonicAngleForceGPU.cuh @@ -28,7 +28,6 @@ cudaError_t gpu_compute_harmonic_angle_forces(Scalar4* d_force, const unsigned int *n_angles_list, Scalar2 *d_params, unsigned int n_angle_types, - int block_size, - const unsigned int compute_capability); + int block_size); #endif diff --git a/hoomd/md/HarmonicDihedralForceComputeGPU.cc b/hoomd/md/HarmonicDihedralForceComputeGPU.cc index a9f594156a..13d61eaefb 100644 --- a/hoomd/md/HarmonicDihedralForceComputeGPU.cc +++ b/hoomd/md/HarmonicDihedralForceComputeGPU.cc @@ -93,8 +93,7 @@ void HarmonicDihedralForceComputeGPU::computeForces(unsigned int timestep) d_n_dihedrals.data, d_params.data, m_dihedral_data->getNTypes(), - this->m_tuner->getParam(), - m_exec_conf->getComputeCapability()); + this->m_tuner->getParam()); if(m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR(); this->m_tuner->end(); diff --git a/hoomd/md/HarmonicDihedralForceGPU.cu b/hoomd/md/HarmonicDihedralForceGPU.cu index 4c17f64365..0ddd15183b 100644 --- a/hoomd/md/HarmonicDihedralForceGPU.cu +++ b/hoomd/md/HarmonicDihedralForceGPU.cu @@ -22,9 +22,6 @@ \brief Defines GPU kernel code for calculating the harmonic dihedral forces. Used by HarmonicDihedralForceComputeGPU. */ -//! Texture for reading dihedral parameters -scalar4_tex_t dihedral_params_tex; - //! Kernel for calculating harmonic dihedral forces on the GPU /*! 
\param d_force Device memory to write computed forces \param d_virial Device memory to write computed virials @@ -137,7 +134,7 @@ void gpu_compute_harmonic_dihedral_forces_kernel(Scalar4* d_force, dcbm = box.minImage(dcbm); // get the dihedral parameters (MEM TRANSFER: 12 bytes) - Scalar4 params = texFetchScalar4(d_params, dihedral_params_tex, cur_dihedral_type); + Scalar4 params = __ldg(d_params + cur_dihedral_type); Scalar K = params.x; Scalar sign = params.y; Scalar multi = params.z; @@ -319,8 +316,7 @@ cudaError_t gpu_compute_harmonic_dihedral_forces(Scalar4* d_force, const unsigned int *n_dihedrals_list, Scalar4 *d_params, unsigned int n_dihedral_types, - int block_size, - const unsigned int compute_capability) + int block_size) { assert(d_params); @@ -338,14 +334,6 @@ cudaError_t gpu_compute_harmonic_dihedral_forces(Scalar4* d_force, dim3 grid( N / run_block_size + 1, 1, 1); dim3 threads(run_block_size, 1, 1); - // bind the texture on pre sm35 devices - if (compute_capability < 350) - { - cudaError_t error = cudaBindTexture(0, dihedral_params_tex, d_params, sizeof(Scalar4) * n_dihedral_types); - if (error != cudaSuccess) - return error; - } - // run the kernel gpu_compute_harmonic_dihedral_forces_kernel<<< grid, threads>>>(d_force, d_virial, virial_pitch, N, d_pos, d_params, box, tlist, dihedral_ABCD, pitch, n_dihedrals_list); diff --git a/hoomd/md/HarmonicDihedralForceGPU.cuh b/hoomd/md/HarmonicDihedralForceGPU.cuh index a375c1507f..60276df35f 100644 --- a/hoomd/md/HarmonicDihedralForceGPU.cuh +++ b/hoomd/md/HarmonicDihedralForceGPU.cuh @@ -28,7 +28,6 @@ cudaError_t gpu_compute_harmonic_dihedral_forces(Scalar4* d_force, const unsigned int *n_dihedrals_list, Scalar4 *d_params, unsigned int n_dihedral_types, - int block_size, - const unsigned int compute_capability); + int block_size); #endif diff --git a/hoomd/md/HarmonicImproperForceComputeGPU.cc b/hoomd/md/HarmonicImproperForceComputeGPU.cc index a27b88cf7a..ef768358e2 100644 --- a/hoomd/md/HarmonicImproperForceComputeGPU.cc +++ b/hoomd/md/HarmonicImproperForceComputeGPU.cc @@ -91,8 +91,7 @@ void HarmonicImproperForceComputeGPU::computeForces(unsigned int timestep) d_n_dihedrals.data, d_params.data, m_improper_data->getNTypes(), - m_tuner->getParam(), - m_exec_conf->getComputeCapability()); + m_tuner->getParam()); if(m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR(); m_tuner->end(); diff --git a/hoomd/md/HarmonicImproperForceGPU.cu b/hoomd/md/HarmonicImproperForceGPU.cu index 969e0c7da8..89c896e706 100644 --- a/hoomd/md/HarmonicImproperForceGPU.cu +++ b/hoomd/md/HarmonicImproperForceGPU.cu @@ -16,9 +16,6 @@ \brief Defines GPU kernel code for calculating the harmonic improper forces. Used by HarmonicImproperForceComputeGPU. */ -//! Texture for reading improper parameters -scalar2_tex_t improper_params_tex; - //! Kernel for calculating harmonic improper forces on the GPU /*! 
\param d_force Device memory to write computed forces \param d_virial Device memory to write computed virials @@ -129,7 +126,7 @@ void gpu_compute_harmonic_improper_forces_kernel(Scalar4* d_force, ddc = box.minImage(ddc); // get the improper parameters (MEM TRANSFER: 12 bytes) - Scalar2 params = texFetchScalar2(d_params, improper_params_tex, cur_improper_type); + Scalar2 params = __ldg(d_params + cur_improper_type); Scalar K = params.x; Scalar chi = params.y; @@ -281,8 +278,7 @@ cudaError_t gpu_compute_harmonic_improper_forces(Scalar4* d_force, const unsigned int *n_dihedrals_list, Scalar2 *d_params, unsigned int n_improper_types, - int block_size, - const unsigned int compute_capability) + int block_size) { assert(d_params); @@ -303,14 +299,6 @@ cudaError_t gpu_compute_harmonic_improper_forces(Scalar4* d_force, dim3 grid( N / run_block_size + 1, 1, 1); dim3 threads(run_block_size, 1, 1); - // bind the texture on pre sm35 devices - if (compute_capability < 350) - { - cudaError_t error = cudaBindTexture(0, improper_params_tex, d_params, sizeof(Scalar2) * n_improper_types); - if (error != cudaSuccess) - return error; - } - // run the kernel gpu_compute_harmonic_improper_forces_kernel<<< grid, threads>>>(d_force, d_virial, virial_pitch, N, d_pos, d_params, box, tlist, dihedral_ABCD, pitch, n_dihedrals_list); diff --git a/hoomd/md/HarmonicImproperForceGPU.cuh b/hoomd/md/HarmonicImproperForceGPU.cuh index 7c4dc4fdb1..2a248bf75a 100644 --- a/hoomd/md/HarmonicImproperForceGPU.cuh +++ b/hoomd/md/HarmonicImproperForceGPU.cuh @@ -28,7 +28,6 @@ cudaError_t gpu_compute_harmonic_improper_forces(Scalar4* d_force, const unsigned int *n_dihedrals_list, Scalar2 *d_params, unsigned int n_improper_types, - int block_size, - const unsigned int compute_capability); + int block_size); #endif diff --git a/hoomd/md/NeighborListGPUBinned.cu b/hoomd/md/NeighborListGPUBinned.cu index f33c5f8cc3..de3f6e799a 100644 --- a/hoomd/md/NeighborListGPUBinned.cu +++ b/hoomd/md/NeighborListGPUBinned.cu @@ -12,9 +12,6 @@ \brief Defines GPU kernel code for O(N) neighbor list generation on the GPU */ -//! Texture for reading d_cell_xyzf -scalar4_tex_t cell_xyzf_1d_tex; - //! Kernel call for generating neighbor list on the GPU (Kepler optimized version) /*! \tparam flags Set bit 1 to enable body filtering. Set bit 2 to enable diameter filtering. \param d_nlist Neighbor list data structure to write @@ -196,7 +193,7 @@ __global__ void gpu_compute_nlist_binned_kernel(unsigned int *d_nlist, unsigned int j; Scalar4 postype_j; if (!use_index) - cur_xyzf = texFetchScalar4(d_cell_xyzf, cell_xyzf_1d_tex, cli(cur_offset, neigh_cell)); + cur_xyzf = __ldg(d_cell_xyzf + cli(cur_offset, neigh_cell)); else { j = d_cell_idx[cli(cur_offset, neigh_cell)+igpu*cli.getNumElements()]; @@ -300,14 +297,6 @@ int get_max_block_size(T func) return max_threads; } -void gpu_nlist_binned_bind_texture(const Scalar4 *d_cell_xyzf, unsigned int n_elements) - { - // bind the position texture - cell_xyzf_1d_tex.normalized = false; - cell_xyzf_1d_tex.filterMode = cudaFilterModePoint; - cudaBindTexture(0, cell_xyzf_1d_tex, d_cell_xyzf, sizeof(Scalar4)*n_elements); - } - //! 
recursive template to launch neighborlist with given template parameters /* \tparam cur_tpp Number of threads per particle (assumed to be power of two) */ template @@ -359,7 +348,6 @@ inline void launcher(unsigned int *d_nlist, static unsigned int max_block_size = UINT_MAX; if (max_block_size == UINT_MAX) max_block_size = get_max_block_size(gpu_compute_nlist_binned_kernel<0,0,cur_tpp>); - if (compute_capability < 35) gpu_nlist_binned_bind_texture(d_cell_xyzf, cli.getNumElements()); block_size = block_size < max_block_size ? block_size : max_block_size; dim3 grid(nwork / (block_size/tpp) + 1); @@ -396,7 +384,6 @@ inline void launcher(unsigned int *d_nlist, static unsigned int max_block_size = UINT_MAX; if (max_block_size == UINT_MAX) max_block_size = get_max_block_size(gpu_compute_nlist_binned_kernel<1,0,cur_tpp>); - if (compute_capability < 35) gpu_nlist_binned_bind_texture(d_cell_xyzf, cli.getNumElements()); block_size = block_size < max_block_size ? block_size : max_block_size; dim3 grid(nwork / (block_size/tpp) + 1); @@ -433,7 +420,6 @@ inline void launcher(unsigned int *d_nlist, static unsigned int max_block_size = UINT_MAX; if (max_block_size == UINT_MAX) max_block_size = get_max_block_size(gpu_compute_nlist_binned_kernel<2,0,cur_tpp>); - if (compute_capability < 35) gpu_nlist_binned_bind_texture(d_cell_xyzf, cli.getNumElements()); block_size = block_size < max_block_size ? block_size : max_block_size; dim3 grid(nwork / (block_size/tpp) + 1); @@ -470,7 +456,6 @@ inline void launcher(unsigned int *d_nlist, static unsigned int max_block_size = UINT_MAX; if (max_block_size == UINT_MAX) max_block_size = get_max_block_size(gpu_compute_nlist_binned_kernel<3,0,cur_tpp>); - if (compute_capability < 35) gpu_nlist_binned_bind_texture(d_cell_xyzf, cli.getNumElements()); block_size = block_size < max_block_size ? block_size : max_block_size; dim3 grid(nwork / (block_size/tpp) + 1); @@ -510,7 +495,6 @@ inline void launcher(unsigned int *d_nlist, static unsigned int max_block_size = UINT_MAX; if (max_block_size == UINT_MAX) max_block_size = get_max_block_size(gpu_compute_nlist_binned_kernel<0,1,cur_tpp>); - if (compute_capability < 35) gpu_nlist_binned_bind_texture(d_cell_xyzf, cli.getNumElements()); block_size = block_size < max_block_size ? block_size : max_block_size; dim3 grid(nwork / (block_size/tpp) + 1); @@ -547,7 +531,6 @@ inline void launcher(unsigned int *d_nlist, static unsigned int max_block_size = UINT_MAX; if (max_block_size == UINT_MAX) max_block_size = get_max_block_size(gpu_compute_nlist_binned_kernel<1,1,cur_tpp>); - if (compute_capability < 35) gpu_nlist_binned_bind_texture(d_cell_xyzf, cli.getNumElements()); block_size = block_size < max_block_size ? block_size : max_block_size; dim3 grid(nwork / (block_size/tpp) + 1); @@ -584,7 +567,6 @@ inline void launcher(unsigned int *d_nlist, static unsigned int max_block_size = UINT_MAX; if (max_block_size == UINT_MAX) max_block_size = get_max_block_size(gpu_compute_nlist_binned_kernel<2,1,cur_tpp>); - if (compute_capability < 35) gpu_nlist_binned_bind_texture(d_cell_xyzf, cli.getNumElements()); block_size = block_size < max_block_size ? 
block_size : max_block_size; dim3 grid(nwork / (block_size/tpp) + 1); @@ -621,7 +603,6 @@ inline void launcher(unsigned int *d_nlist, static unsigned int max_block_size = UINT_MAX; if (max_block_size == UINT_MAX) max_block_size = get_max_block_size(gpu_compute_nlist_binned_kernel<3,1,cur_tpp>); - if (compute_capability < 35) gpu_nlist_binned_bind_texture(d_cell_xyzf, cli.getNumElements()); block_size = block_size < max_block_size ? block_size : max_block_size; dim3 grid(nwork / (block_size/tpp) + 1); diff --git a/hoomd/md/NeighborListGPUStencil.cc b/hoomd/md/NeighborListGPUStencil.cc index c9a06f5ce9..7413326aae 100644 --- a/hoomd/md/NeighborListGPUStencil.cc +++ b/hoomd/md/NeighborListGPUStencil.cc @@ -312,8 +312,7 @@ void NeighborListGPUStencil::buildNlist(unsigned int timestep) m_filter_body, m_diameter_shift, threads_per_particle, - block_size, - m_exec_conf->getComputeCapability()/10); + block_size); if(m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR(); if (tune) this->m_tuner->end(); diff --git a/hoomd/md/NeighborListGPUStencil.cu b/hoomd/md/NeighborListGPUStencil.cu index 1fb9318c74..48e037c655 100644 --- a/hoomd/md/NeighborListGPUStencil.cu +++ b/hoomd/md/NeighborListGPUStencil.cu @@ -13,15 +13,6 @@ \brief Defines GPU kernel code for O(N) neighbor list generation on the GPU with multiple bin stencils */ -//! Texture for reading d_cell_xyzf -scalar4_tex_t cell_xyzf_1d_tex; - -//! Texture for reading d_cell_tdb -scalar4_tex_t cell_tdb_1d_tex; - -//! Texture for reading d_stencil -scalar4_tex_t stencil_1d_tex; - //! Kernel call for generating neighbor list on the GPU using multiple stencils (Kepler optimized version) /*! \tparam flags Set bit 1 to enable body filtering. Set bit 2 to enable diameter filtering. \tparam threads_per_particle Number of threads cooperatively computing the neighbor list @@ -179,7 +170,7 @@ __global__ void gpu_compute_nlist_stencil_kernel(unsigned int *d_nlist, if (cur_adj < n_stencil) { // compute the stenciled cell cartesian coordinates - Scalar4 stencil = texFetchScalar4(d_stencil, stencil_1d_tex, stencil_idx(cur_adj, my_type)); + Scalar4 stencil = __ldg(d_stencil + stencil_idx(cur_adj, my_type)); int sib = ib + __scalar_as_int(stencil.x); int sjb = jb + __scalar_as_int(stencil.y); int skb = kb + __scalar_as_int(stencil.z); @@ -212,7 +203,7 @@ __global__ void gpu_compute_nlist_stencil_kernel(unsigned int *d_nlist, do { // read in the particle type (diameter and body as well while we've got the Scalar4 in) - const Scalar4 neigh_tdb = texFetchScalar4(d_cell_tdb, cell_tdb_1d_tex, cli(cur_offset, neigh_cell)); + const Scalar4 neigh_tdb = __ldg(d_cell_tdb + cli(cur_offset, neigh_cell)); const unsigned int type_j = __scalar_as_int(neigh_tdb.x); const Scalar diam_j = neigh_tdb.y; const unsigned int body_j = __scalar_as_int(neigh_tdb.z); @@ -237,7 +228,7 @@ __global__ void gpu_compute_nlist_stencil_kernel(unsigned int *d_nlist, if (cell_dist2 > r_listsq) break; // only load in the particle position and id if distance check is required - const Scalar4 neigh_xyzf = texFetchScalar4(d_cell_xyzf, cell_xyzf_1d_tex, cli(cur_offset, neigh_cell)); + const Scalar4 neigh_xyzf = __ldg(d_cell_xyzf + cli(cur_offset, neigh_cell)); const Scalar3 neigh_pos = make_scalar3(neigh_xyzf.x, neigh_xyzf.y, neigh_xyzf.z); unsigned int cur_neigh = __scalar_as_int(neigh_xyzf.w); @@ -301,36 +292,6 @@ int get_max_block_size_stencil(T func) return max_threads; } -//! Bind the textures on sm <= 30 -/*! 
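// Illustrative sketch (not part of this patch): the neighbor list launchers touched
// above dispatch on threads per particle through a recursive function template over
// power-of-two tpp values, and this patch only removes the texture-binding calls from
// inside them. example_kernel and example_launcher are placeholder names used solely
// to show the dispatch pattern.
#include <cuda_runtime.h>

template<int cur_tpp>
__global__ void example_kernel(const unsigned int *d_nlist, unsigned int N)
    {
    // a real kernel strides over each particle's neighbors with step cur_tpp
    }

template<int cur_tpp>
inline void example_launcher(const unsigned int *d_nlist,
                             unsigned int N,
                             unsigned int block_size,
                             unsigned int tpp)
    {
    if (tpp == cur_tpp)
        {
        // launch the instantiation matching the requested threads per particle
        dim3 grid(N / (block_size / tpp) + 1);
        example_kernel<cur_tpp><<<grid, block_size>>>(d_nlist, N);
        }
    else
        {
        // otherwise recurse to the next smaller power of two
        example_launcher<cur_tpp / 2>(d_nlist, N, block_size, tpp);
        }
    }

//! terminates the compile-time recursion
template<>
inline void example_launcher<0>(const unsigned int *, unsigned int, unsigned int, unsigned int)
    {
    }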
- * \param d_cell_xyzf Cell list particle array - * \param d_cell_tdb Cell list type-diameter-body array - * \param n_elements Number of elements in the cell list arrays - * \param d_stencil Stencil offset array - * \param n_stencil_elements Number of elements in the stencil offset array - */ -void gpu_nlist_stencil_bind_texture(const Scalar4 *d_cell_xyzf, - const Scalar4 *d_cell_tdb, - unsigned int n_elements, - const Scalar4 *d_stencil, - unsigned int n_stencil_elements) - { - // bind the position texture - cell_xyzf_1d_tex.normalized = false; - cell_xyzf_1d_tex.filterMode = cudaFilterModePoint; - cudaBindTexture(0, cell_xyzf_1d_tex, d_cell_xyzf, sizeof(Scalar4)*n_elements); - - // bind the position texture - cell_tdb_1d_tex.normalized = false; - cell_tdb_1d_tex.filterMode = cudaFilterModePoint; - cudaBindTexture(0, cell_tdb_1d_tex, d_cell_tdb, sizeof(Scalar4)*n_elements); - - // bind the stencil texture - stencil_1d_tex.normalized = false; - stencil_1d_tex.filterMode = cudaFilterModePoint; - cudaBindTexture(0, stencil_1d_tex, d_stencil, sizeof(Scalar4)*n_stencil_elements); - } - //! recursive template to launch neighborlist with given template parameters /* \tparam cur_tpp Number of threads per particle (assumed to be power of two) */ template @@ -361,8 +322,7 @@ inline void stencil_launcher(unsigned int *d_nlist, bool filter_body, bool diameter_shift, const unsigned int threads_per_particle, - const unsigned int block_size, - const unsigned int compute_capability) + const unsigned int block_size) { // shared memory = r_listsq + Nmax + stuff needed for neighborlist (computed below) Index2D typpair_idx(ntypes); @@ -375,11 +335,6 @@ inline void stencil_launcher(unsigned int *d_nlist, static unsigned int max_block_size = UINT_MAX; if (max_block_size == UINT_MAX) max_block_size = get_max_block_size_stencil(gpu_compute_nlist_stencil_kernel<0,cur_tpp>); - if (compute_capability < 35) gpu_nlist_stencil_bind_texture(d_cell_xyzf, - d_cell_tdb, - cli.getNumElements(), - d_stencil, - stencil_idx.getNumElements()); unsigned int run_block_size = (block_size < max_block_size) ? block_size : max_block_size; dim3 grid(N / (block_size/threads_per_particle) + 1); @@ -413,11 +368,6 @@ inline void stencil_launcher(unsigned int *d_nlist, static unsigned int max_block_size = UINT_MAX; if (max_block_size == UINT_MAX) max_block_size = get_max_block_size_stencil(gpu_compute_nlist_stencil_kernel<1,cur_tpp>); - if (compute_capability < 35) gpu_nlist_stencil_bind_texture(d_cell_xyzf, - d_cell_tdb, - cli.getNumElements(), - d_stencil, - stencil_idx.getNumElements()); unsigned int run_block_size = (block_size < max_block_size) ? block_size : max_block_size; dim3 grid(N / (block_size/threads_per_particle) + 1); @@ -451,11 +401,6 @@ inline void stencil_launcher(unsigned int *d_nlist, static unsigned int max_block_size = UINT_MAX; if (max_block_size == UINT_MAX) max_block_size = get_max_block_size_stencil(gpu_compute_nlist_stencil_kernel<2,cur_tpp>); - if (compute_capability < 35) gpu_nlist_stencil_bind_texture(d_cell_xyzf, - d_cell_tdb, - cli.getNumElements(), - d_stencil, - stencil_idx.getNumElements()); unsigned int run_block_size = (block_size < max_block_size) ? 
block_size : max_block_size; dim3 grid(N / (block_size/threads_per_particle) + 1); @@ -489,11 +434,6 @@ inline void stencil_launcher(unsigned int *d_nlist, static unsigned int max_block_size = UINT_MAX; if (max_block_size == UINT_MAX) max_block_size = get_max_block_size_stencil(gpu_compute_nlist_stencil_kernel<3,cur_tpp>); - if (compute_capability < 35) gpu_nlist_stencil_bind_texture(d_cell_xyzf, - d_cell_tdb, - cli.getNumElements(), - d_stencil, - stencil_idx.getNumElements()); unsigned int run_block_size = (block_size < max_block_size) ? block_size : max_block_size; dim3 grid(N / (block_size/threads_per_particle) + 1); @@ -552,8 +492,7 @@ inline void stencil_launcher(unsigned int *d_nlist, filter_body, diameter_shift, threads_per_particle, - block_size, - compute_capability); + block_size); } } @@ -586,8 +525,7 @@ inline void stencil_launcher(unsigned int *d_nlist, bool filter_body, bool diameter_shift, const unsigned int threads_per_particle, - const unsigned int block_size, - const unsigned int compute_capability) + const unsigned int block_size) { } cudaError_t gpu_compute_nlist_stencil(unsigned int *d_nlist, @@ -617,8 +555,7 @@ cudaError_t gpu_compute_nlist_stencil(unsigned int *d_nlist, bool filter_body, bool diameter_shift, const unsigned int threads_per_particle, - const unsigned int block_size, - const unsigned int compute_capability) + const unsigned int block_size) { stencil_launcher(d_nlist, d_n_neigh, @@ -647,8 +584,7 @@ cudaError_t gpu_compute_nlist_stencil(unsigned int *d_nlist, filter_body, diameter_shift, threads_per_particle, - block_size, - compute_capability); + block_size); return cudaSuccess; } diff --git a/hoomd/md/NeighborListGPUStencil.cuh b/hoomd/md/NeighborListGPUStencil.cuh index 8344c3f20f..63b9387973 100644 --- a/hoomd/md/NeighborListGPUStencil.cuh +++ b/hoomd/md/NeighborListGPUStencil.cuh @@ -49,8 +49,7 @@ cudaError_t gpu_compute_nlist_stencil(unsigned int *d_nlist, bool filter_body, bool diameter_shift, const unsigned int threads_per_particle, - const unsigned int block_size, - const unsigned int compute_capability); + const unsigned int block_size); //! Kernel driver for filling the particle types for sorting cudaError_t gpu_compute_nlist_stencil_fill_types(unsigned int *d_pids, diff --git a/hoomd/md/NeighborListGPUTree.cc b/hoomd/md/NeighborListGPUTree.cc index 774b716a10..6c6175f369 100644 --- a/hoomd/md/NeighborListGPUTree.cc +++ b/hoomd/md/NeighborListGPUTree.cc @@ -809,7 +809,6 @@ void NeighborListGPUTree::traverseTree() m_pdata->getNTypes(), m_filter_body, m_diameter_shift, - m_exec_conf->getComputeCapability()/10, m_tuner_traverse->getParam()); if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR(); diff --git a/hoomd/md/NeighborListGPUTree.cu b/hoomd/md/NeighborListGPUTree.cu index 291f8620de..7c244f50ea 100644 --- a/hoomd/md/NeighborListGPUTree.cu +++ b/hoomd/md/NeighborListGPUTree.cu @@ -16,18 +16,6 @@ \brief Defines GPU kernel code for neighbor list tree traversal on the GPU */ -//! Texture for reading particle positions -scalar4_tex_t pdata_pos_tex; -//! Texture for reading leaf data -scalar4_tex_t leaf_xyzf_tex; -//! Texture for the diameter / body -scalar2_tex_t leaf_db_tex; -//! Texture for reading node upper and lower bounds -scalar4_tex_t aabb_node_bounds_tex; -//! Texture for the head list -texture head_list_tex; - - //!< Expands a 10-bit integer into 30 bits by inserting 2 zeros after each bit. /*! 
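* As a minimal illustrative sketch, the standard Morton-code bit expansion that this
* description corresponds to looks like the following (placeholder name
* expand_bits_example; the exact implementation in this file may differ):
* \code
* __device__ inline unsigned int expand_bits_example(unsigned int v)
*     {
*     v = (v * 0x00010001u) & 0xFF0000FFu;
*     v = (v * 0x00000101u) & 0x0F00F00Fu;
*     v = (v * 0x00000011u) & 0xC30C30C3u;
*     v = (v * 0x00000005u) & 0x49249249u;
*     return v;
*     }
* \endcode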
* \param v unsigned integer with 10 bits set @@ -1025,16 +1013,16 @@ __global__ void gpu_nlist_traverse_tree_kernel(unsigned int *d_nlist, if (my_pidx >= N) return; - const Scalar4 postype_i = texFetchScalar4(d_pos, pdata_pos_tex, my_pidx); + const Scalar4 postype_i = __ldg(d_pos + my_pidx); const Scalar3 pos_i = make_scalar3(postype_i.x, postype_i.y, postype_i.z); const unsigned int type_i = __scalar_as_int(postype_i.w); // fetch the diameter and body out of the leaf texture since it's bound anyway - const Scalar2 db_i = texFetchScalar2(d_leaf_db, leaf_db_tex, idx); + const Scalar2 db_i = __ldg(d_leaf_db + idx); const Scalar diam_i = db_i.x; const unsigned int body_i = __scalar_as_int(db_i.y); - const unsigned int nlist_head_i = texFetchUint(d_head_list, head_list_tex, my_pidx); + const unsigned int nlist_head_i = __ldg(d_head_list + my_pidx); unsigned int n_neigh_i = 0; for (unsigned int cur_pair_type=0; cur_pair_type < ntypes; ++cur_pair_type) @@ -1073,8 +1061,8 @@ __global__ void gpu_nlist_traverse_tree_kernel(unsigned int *d_nlist, int cur_node_idx = cur_tree_root; while (cur_node_idx > -1) { - const Scalar4 upper_rope = texFetchScalar4(d_tree_aabbs, aabb_node_bounds_tex, 2*cur_node_idx); - const Scalar4 lower_np = texFetchScalar4(d_tree_aabbs, aabb_node_bounds_tex, 2*cur_node_idx+1); + const Scalar4 upper_rope = __ldg(d_tree_aabbs + 2*cur_node_idx); + const Scalar4 lower_np = __ldg(d_tree_aabbs + 2*cur_node_idx+1); if (!(aabb_upper.x < lower_np.x || aabb_lower.x > upper_rope.x @@ -1094,11 +1082,11 @@ __global__ void gpu_nlist_traverse_tree_kernel(unsigned int *d_nlist, for (unsigned int cur_p = node_head; cur_p < node_head + n_part; ++cur_p) { // neighbor j - const Scalar4 cur_xyzf = texFetchScalar4(d_leaf_xyzf, leaf_xyzf_tex, cur_p); + const Scalar4 cur_xyzf = __ldg(d_leaf_xyzf + cur_p); const Scalar3 pos_j = make_scalar3(cur_xyzf.x, cur_xyzf.y, cur_xyzf.z); const unsigned int j = __scalar_as_int(cur_xyzf.w); - const Scalar2 cur_db = texFetchScalar2(d_leaf_db, leaf_db_tex, cur_p); + const Scalar2 cur_db = __ldg(d_leaf_db + cur_p); const Scalar diam_j = cur_db.x; const unsigned int body_j = __scalar_as_int(cur_db.y); @@ -1223,47 +1211,12 @@ cudaError_t gpu_nlist_traverse_tree(unsigned int *d_nlist, const unsigned int ntypes, bool filter_body, bool diameter_shift, - const unsigned int compute_capability, const unsigned int block_size) { // shared memory = r_list + Nmax Index2D typpair_idx(ntypes); unsigned int shared_size = sizeof(Scalar)*typpair_idx.getNumElements() + 2*sizeof(unsigned int)*ntypes; - // bind the neighborlist texture - if (compute_capability < 35) - { - pdata_pos_tex.normalized = false; - pdata_pos_tex.filterMode = cudaFilterModePoint; - cudaError_t error = cudaBindTexture(0, pdata_pos_tex, d_pos, sizeof(Scalar4)*(N+nghosts)); - if (error != cudaSuccess) - return error; - - leaf_xyzf_tex.normalized = false; - leaf_xyzf_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, leaf_xyzf_tex, d_leaf_xyzf, sizeof(Scalar4)*(N+nghosts)); - if (error != cudaSuccess) - return error; - - leaf_db_tex.normalized = false; - leaf_db_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, leaf_db_tex, d_leaf_db, sizeof(Scalar2)*(N+nghosts)); - if (error != cudaSuccess) - return error; - - aabb_node_bounds_tex.normalized = false; - aabb_node_bounds_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, aabb_node_bounds_tex, d_tree_aabbs, sizeof(Scalar4)*2*nnodes); - if (error != cudaSuccess) - return error; - - head_list_tex.normalized = false; - 
head_list_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, head_list_tex, d_head_list, sizeof(unsigned int)*N); - if (error != cudaSuccess) - return error; - } - if (!filter_body && !diameter_shift) { static unsigned int max_block_size = UINT_MAX; @@ -1405,30 +1358,6 @@ cudaError_t gpu_nlist_traverse_tree(unsigned int *d_nlist, ntypes); } - // unbind the textures - if (compute_capability < 35) - { - cudaError_t error = cudaUnbindTexture(pdata_pos_tex); - if (error != cudaSuccess) - return error; - - error = cudaUnbindTexture(leaf_xyzf_tex); - if (error != cudaSuccess) - return error; - - error = cudaUnbindTexture(leaf_db_tex); - if (error != cudaSuccess) - return error; - - error = cudaUnbindTexture(aabb_node_bounds_tex); - if (error != cudaSuccess) - return error; - - error = cudaUnbindTexture(head_list_tex); - if (error != cudaSuccess) - return error; - } - return cudaSuccess; } diff --git a/hoomd/md/NeighborListGPUTree.cuh b/hoomd/md/NeighborListGPUTree.cuh index 0b2ed903cf..b4c4b011ca 100644 --- a/hoomd/md/NeighborListGPUTree.cuh +++ b/hoomd/md/NeighborListGPUTree.cuh @@ -120,7 +120,6 @@ cudaError_t gpu_nlist_traverse_tree(unsigned int *d_nlist, const unsigned int ntypes, bool filter_body, bool diameter_shift, - const unsigned int compute_capability, const unsigned int block_size); //! Kernel driver to initialize counting for types and nodes diff --git a/hoomd/md/OPLSDihedralForceComputeGPU.cc b/hoomd/md/OPLSDihedralForceComputeGPU.cc index 8fe6994653..33f2668cd8 100644 --- a/hoomd/md/OPLSDihedralForceComputeGPU.cc +++ b/hoomd/md/OPLSDihedralForceComputeGPU.cc @@ -69,8 +69,7 @@ void OPLSDihedralForceComputeGPU::computeForces(unsigned int timestep) d_n_dihedrals.data, d_params.data, m_dihedral_data->getNTypes(), - this->m_tuner->getParam(), - m_exec_conf->getComputeCapability()); + this->m_tuner->getParam()); if(m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR(); this->m_tuner->end(); diff --git a/hoomd/md/OPLSDihedralForceGPU.cu b/hoomd/md/OPLSDihedralForceGPU.cu index c6fc68ec7d..307d47422a 100644 --- a/hoomd/md/OPLSDihedralForceGPU.cu +++ b/hoomd/md/OPLSDihedralForceGPU.cu @@ -13,9 +13,6 @@ \brief Defines GPU kernel code for calculating OPLS dihedral forces. Used by OPLSDihedralForceComputeGPU. */ -//! Texture for reading dihedral parameters -scalar4_tex_t dihedral_params_tex; - //! Kernel for calculating OPLS dihedral forces on the GPU /*! 
\param d_force Device memory to write computed forces \param d_virial Device memory to write computed virials @@ -159,7 +156,7 @@ void gpu_compute_opls_dihedral_forces_kernel(Scalar4* d_force, // get values for k1/2 through k4/2 (MEM TRANSFER: 16 bytes) // ----- The 1/2 factor is already stored in the parameters -------- - Scalar4 params = texFetchScalar4(d_params, dihedral_params_tex, cur_dihedral_type); + Scalar4 params = __ldg(d_params + cur_dihedral_type); Scalar k1 = params.x; Scalar k2 = params.y; Scalar k3 = params.z; @@ -321,8 +318,7 @@ cudaError_t gpu_compute_opls_dihedral_forces(Scalar4* d_force, const unsigned int *n_dihedrals_list, const Scalar4 *d_params, const unsigned int n_dihedral_types, - const int block_size, - const unsigned int compute_capability) + const int block_size) { assert(d_params); @@ -340,14 +336,6 @@ cudaError_t gpu_compute_opls_dihedral_forces(Scalar4* d_force, dim3 grid( N / run_block_size + 1, 1, 1); dim3 threads(run_block_size, 1, 1); - // bind the texture on pre sm35 devices - if (compute_capability < 350) - { - cudaError_t error = cudaBindTexture(0, dihedral_params_tex, d_params, sizeof(Scalar4) * n_dihedral_types); - if (error != cudaSuccess) - return error; - } - // run the kernel gpu_compute_opls_dihedral_forces_kernel<<< grid, threads>>>(d_force, d_virial, virial_pitch, N, d_pos, d_params, box, tlist, dihedral_ABCD, pitch, n_dihedrals_list); diff --git a/hoomd/md/OPLSDihedralForceGPU.cuh b/hoomd/md/OPLSDihedralForceGPU.cuh index bca16470b6..b466768019 100644 --- a/hoomd/md/OPLSDihedralForceGPU.cuh +++ b/hoomd/md/OPLSDihedralForceGPU.cuh @@ -28,7 +28,6 @@ cudaError_t gpu_compute_opls_dihedral_forces(Scalar4* d_force, const unsigned int *n_dihedrals_list, const Scalar4 *d_params, const unsigned int n_dihedral_types, - const int block_size, - const unsigned int compute_capability); + const int block_size); #endif diff --git a/hoomd/md/PPPMForceComputeGPU.cc b/hoomd/md/PPPMForceComputeGPU.cc index df6badcc2b..33b2b7922f 100644 --- a/hoomd/md/PPPMForceComputeGPU.cc +++ b/hoomd/md/PPPMForceComputeGPU.cc @@ -186,7 +186,7 @@ void PPPMForceComputeGPU::initializeFFT() cudaMemPrefetchAsync(m_fourier_mesh_G_z.get(), m_n_inner_cells*sizeof(cufftComplex), gpu_map[0]); cudaMemAdvise(m_fourier_mesh_G_z.get(), m_n_inner_cells*sizeof(cufftComplex), cudaMemAdviseSetPreferredLocation, gpu_map[0]); cudaMemPrefetchAsync(m_fourier_mesh_G_y.get(), m_n_inner_cells*sizeof(cufftComplex), gpu_map[0]); - } + } unsigned int n_blocks = (m_mesh_points.x*m_mesh_points.y*m_mesh_points.z)/m_block_size+1; GlobalArray sum_partial(n_blocks,m_exec_conf); @@ -686,8 +686,7 @@ void PPPMForceComputeGPU::fixExclusions() m_alpha, d_index_array.data, group_size, - m_block_size, - m_exec_conf->getComputeCapability()); + m_block_size); if(m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR(); diff --git a/hoomd/md/PPPMForceComputeGPU.cu b/hoomd/md/PPPMForceComputeGPU.cu index 06f6e22dfc..482383e6c2 100644 --- a/hoomd/md/PPPMForceComputeGPU.cu +++ b/hoomd/md/PPPMForceComputeGPU.cu @@ -16,23 +16,6 @@ //! The developer has chosen not to document this variable __device__ __constant__ Scalar GPU_rho_coeff[CONSTANT_SIZE]; -//! 
Implements workaround atomic float addition on sm_1x hardware -__device__ inline void atomicFloatAdd(float* address, float value) - { -#if (__CUDA_ARCH__ < 200) - float old = value; - float new_old; - do - { - new_old = atomicExch(address, 0.0f); - new_old += old; - } - while ((old = atomicExch(address, new_old))!=0.0f); -#else - atomicAdd(address, value); -#endif - } - //! GPU implementation of sinc(x)==sin(x)/x __device__ Scalar gpu_sinc(Scalar x) { @@ -257,7 +240,7 @@ __global__ void gpu_assign_particles_kernel(const uint3 mesh_dim, // compute fraction of particle density assigned to cell // from particles in this bin - atomicFloatAdd(&d_mesh[cell_idx].x, z0*result); + atomicAdd(&d_mesh[cell_idx].x, z0*result); } ignore_z = false; @@ -333,7 +316,7 @@ void gpu_assign_particles(const uint3 mesh_dim, if (ngpu > 1) { - // zero the temporary mesh array + // zero the temporary mesh array cudaMemsetAsync(d_mesh_scratch + idev*mesh_elements, 0, sizeof(cufftComplex)*mesh_elements); } @@ -369,7 +352,7 @@ void gpu_reduce_meshes(const unsigned int mesh_elements, d_mesh, ngpu); } - + __global__ void gpu_compute_mesh_virial_kernel(const unsigned int n_wave_vectors, cufftComplex *d_fourier_mesh, Scalar *d_inf_f, @@ -1230,13 +1213,6 @@ void gpu_compute_influence_function(const uint3 mesh_dim, #endif } -//! Texture for reading particle positions -scalar4_tex_t pdata_pos_tex; - -//! Texture for reading charge parameters -scalar_tex_t pdata_charge_tex; - - //! The developer has chosen not to document this function __global__ void gpu_fix_exclusions_kernel(Scalar4 *d_force, Scalar *d_virial, @@ -1259,10 +1235,10 @@ __global__ void gpu_fix_exclusions_kernel(Scalar4 *d_force, unsigned int idx = d_group_members[group_idx]; const Scalar sqrtpi = sqrtf(M_PI); unsigned int n_neigh = d_n_neigh[idx]; - Scalar4 postypei = texFetchScalar4(d_pos, pdata_pos_tex, idx); + Scalar4 postypei = __ldg(d_pos + idx); Scalar3 posi = make_scalar3(postypei.x, postypei.y, postypei.z); - Scalar qi = texFetchScalar(d_charge, pdata_charge_tex, idx); + Scalar qi = __ldg(d_charge + idx); // initialize the force to 0 Scalar4 force = make_scalar4(Scalar(0.0), Scalar(0.0), Scalar(0.0), Scalar(0.0)); Scalar virial[6]; @@ -1282,10 +1258,10 @@ __global__ void gpu_fix_exclusions_kernel(Scalar4 *d_force, next_j = d_nlist[nli(idx, neigh_idx+1)]; // get the neighbor's position (MEM TRANSFER: 16 bytes) - Scalar4 postypej = texFetchScalar4(d_pos, pdata_pos_tex, cur_j); + Scalar4 postypej = __ldg(d_pos + cur_j); Scalar3 posj = make_scalar3(postypej.x, postypej.y, postypej.z); - Scalar qj = texFetchScalar(d_charge, pdata_charge_tex, cur_j); + Scalar qj = __ldg(d_charge + cur_j); // calculate dr (with periodic boundary conditions) (FLOPS: 3) Scalar3 dx = posi - posj; @@ -1348,25 +1324,12 @@ cudaError_t gpu_fix_exclusions(Scalar4 *d_force, Scalar alpha, unsigned int *d_group_members, unsigned int group_size, - int block_size, - const unsigned int compute_capability) + int block_size) { dim3 grid( group_size / block_size + 1, 1, 1); dim3 threads(block_size, 1, 1); - // bind the textures on pre sm35 arches - if (compute_capability < 350) - { - cudaError_t error = cudaBindTexture(0, pdata_pos_tex, d_pos, sizeof(Scalar4)*Nmax); - if (error != cudaSuccess) - return error; - - error = cudaBindTexture(0, pdata_charge_tex, d_charge, sizeof(Scalar) * Nmax); - if (error != cudaSuccess) - return error; - } - - gpu_fix_exclusions_kernel <<< grid, threads >>> (d_force, + gpu_fix_exclusions_kernel <<< grid, threads >>> (d_force, d_virial, virial_pitch, d_pos, diff 
--git a/hoomd/md/PPPMForceComputeGPU.cuh b/hoomd/md/PPPMForceComputeGPU.cuh index e67f0c8fe0..79e5332931 100644 --- a/hoomd/md/PPPMForceComputeGPU.cuh +++ b/hoomd/md/PPPMForceComputeGPU.cuh @@ -30,7 +30,7 @@ void gpu_reduce_meshes(const unsigned int mesh_elements, cufftComplex *d_mesh, const unsigned int ngpu, const unsigned int block_size); - + void gpu_compute_mesh_virial(const unsigned int n_wave_vectors, cufftComplex *d_fourier_mesh, Scalar *d_inf_f, @@ -111,8 +111,7 @@ cudaError_t gpu_fix_exclusions(Scalar4 *d_force, Scalar alpha, unsigned int *d_group_members, unsigned int group_size, - int block_size, - const unsigned int compute_capability); + int block_size); void gpu_initialize_coeff( Scalar *CPU_rho_coeff, diff --git a/hoomd/md/PotentialBondGPU.cuh b/hoomd/md/PotentialBondGPU.cuh index 4871b43222..4b154e1e37 100644 --- a/hoomd/md/PotentialBondGPU.cuh +++ b/hoomd/md/PotentialBondGPU.cuh @@ -37,8 +37,7 @@ struct bond_args_t const Index2D & _gpu_table_indexer, const unsigned int *_d_gpu_n_bonds, const unsigned int _n_bond_types, - const unsigned int _block_size, - const unsigned int _compute_capability) + const unsigned int _block_size) : d_force(_d_force), d_virial(_d_virial), virial_pitch(_virial_pitch), @@ -52,8 +51,7 @@ struct bond_args_t gpu_table_indexer(_gpu_table_indexer), d_gpu_n_bonds(_d_gpu_n_bonds), n_bond_types(_n_bond_types), - block_size(_block_size), - compute_capability(_compute_capability) + block_size(_block_size) { }; @@ -71,18 +69,9 @@ struct bond_args_t const unsigned int *d_gpu_n_bonds; //!< List of number of bonds stored on the GPU const unsigned int n_bond_types; //!< Number of bond types in the simulation const unsigned int block_size; //!< Block size to execute - const unsigned int compute_capability; //!< Compute capability of the device }; #ifdef NVCC -//! Texture for reading particle positions -scalar4_tex_t pdata_pos_tex; - -//! Texture for reading particle diameters -scalar_tex_t pdata_diam_tex; - -//! Texture for reading particle charges -scalar_tex_t pdata_charge_tex; //! Kernel for calculating bond forces /*! This kernel is called to calculate the bond forces on all N particles. Actual evaluation of the potentials and @@ -150,14 +139,14 @@ __global__ void gpu_compute_bond_forces_kernel(Scalar4 *d_force, int n_bonds =n_bonds_list[idx]; // read in the position of our particle. 
(MEM TRANSFER: 16 bytes) - Scalar4 postype = texFetchScalar4(d_pos, pdata_pos_tex, idx); + Scalar4 postype = __ldg(d_pos + idx); Scalar3 pos = make_scalar3(postype.x, postype.y, postype.z); // read in the diameter of our particle if needed Scalar diam(0); if (evaluator::needsDiameter()) { - diam = texFetchScalar(d_diameter, pdata_diam_tex, idx); + diam = __ldg(d_diameter + idx); } else diam += 0; // shut up compiler warning @@ -165,7 +154,7 @@ __global__ void gpu_compute_bond_forces_kernel(Scalar4 *d_force, Scalar q(0); if (evaluator::needsCharge()) { - q = texFetchScalar(d_charge, pdata_charge_tex, idx); + q = __ldg(d_charge + idx); } else q += 0; // shut up compiler warning @@ -186,7 +175,7 @@ __global__ void gpu_compute_bond_forces_kernel(Scalar4 *d_force, int cur_bond_type = cur_bond.idx[1]; // get the bonded particle's position (MEM_TRANSFER: 16 bytes) - Scalar4 neigh_postypej = texFetchScalar4(d_pos, pdata_pos_tex, cur_bond_idx); + Scalar4 neigh_postypej = __ldg(d_pos + cur_bond_idx); Scalar3 neigh_pos= make_scalar3(neigh_postypej.x, neigh_postypej.y, neigh_postypej.z); // calculate dr (FLOPS: 3) @@ -209,12 +198,12 @@ __global__ void gpu_compute_bond_forces_kernel(Scalar4 *d_force, // get the bonded particle's diameter if needed if (evaluator::needsDiameter()) { - Scalar neigh_diam = texFetchScalar(d_diameter, pdata_diam_tex, cur_bond_idx); + Scalar neigh_diam = __ldg(d_diameter + cur_bond_idx); eval.setDiameter(diam, neigh_diam); } if (evaluator::needsCharge()) { - Scalar neigh_q = texFetchScalar(d_charge, pdata_charge_tex, cur_bond_idx); + Scalar neigh_q = __ldg(d_charge + cur_bond_idx); eval.setCharge(q, neigh_q); } @@ -286,29 +275,6 @@ cudaError_t gpu_compute_bond_forces(const bond_args_t& bond_args, dim3 grid( bond_args.N / run_block_size + 1, 1, 1); dim3 threads(run_block_size, 1, 1); - // bind the position texture on pre sm35 devices - if (bond_args.compute_capability < 350) - { - pdata_pos_tex.normalized = false; - pdata_pos_tex.filterMode = cudaFilterModePoint; - cudaError_t error = cudaBindTexture(0, pdata_pos_tex, bond_args.d_pos, sizeof(Scalar4)*(bond_args.n_max)); - if (error != cudaSuccess) - return error; - - // bind the diameter texture - pdata_diam_tex.normalized = false; - pdata_diam_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, pdata_diam_tex, bond_args.d_diameter, sizeof(Scalar) *(bond_args.n_max)); - if (error != cudaSuccess) - return error; - - pdata_charge_tex.normalized = false; - pdata_charge_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, pdata_charge_tex, bond_args.d_charge, sizeof(Scalar) * (bond_args.n_max)); - if (error != cudaSuccess) - return error; - } - unsigned int shared_bytes = sizeof(typename evaluator::param_type) * bond_args.n_bond_types; diff --git a/hoomd/md/PotentialBondGPU.h b/hoomd/md/PotentialBondGPU.h index b1258171e2..06e0a7dac3 100644 --- a/hoomd/md/PotentialBondGPU.h +++ b/hoomd/md/PotentialBondGPU.h @@ -139,8 +139,7 @@ void PotentialBondGPU< evaluator, gpu_cgbf >::computeForces(unsigned int timeste gpu_table_indexer, d_gpu_n_bonds.data, this->m_bond_data->getNTypes(), - this->m_tuner->getParam(), - this->m_exec_conf->getComputeCapability()), + this->m_tuner->getParam()), d_params.data, d_flags.data); } diff --git a/hoomd/md/PotentialPairDPDThermoGPU.cuh b/hoomd/md/PotentialPairDPDThermoGPU.cuh index 2b4e6f304f..2818607b3e 100644 --- a/hoomd/md/PotentialPairDPDThermoGPU.cuh +++ b/hoomd/md/PotentialPairDPDThermoGPU.cuh @@ -49,9 +49,7 @@ struct dpd_pair_args_t const Scalar _T, const unsigned int 
_shift_mode, const unsigned int _compute_virial, - const unsigned int _threads_per_particle, - const unsigned int _compute_capability, - const unsigned int _max_tex1d_width) + const unsigned int _threads_per_particle) : d_force(_d_force), d_virial(_d_virial), virial_pitch(_virial_pitch), @@ -74,9 +72,7 @@ struct dpd_pair_args_t T(_T), shift_mode(_shift_mode), compute_virial(_compute_virial), - threads_per_particle(_threads_per_particle), - compute_capability(_compute_capability), - max_tex1d_width(_max_tex1d_width) + threads_per_particle(_threads_per_particle) { }; @@ -103,22 +99,9 @@ struct dpd_pair_args_t const unsigned int shift_mode; //!< The potential energy shift mode const unsigned int compute_virial; //!< Flag to indicate if virials should be computed const unsigned int threads_per_particle; //!< Number of threads per particle (maximum: 32==1 warp) - const unsigned int compute_capability; //!< Compute capability of the device (20, 30, 35, ...) - const unsigned int max_tex1d_width; //!< Maximum width of a 1d linear texture }; #ifdef NVCC -//! Texture for reading particle positions -scalar4_tex_t pdata_dpd_pos_tex; - -//! Texture for reading particle velocities -scalar4_tex_t pdata_dpd_vel_tex; - -//! Texture for reading particle tags -texture pdata_dpd_tag_tex; - -//! Texture for reading neighbor list -texture nlist_tex; //! Kernel for calculating pair forces /*! This kernel is called to calculate the pair forces on all N particles. Actual evaluation of the potentials and @@ -153,8 +136,6 @@ texture nlist_tex; \tparam evaluator EvaluatorPair class to evaluate V(r) and -delta V(r)/r \tparam shift_mode 0: No energy shifting is done. 1: V(r) is shifted to be 0 at rcut. \tparam compute_virial When non-zero, the virial tensor is computed. When zero, the virial tensor is not computed. - \tparam use_gmem_nlist When non-zero, the neighbor list is read out of global memory. When zero, textures or __ldg - is used depending on architecture. Implementation details Each block will calculate the forces on a block of particles. @@ -222,26 +203,19 @@ __global__ void gpu_compute_dpd_forces_kernel(Scalar4 *d_force, // read in the position of our particle. // (MEM TRANSFER: 16 bytes) - Scalar4 postypei = texFetchScalar4(d_pos, pdata_dpd_pos_tex, idx); + Scalar4 postypei = __ldg(d_pos + idx); Scalar3 posi = make_scalar3(postypei.x, postypei.y, postypei.z); // read in the velocity of our particle. // (MEM TRANSFER: 16 bytes) - Scalar4 velmassi = texFetchScalar4(d_vel, pdata_dpd_vel_tex, idx); + Scalar4 velmassi = __ldg(d_vel + idx); Scalar3 veli = make_scalar3(velmassi.x, velmassi.y, velmassi.z); // prefetch neighbor index const unsigned int head_idx = d_head_list[idx]; unsigned int cur_j = 0; unsigned int next_j(0); - if (use_gmem_nlist) - { - next_j = (threadIdx.x%tpp < n_neigh) ? d_nlist[head_idx + threadIdx.x%tpp] : 0; - } - else - { - next_j = (threadIdx.x%tpp < n_neigh) ? texFetchUint(d_nlist, nlist_tex, head_idx + threadIdx.x%tpp) : 0; - } + next_j = (threadIdx.x%tpp < n_neigh) ? 
__ldg(d_nlist + head_idx + threadIdx.x%tpp) : 0; // this particle's tag unsigned int tagi = d_tag[idx]; @@ -255,22 +229,15 @@ __global__ void gpu_compute_dpd_forces_kernel(Scalar4 *d_force, cur_j = next_j; if (neigh_idx+tpp < n_neigh) { - if (use_gmem_nlist) - { - next_j = d_nlist[head_idx + neigh_idx + tpp]; - } - else - { - next_j = texFetchUint(d_nlist, nlist_tex, head_idx + neigh_idx + tpp); - } + next_j = __ldg(d_nlist + head_idx + neigh_idx + tpp); } // get the neighbor's position (MEM TRANSFER: 16 bytes) - Scalar4 postypej = texFetchScalar4(d_pos, pdata_dpd_pos_tex, cur_j); + Scalar4 postypej = __ldg(d_pos + cur_j); Scalar3 posj = make_scalar3(postypej.x, postypej.y, postypej.z); // get the neighbor's position (MEM TRANSFER: 16 bytes) - Scalar4 velmassj = texFetchScalar4(d_vel, pdata_dpd_vel_tex, cur_j); + Scalar4 velmassj = __ldg(d_vel + cur_j); Scalar3 velj = make_scalar3(velmassj.x, velmassj.y, velmassj.z); // calculate dr (with periodic boundary conditions) (FLOPS: 3) @@ -308,7 +275,7 @@ __global__ void gpu_compute_dpd_forces_kernel(Scalar4 *d_force, // Special Potential Pair DPD Requirements // use particle i's and j's tags - unsigned int tagj = texFetchUint(d_tag, pdata_dpd_tag_tex, cur_j); + unsigned int tagj = __ldg(d_tag + cur_j); eval.set_seed_ij_timestep(d_seed,tagi,tagj,d_timestep); eval.setDeltaT(d_deltaT); eval.setRDotV(rdotv); @@ -374,30 +341,6 @@ int dpd_get_max_block_size(T func) return max_threads; } -inline void gpu_dpd_pair_force_bind_textures(const dpd_pair_args_t pair_args) - { - // bind the position texture - pdata_dpd_pos_tex.normalized = false; - pdata_dpd_pos_tex.filterMode = cudaFilterModePoint; - cudaBindTexture(0, pdata_dpd_pos_tex, pair_args.d_pos, sizeof(Scalar4)*pair_args.n_max); - - // bind the diameter texture - pdata_dpd_vel_tex.normalized = false; - pdata_dpd_vel_tex.filterMode = cudaFilterModePoint; - cudaBindTexture(0, pdata_dpd_vel_tex, pair_args.d_vel, sizeof(Scalar4) * pair_args.n_max); - - pdata_dpd_tag_tex.normalized = false; - pdata_dpd_tag_tex.filterMode = cudaFilterModePoint; - cudaBindTexture(0, pdata_dpd_tag_tex, pair_args.d_tag, sizeof(unsigned int) * pair_args.n_max); - - if (pair_args.size_nlist <= pair_args.max_tex1d_width) - { - nlist_tex.normalized = false; - nlist_tex.filterMode = cudaFilterModePoint; - cudaBindTexture(0, nlist_tex, pair_args.d_nlist, sizeof(unsigned int) * pair_args.size_nlist); - } - } - //! DPD force compute kernel launcher /*! * \tparam evaluator EvaluatorPair class to evualuate V(r) and -delta V(r)/r @@ -434,8 +377,6 @@ struct DPDForceComputeKernel if (max_block_size == UINT_MAX) max_block_size = dpd_get_max_block_size(gpu_compute_dpd_forces_kernel); - if (args.compute_capability < 35) gpu_dpd_pair_force_bind_textures(args); - block_size = block_size < max_block_size ? 
block_size : max_block_size; dim3 grid(args.N / (block_size/tpp) + 1, 1, 1); @@ -492,82 +433,40 @@ cudaError_t gpu_compute_dpd_forces(const dpd_pair_args_t& args, assert(args.ntypes > 0); // run the kernel - if (args.compute_capability < 35 && args.size_nlist > args.max_tex1d_width) + if (args.compute_virial) { - if (args.compute_virial) + switch (args.shift_mode) { - switch (args.shift_mode) + case 0: { - case 0: - { - DPDForceComputeKernel::launch(args, d_params); - break; - } - case 1: - { - DPDForceComputeKernel::launch(args, d_params); - break; - } - default: - return cudaErrorUnknown; + DPDForceComputeKernel::launch(args, d_params); + break; } - } - else - { - switch (args.shift_mode) + case 1: { - case 0: - { - DPDForceComputeKernel::launch(args, d_params); - break; - } - case 1: - { - DPDForceComputeKernel::launch(args, d_params); - break; - } - default: - return cudaErrorUnknown; + DPDForceComputeKernel::launch(args, d_params); + break; } + default: + return cudaErrorUnknown; } } else { - if (args.compute_virial) + switch (args.shift_mode) { - switch (args.shift_mode) + case 0: { - case 0: - { - DPDForceComputeKernel::launch(args, d_params); - break; - } - case 1: - { - DPDForceComputeKernel::launch(args, d_params); - break; - } - default: - return cudaErrorUnknown; + DPDForceComputeKernel::launch(args, d_params); + break; } - } - else - { - switch (args.shift_mode) + case 1: { - case 0: - { - DPDForceComputeKernel::launch(args, d_params); - break; - } - case 1: - { - DPDForceComputeKernel::launch(args, d_params); - break; - } - default: - return cudaErrorUnknown; + DPDForceComputeKernel::launch(args, d_params); + break; } + default: + return cudaErrorUnknown; } } diff --git a/hoomd/md/PotentialPairDPDThermoGPU.h b/hoomd/md/PotentialPairDPDThermoGPU.h index dcba7d7d70..fab74fed83 100644 --- a/hoomd/md/PotentialPairDPDThermoGPU.h +++ b/hoomd/md/PotentialPairDPDThermoGPU.h @@ -179,9 +179,7 @@ void PotentialPairDPDThermoGPU< evaluator, gpu_cpdf >::computeForces(unsigned in this->m_T->getValue(timestep), this->m_shift_mode, flags[pdata_flag::pressure_tensor] || flags[pdata_flag::isotropic_virial], - threads_per_particle, - this->m_exec_conf->getComputeCapability()/10, - this->m_exec_conf->dev_prop.maxTexture1DLinear), + threads_per_particle), d_params.data); if (this->m_exec_conf->isCUDAErrorCheckingEnabled()) diff --git a/hoomd/md/PotentialPairGPU.cuh b/hoomd/md/PotentialPairGPU.cuh index ab9c1c2e45..357c9b127d 100644 --- a/hoomd/md/PotentialPairGPU.cuh +++ b/hoomd/md/PotentialPairGPU.cuh @@ -53,8 +53,6 @@ struct pair_args_t const unsigned int _shift_mode, const unsigned int _compute_virial, const unsigned int _threads_per_particle, - const unsigned int _compute_capability, - const unsigned int _max_tex1d_width, const GPUPartition& _gpu_partition) : d_force(_d_force), d_virial(_d_virial), @@ -76,8 +74,6 @@ struct pair_args_t shift_mode(_shift_mode), compute_virial(_compute_virial), threads_per_particle(_threads_per_particle), - compute_capability(_compute_capability), - max_tex1d_width(_max_tex1d_width), gpu_partition(_gpu_partition) { }; @@ -102,25 +98,10 @@ struct pair_args_t const unsigned int shift_mode; //!< The potential energy shift mode const unsigned int compute_virial; //!< Flag to indicate if virials should be computed const unsigned int threads_per_particle; //!< Number of threads per particle (maximum: 1 warp) - const unsigned int compute_capability; //!< Compute capability (20 30 35, ...) 
- const unsigned int max_tex1d_width; //!< Maximum width of a linear 1D texture const GPUPartition& gpu_partition; //!< The load balancing partition of particles between GPUs }; #ifdef NVCC -//! Texture for reading particle positions -scalar4_tex_t pdata_pos_tex; - -//! Texture for reading particle diameters -scalar_tex_t pdata_diam_tex; - -//! Texture for reading particle charges -scalar_tex_t pdata_charge_tex; - -// there is some naming conflict between the DPD pair force and PotentialPair because -// the DPD does not extend PotentialPair, and so we need to choose a different name for this texture -//! Texture for reading neighbor list -texture pair_nlist_tex; //! Kernel for calculating pair forces /*! This kernel is called to calculate the pair forces on all N particles. Actual evaluation of the potentials and @@ -153,8 +134,6 @@ texture pair_nlist_tex; \tparam shift_mode 0: No energy shifting is done. 1: V(r) is shifted to be 0 at rcut. 2: XPLOR switching is enabled (See PotentialPair for a discussion on what that entails) \tparam compute_virial When non-zero, the virial tensor is computed. When zero, the virial tensor is not computed. - \tparam use_gmem_nlist When non-zero, the neighbor list is read out of global memory. When zero, textures or __ldg - is used depending on architecture. \tparam tpp Number of threads to use per particle, must be power of 2 and smaller than warp size Implementation details @@ -162,7 +141,7 @@ texture pair_nlist_tex; Each group of \a tpp threads will calculate the total force on one particle. The neighborlist is arranged in columns so that reads are fully coalesced when doing this. */ -template< class evaluator, unsigned int shift_mode, unsigned int compute_virial, unsigned int use_gmem_nlist, int tpp> +template< class evaluator, unsigned int shift_mode, unsigned int compute_virial, int tpp> __global__ void gpu_compute_pair_forces_shared_kernel(Scalar4 *d_force, Scalar *d_virial, const unsigned int virial_pitch, @@ -231,17 +210,17 @@ __global__ void gpu_compute_pair_forces_shared_kernel(Scalar4 *d_force, // read in the position of our particle. // (MEM TRANSFER: 16 bytes) - Scalar4 postypei = texFetchScalar4(d_pos, pdata_pos_tex, idx); + Scalar4 postypei = __ldg(d_pos + idx); Scalar3 posi = make_scalar3(postypei.x, postypei.y, postypei.z); Scalar di; if (evaluator::needsDiameter()) - di = texFetchScalar(d_diameter, pdata_diam_tex, idx); + di = __ldg(d_diameter + idx); else di += Scalar(1.0); // shut up compiler warning Scalar qi; if (evaluator::needsCharge()) - qi = texFetchScalar(d_charge, pdata_charge_tex, idx); + qi = __ldg(d_charge + idx); else qi += Scalar(1.0); // shut up compiler warning @@ -249,14 +228,7 @@ __global__ void gpu_compute_pair_forces_shared_kernel(Scalar4 *d_force, unsigned int cur_j = 0; unsigned int next_j(0); - if (use_gmem_nlist) - { - next_j = (threadIdx.x%tpp < n_neigh) ? d_nlist[my_head + threadIdx.x%tpp] : 0; - } - else - { - next_j = threadIdx.x%tpp < n_neigh ? texFetchUint(d_nlist, pair_nlist_tex, my_head + threadIdx.x%tpp) : 0; - } + next_j = threadIdx.x%tpp < n_neigh ? 
__ldg(d_nlist + my_head + threadIdx.x%tpp) : 0; // loop over neighbors for (int neigh_idx = threadIdx.x%tpp; neigh_idx < n_neigh; neigh_idx+=tpp) @@ -266,28 +238,21 @@ __global__ void gpu_compute_pair_forces_shared_kernel(Scalar4 *d_force, cur_j = next_j; if (neigh_idx+tpp < n_neigh) { - if (use_gmem_nlist) - { - next_j = d_nlist[my_head + neigh_idx + tpp]; - } - else - { - next_j = texFetchUint(d_nlist, pair_nlist_tex, my_head + neigh_idx+tpp); - } + next_j = __ldg(d_nlist + my_head + neigh_idx+tpp); } // get the neighbor's position (MEM TRANSFER: 16 bytes) - Scalar4 postypej = texFetchScalar4(d_pos, pdata_pos_tex, cur_j); + Scalar4 postypej = __ldg(d_pos + cur_j); Scalar3 posj = make_scalar3(postypej.x, postypej.y, postypej.z); Scalar dj = Scalar(0.0); if (evaluator::needsDiameter()) - dj = texFetchScalar(d_diameter, pdata_diam_tex, cur_j); + dj = __ldg(d_diameter + cur_j); else dj += Scalar(1.0); // shut up compiler warning Scalar qj = Scalar(0.0); if (evaluator::needsCharge()) - qj = texFetchScalar(d_charge, pdata_charge_tex, cur_j); + qj = __ldg(d_charge + cur_j); else qj += Scalar(1.0); // shut up compiler warning @@ -423,57 +388,18 @@ int get_max_block_size(T func) return max_threads; } -inline void gpu_pair_force_bind_textures(const pair_args_t pair_args) - { - // bind the position texture - pdata_pos_tex.normalized = false; - pdata_pos_tex.filterMode = cudaFilterModePoint; - cudaBindTexture(0, pdata_pos_tex, pair_args.d_pos, sizeof(Scalar4)*pair_args.n_max); - - // bind the diameter texture - pdata_diam_tex.normalized = false; - pdata_diam_tex.filterMode = cudaFilterModePoint; - cudaBindTexture(0, pdata_diam_tex, pair_args.d_diameter, sizeof(Scalar) * pair_args.n_max); - - pdata_charge_tex.normalized = false; - pdata_charge_tex.filterMode = cudaFilterModePoint; - cudaBindTexture(0, pdata_charge_tex, pair_args.d_charge, sizeof(Scalar) * pair_args.n_max); - - // bind the neighborlist texture if it will fit - if (pair_args.size_neigh_list <= pair_args.max_tex1d_width) - { - pair_nlist_tex.normalized = false; - pair_nlist_tex.filterMode = cudaFilterModePoint; - cudaBindTexture(0, pair_nlist_tex, pair_args.d_nlist, sizeof(unsigned int) * pair_args.size_neigh_list); - } - } - -inline void gpu_pair_force_unbind_textures(const pair_args_t pair_args) - { - cudaUnbindTexture(pdata_pos_tex); - cudaUnbindTexture(pdata_diam_tex); - cudaUnbindTexture(pdata_charge_tex); - - if (pair_args.size_neigh_list <= pair_args.max_tex1d_width) - { - cudaUnbindTexture(pair_nlist_tex); - } - } - //! Pair force compute kernel launcher /*! * \tparam evaluator EvaluatorPair class to evaluate V(r) and -delta V(r)/r * \tparam shift_mode 0: No energy shifting is done. 1: V(r) is shifted to be 0 at rcut. 2: XPLOR switching is enabled * (See PotentialPair for a discussion on what that entails) * \tparam compute_virial When non-zero, the virial tensor is computed. When zero, the virial tensor is not computed. - * \tparam use_gmem_nlist When non-zero, the neighbor list is read out of global memory. When zero, textures or __ldg - * is used depending on architecture. * \tparam tpp Number of threads to use per particle, must be power of 2 and smaller than warp size * * Partial function template specialization is not allowed in C++, so instead we have to wrap this with a struct that * we are allowed to partially specialize. */ -template +template struct PairForceComputeKernel { //! 
Launcher for the pair force kernel @@ -501,31 +427,27 @@ struct PairForceComputeKernel static unsigned int max_block_size = UINT_MAX; if (max_block_size == UINT_MAX) - max_block_size = get_max_block_size(gpu_compute_pair_forces_shared_kernel); - - if (pair_args.compute_capability < 35) gpu_pair_force_bind_textures(pair_args); + max_block_size = get_max_block_size(gpu_compute_pair_forces_shared_kernel); block_size = block_size < max_block_size ? block_size : max_block_size; dim3 grid(N / (block_size/tpp) + 1, 1, 1); - gpu_compute_pair_forces_shared_kernel + gpu_compute_pair_forces_shared_kernel <<>>(pair_args.d_force, pair_args.d_virial, pair_args.virial_pitch, N, pair_args.d_pos, pair_args.d_diameter, pair_args.d_charge, pair_args.box, pair_args.d_n_neigh, pair_args.d_nlist, pair_args.d_head_list, d_params, pair_args.d_rcutsq, pair_args.d_ronsq, pair_args.ntypes, offset); - - if (pair_args.compute_capability < 35) gpu_pair_force_unbind_textures(pair_args); } else { - PairForceComputeKernel::launch(pair_args, range, d_params); + PairForceComputeKernel::launch(pair_args, range, d_params); } } }; //! Template specialization to do nothing for the tpp = 0 case -template -struct PairForceComputeKernel +template +struct PairForceComputeKernel { static void launch(const pair_args_t& pair_args, std::pair range, const typename evaluator::param_type *d_params) { @@ -554,102 +476,50 @@ cudaError_t gpu_compute_pair_forces(const pair_args_t& pair_args, auto range = pair_args.gpu_partition.getRangeAndSetGPU(idev); // Launch kernel - if (pair_args.compute_capability < 35 && pair_args.size_neigh_list > pair_args.max_tex1d_width) - { // fall back to slow global loads when the neighbor list is too big for texture memory - if (pair_args.compute_virial) + if (pair_args.compute_virial) + { + switch (pair_args.shift_mode) { - switch (pair_args.shift_mode) + case 0: { - case 0: - { - PairForceComputeKernel::launch(pair_args, range, d_params); - break; - } - case 1: - { - PairForceComputeKernel::launch(pair_args, range, d_params); - break; - } - case 2: - { - PairForceComputeKernel::launch(pair_args, range, d_params); - break; - } - default: - break; + PairForceComputeKernel::launch(pair_args, range, d_params); + break; } - } - else - { - switch (pair_args.shift_mode) + case 1: { - case 0: - { - PairForceComputeKernel::launch(pair_args, range, d_params); - break; - } - case 1: - { - PairForceComputeKernel::launch(pair_args, range, d_params); - break; - } - case 2: - { - PairForceComputeKernel::launch(pair_args, range, d_params); - break; - } - default: - break; + PairForceComputeKernel::launch(pair_args, range, d_params); + break; } + case 2: + { + PairForceComputeKernel::launch(pair_args, range, d_params); + break; + } + default: + break; } } else { - if (pair_args.compute_virial) + switch (pair_args.shift_mode) { - switch (pair_args.shift_mode) + case 0: { - case 0: - { - PairForceComputeKernel::launch(pair_args, range, d_params); - break; - } - case 1: - { - PairForceComputeKernel::launch(pair_args, range, d_params); - break; - } - case 2: - { - PairForceComputeKernel::launch(pair_args, range, d_params); - break; - } - default: - break; + PairForceComputeKernel::launch(pair_args, range, d_params); + break; } - } - else - { - switch (pair_args.shift_mode) + case 1: { - case 0: - { - PairForceComputeKernel::launch(pair_args, range, d_params); - break; - } - case 1: - { - PairForceComputeKernel::launch(pair_args, range, d_params); - break; - } - case 2: - { - PairForceComputeKernel::launch(pair_args, range, 
d_params); - break; - } - default: - break; + PairForceComputeKernel::launch(pair_args, range, d_params); + break; + } + case 2: + { + PairForceComputeKernel::launch(pair_args, range, d_params); + break; } + default: + break; } } } diff --git a/hoomd/md/PotentialPairGPU.h b/hoomd/md/PotentialPairGPU.h index 57cd3773ab..519a825e4e 100644 --- a/hoomd/md/PotentialPairGPU.h +++ b/hoomd/md/PotentialPairGPU.h @@ -188,8 +188,6 @@ void PotentialPairGPU< evaluator, gpu_cgpf >::computeForces(unsigned int timeste this->m_shift_mode, flags[pdata_flag::pressure_tensor] || flags[pdata_flag::isotropic_virial], threads_per_particle, - this->m_exec_conf->getComputeCapability()/10, - this->m_exec_conf->dev_prop.maxTexture1DLinear, this->m_pdata->getGPUPartition()), d_params.data); diff --git a/hoomd/md/PotentialSpecialPairGPU.h b/hoomd/md/PotentialSpecialPairGPU.h index 75f1c03b94..fddc6012a2 100644 --- a/hoomd/md/PotentialSpecialPairGPU.h +++ b/hoomd/md/PotentialSpecialPairGPU.h @@ -140,8 +140,7 @@ void PotentialSpecialPairGPU< evaluator, gpu_cgbf >::computeForces(unsigned int gpu_table_indexer, d_gpu_n_bonds.data, this->m_pair_data->getNTypes(), - this->m_tuner->getParam(), - this->m_exec_conf->getComputeCapability()), + this->m_tuner->getParam()), d_params.data, d_flags.data); } diff --git a/hoomd/md/PotentialTersoffGPU.cuh b/hoomd/md/PotentialTersoffGPU.cuh index 5af4067b16..f86dd68e67 100644 --- a/hoomd/md/PotentialTersoffGPU.cuh +++ b/hoomd/md/PotentialTersoffGPU.cuh @@ -42,7 +42,6 @@ struct tersoff_args_t const unsigned int _ntypes, const unsigned int _block_size, const unsigned int _tpp, - const unsigned int _compute_capability, const cudaDeviceProp& _devprop) : d_force(_d_force), N(_N), @@ -61,7 +60,6 @@ struct tersoff_args_t ntypes(_ntypes), block_size(_block_size), tpp(_tpp), - compute_capability(_compute_capability), devprop(_devprop) { }; @@ -83,17 +81,11 @@ struct tersoff_args_t const unsigned int ntypes; //!< Number of particle types in the simulation const unsigned int block_size; //!< Block size to execute const unsigned int tpp; //!< Threads per particle - const unsigned int compute_capability; //!< GPU compute capability (20, 30, 35, ...) const cudaDeviceProp& devprop; //!< CUDA device properties }; #ifdef NVCC -//! Texture for reading particle positions -scalar4_tex_t pdata_pos_tex; - -//! Texture for reading neighbor list -texture nlist_tex; #if !defined(SINGLE_PRECISION) @@ -154,15 +146,13 @@ __device__ float myAtomicAdd(float* address, float val) Certain options are controlled via template parameters to avoid the performance hit when they are not enabled. \tparam evaluator EvaluatorPair class to evaluate V(r) and -delta V(r)/r - \tparam use_gmem_nlist When non-zero, the neighbor list is read out of global memory. When zero, textures or __ldg - is used depending on architecture. Implementation details Each block will calculate the forces on a block of particles. Each thread will calculate the total force on one particle. The neighborlist is arranged in columns so that reads are fully coalesced when doing this. 
*/ -template< class evaluator , unsigned char use_gmem_nlist, unsigned char compute_virial, int tpp> +template< class evaluator, unsigned char compute_virial, int tpp> __global__ void gpu_compute_triplet_forces_kernel(Scalar4 *d_force, const unsigned int N, Scalar *d_virial, @@ -216,7 +206,7 @@ __global__ void gpu_compute_triplet_forces_kernel(Scalar4 *d_force, unsigned int n_neigh = d_n_neigh[idx]; // read in the position of the particle - Scalar4 postypei = texFetchScalar4(d_pos, pdata_pos_tex, idx); + Scalar4 postypei = __ldg(d_pos + idx); Scalar3 posi = make_scalar3(postypei.x, postypei.y, postypei.z); // initialize the force to 0 @@ -238,14 +228,7 @@ __global__ void gpu_compute_triplet_forces_kernel(Scalar4 *d_force, unsigned int next_j(0); unsigned int my_head = d_head_list[idx]; - if (use_gmem_nlist) - { - next_j = (threadIdx.x%tpp < n_neigh) ? d_nlist[my_head + threadIdx.x%tpp] : 0; - } - else - { - next_j = threadIdx.x%tpp < n_neigh ? texFetchUint(d_nlist, nlist_tex, my_head + threadIdx.x%tpp) : 0; - } + next_j = threadIdx.x%tpp < n_neigh ? __ldg(d_nlist + my_head + threadIdx.x%tpp) : 0; // loop over neighbors in strided way for (int neigh_idx = threadIdx.x%tpp; neigh_idx < n_neigh; neigh_idx+=tpp) @@ -255,18 +238,11 @@ __global__ void gpu_compute_triplet_forces_kernel(Scalar4 *d_force, cur_j = next_j; if (neigh_idx+tpp < n_neigh) { - if (use_gmem_nlist) - { - next_j = d_nlist[head_idx + neigh_idx + tpp]; - } - else - { - next_j = texFetchUint(d_nlist, nlist_tex, head_idx + neigh_idx + tpp); - } + next_j = __ldg(d_nlist + head_idx + neigh_idx + tpp); } // read the position of j (MEM TRANSFER: 16 bytes) - Scalar4 postypej = texFetchScalar4(d_pos, pdata_pos_tex, cur_j); + Scalar4 postypej = __ldg(d_pos + cur_j); Scalar3 posj = make_scalar3(postypej.x, postypej.y, postypej.z); // initialize the force on j @@ -295,13 +271,11 @@ __global__ void gpu_compute_triplet_forces_kernel(Scalar4 *d_force, { Scalar phi = s_phi_ab[threadIdx.x*ntypes+typ_b]; - #if (__CUDA_ARCH__ >= 300) // reduce in warp phi = hoomd::detail::WarpReduce().Sum(phi); // broadcast into shared mem s_phi_ab[threadIdx.x*ntypes+typ_b] = hoomd::detail::WarpScan().Broadcast(phi, 0); - #endif if (threadIdx.x % tpp == 0) { @@ -324,14 +298,7 @@ __global__ void gpu_compute_triplet_forces_kernel(Scalar4 *d_force, unsigned int next_j(0); unsigned int my_head = d_head_list[idx]; - if (use_gmem_nlist) - { - next_j = (threadIdx.x%tpp < n_neigh) ? d_nlist[my_head + threadIdx.x%tpp] : 0; - } - else - { - next_j = threadIdx.x%tpp < n_neigh ? texFetchUint(d_nlist, nlist_tex, my_head + threadIdx.x%tpp) : 0; - } + next_j = threadIdx.x%tpp < n_neigh ? 
__ldg(d_nlist + my_head + threadIdx.x%tpp) : 0; // loop over neighbors in strided way for (int neigh_idx = threadIdx.x%tpp; neigh_idx < n_neigh; neigh_idx+=tpp) @@ -341,18 +308,11 @@ __global__ void gpu_compute_triplet_forces_kernel(Scalar4 *d_force, cur_j = next_j; if (neigh_idx+tpp < n_neigh) { - if (use_gmem_nlist) - { - next_j = d_nlist[head_idx + neigh_idx + tpp]; - } - else - { - next_j = texFetchUint(d_nlist, nlist_tex, head_idx + neigh_idx + tpp); - } + next_j = __ldg(d_nlist + head_idx + neigh_idx + tpp); } // read the position of j (MEM TRANSFER: 16 bytes) - Scalar4 postypej = texFetchScalar4(d_pos, pdata_pos_tex, cur_j); + Scalar4 postypej = __ldg(d_pos + cur_j); Scalar3 posj = make_scalar3(postypej.x, postypej.y, postypej.z); // initialize the force on j @@ -394,31 +354,17 @@ __global__ void gpu_compute_triplet_forces_kernel(Scalar4 *d_force, // compute chi unsigned int cur_k = 0; unsigned int next_k(0); - if (use_gmem_nlist) - { - next_k = d_nlist[head_idx]; - } - else - { - next_k = texFetchUint(d_nlist, nlist_tex, head_idx); - } + next_k = __ldg(d_nlist + head_idx); // loop over neighbors one by one for (int neigh_idy = 0; neigh_idy < n_neigh; neigh_idy++) { // read the current index of k and prefetch the next one cur_k = next_k; - if (use_gmem_nlist) - { - next_k = d_nlist[head_idx + neigh_idy + 1]; - } - else - { - next_k = texFetchUint(d_nlist, nlist_tex, head_idx + neigh_idy+1); - } + next_k = __ldg(d_nlist + head_idx + neigh_idy+1); // get the position of neighbor k - Scalar4 postypek = texFetchScalar4(d_pos, pdata_pos_tex, cur_k); + Scalar4 postypek = __ldg(d_pos + cur_k); Scalar3 posk = make_scalar3(postypek.x, postypek.y, postypek.z); // get the type pair parameters for i and k @@ -506,31 +452,17 @@ __global__ void gpu_compute_triplet_forces_kernel(Scalar4 *d_force, // now evaluate the force from the ik interactions unsigned int cur_k = 0; unsigned int next_k(0); - if (use_gmem_nlist) - { - next_k = d_nlist[head_idx]; - } - else - { - next_k = texFetchUint(d_nlist, nlist_tex, head_idx); - } + next_k = __ldg(d_nlist + head_idx); // loop over neighbors one by one for (int neigh_idy = 0; neigh_idy < n_neigh; neigh_idy++) { // read the current neighbor index and prefetch the next one cur_k = next_k; - if (use_gmem_nlist) - { - next_k = d_nlist[head_idx + neigh_idy + 1]; - } - else - { - next_k = texFetchUint(d_nlist, nlist_tex, head_idx + neigh_idy+1); - } + next_k = __ldg(d_nlist + head_idx + neigh_idy+1); // get the position of neighbor k - Scalar4 postypek = texFetchScalar4(d_pos, pdata_pos_tex, cur_k); + Scalar4 postypek = __ldg(d_pos + cur_k); Scalar3 posk = make_scalar3(postypek.x, postypek.y, postypek.z); // get the type pair parameters for i and k @@ -716,7 +648,7 @@ void get_max_block_size(T func, const tersoff_args_t& pair_args, unsigned int& m * Partial function template specialization is not allowed in C++, so instead we have to wrap this with a struct that * we are allowed to partially specialize. */ -template +template struct TersoffComputeKernel { //! 
Launcher for the tersoff triplet kernel @@ -731,24 +663,9 @@ struct TersoffComputeKernel static unsigned int max_block_size = UINT_MAX; static unsigned int kernel_shared_bytes = 0; if (max_block_size == UINT_MAX) - get_max_block_size(gpu_compute_triplet_forces_kernel, pair_args, max_block_size, kernel_shared_bytes); + get_max_block_size(gpu_compute_triplet_forces_kernel, pair_args, max_block_size, kernel_shared_bytes); int run_block_size = min(pair_args.block_size, max_block_size); - // bind to texture - if (pair_args.compute_capability < 35) - { - pdata_pos_tex.normalized = false; - pdata_pos_tex.filterMode = cudaFilterModePoint; - cudaBindTexture(0, pdata_pos_tex, pair_args.d_pos, sizeof(Scalar4) * (pair_args.N+pair_args.Nghosts)); - - if (pair_args.size_nlist <= (unsigned int) pair_args.devprop.maxTexture1DLinear) - { - nlist_tex.normalized = false; - nlist_tex.filterMode = cudaFilterModePoint; - cudaBindTexture(0, nlist_tex, pair_args.d_nlist, sizeof(unsigned int) * pair_args.size_nlist); - } - } - // size shared bytes Index2D typpair_idx(pair_args.ntypes); unsigned int shared_bytes = (sizeof(Scalar) + sizeof(typename evaluator::param_type)) @@ -772,7 +689,7 @@ struct TersoffComputeKernel dim3 grid( pair_args.N / (run_block_size/pair_args.tpp) + 1, 1, 1); dim3 threads(run_block_size, 1, 1); - gpu_compute_triplet_forces_kernel + gpu_compute_triplet_forces_kernel <<>>(pair_args.d_force, pair_args.N, pair_args.d_virial, @@ -789,14 +706,14 @@ struct TersoffComputeKernel } else { - TersoffComputeKernel::launch(pair_args, d_params); + TersoffComputeKernel::launch(pair_args, d_params); } } }; //! Template specialization to do nothing for the tpp = 0 case -template -struct TersoffComputeKernel +template +struct TersoffComputeKernel { static void launch(const tersoff_args_t& pair_args, const typename evaluator::param_type *d_params) { @@ -822,25 +739,11 @@ cudaError_t gpu_compute_triplet_forces(const tersoff_args_t& pair_args, // compute the new forces if (!pair_args.compute_virial) { - if (pair_args.compute_capability < 35 && pair_args.size_nlist > (unsigned int) pair_args.devprop.maxTexture1DLinear) - { - TersoffComputeKernel::launch(pair_args, d_params); - } - else - { - TersoffComputeKernel::launch(pair_args, d_params); - } + TersoffComputeKernel::launch(pair_args, d_params); } else { - if (pair_args.compute_capability < 35 && pair_args.size_nlist > (unsigned int) pair_args.devprop.maxTexture1DLinear) - { - TersoffComputeKernel::launch(pair_args, d_params); - } - else - { - TersoffComputeKernel::launch(pair_args, d_params); - } + TersoffComputeKernel::launch(pair_args, d_params); } return cudaSuccess; } diff --git a/hoomd/md/PotentialTersoffGPU.h b/hoomd/md/PotentialTersoffGPU.h index f9fd2eab5d..74e753a779 100644 --- a/hoomd/md/PotentialTersoffGPU.h +++ b/hoomd/md/PotentialTersoffGPU.h @@ -169,7 +169,6 @@ void PotentialTersoffGPU< evaluator, gpu_cgpf >::computeForces(unsigned int time this->m_pdata->getNTypes(), block_size, threads_per_particle, - this->m_exec_conf->getComputeCapability()/10, this->m_exec_conf->dev_prop), d_params.data); diff --git a/hoomd/md/TableAngleForceComputeGPU.cc b/hoomd/md/TableAngleForceComputeGPU.cc index c9f544f72a..617c369a06 100644 --- a/hoomd/md/TableAngleForceComputeGPU.cc +++ b/hoomd/md/TableAngleForceComputeGPU.cc @@ -82,8 +82,7 @@ void TableAngleForceComputeGPU::computeForces(unsigned int timestep) d_tables.data, m_table_width, m_table_value, - m_tuner->getParam(), - m_exec_conf->getComputeCapability()); + m_tuner->getParam()); } diff --git 
a/hoomd/md/TableAngleForceGPU.cu b/hoomd/md/TableAngleForceGPU.cu index 131010d9f4..b03fb9db55 100644 --- a/hoomd/md/TableAngleForceGPU.cu +++ b/hoomd/md/TableAngleForceGPU.cu @@ -16,10 +16,6 @@ \brief Defines GPU kernel code for calculating the table angle forces. Used by TableAngleForceComputeGPU. */ - -//! Texture for reading table values -scalar2_tex_t tables_tex; - /*! This kernel is called to calculate the table angle forces on all triples this is defined or \param d_force Device memory to write computed forces @@ -38,10 +34,6 @@ scalar2_tex_t tables_tex; \param delta_th angle delta of the table See TableAngleForceCompute for information on the memory layout. - - \b Details: - * Table entries are read from tables_tex. Note that currently this is bound to a 1D memory region. Performance tests - at a later date may result in this changing. */ __global__ void gpu_compute_table_angle_forces_kernel(Scalar4* d_force, Scalar* d_virial, @@ -153,8 +145,8 @@ __global__ void gpu_compute_table_angle_forces_kernel(Scalar4* d_force, // compute index into the table and read in values unsigned int value_i = value_f; - Scalar2 VT0 = texFetchScalar2(d_tables, tables_tex, table_value(value_i, cur_angle_type)); - Scalar2 VT1 = texFetchScalar2(d_tables, tables_tex, table_value(value_i+1, cur_angle_type)); + Scalar2 VT0 = __ldg(d_tables + table_value(value_i, cur_angle_type)); + Scalar2 VT1 = __ldg(d_tables + table_value(value_i+1, cur_angle_type)); // unpack the data Scalar V0 = VT0.x; Scalar V1 = VT1.x; @@ -257,8 +249,7 @@ cudaError_t gpu_compute_table_angle_forces(Scalar4* d_force, const Scalar2 *d_tables, const unsigned int table_width, const Index2D &table_value, - const unsigned int block_size, - const unsigned int compute_capability) + const unsigned int block_size) { assert(d_tables); assert(table_width > 1); @@ -280,16 +271,6 @@ cudaError_t gpu_compute_table_angle_forces(Scalar4* d_force, dim3 grid( N / run_block_size + 1, 1, 1); dim3 threads(run_block_size, 1, 1); - // bind the tables texture on pre sm35 arches - if (compute_capability < 350) - { - tables_tex.normalized = false; - tables_tex.filterMode = cudaFilterModePoint; - cudaError_t error = cudaBindTexture(0, tables_tex, d_tables, sizeof(Scalar2) * table_value.getNumElements()); - if (error != cudaSuccess) - return error; - } - Scalar delta_th = Scalar(M_PI)/(Scalar)(table_width - 1); gpu_compute_table_angle_forces_kernel<<< grid, threads >>> diff --git a/hoomd/md/TableAngleForceGPU.cuh b/hoomd/md/TableAngleForceGPU.cuh index f75cda7a1c..8f107e3953 100644 --- a/hoomd/md/TableAngleForceGPU.cuh +++ b/hoomd/md/TableAngleForceGPU.cuh @@ -30,7 +30,6 @@ cudaError_t gpu_compute_table_angle_forces(Scalar4* d_force, const Scalar2 *d_tables, const unsigned int table_width, const Index2D &table_value, - const unsigned int block_size, - const unsigned int compute_capability); + const unsigned int block_size); #endif diff --git a/hoomd/md/TableDihedralForceComputeGPU.cc b/hoomd/md/TableDihedralForceComputeGPU.cc index c0c04b5479..0b18218414 100644 --- a/hoomd/md/TableDihedralForceComputeGPU.cc +++ b/hoomd/md/TableDihedralForceComputeGPU.cc @@ -83,8 +83,7 @@ void TableDihedralForceComputeGPU::computeForces(unsigned int timestep) d_tables.data, m_table_width, m_table_value, - m_tuner->getParam(), - m_exec_conf->getComputeCapability()); + m_tuner->getParam()); } diff --git a/hoomd/md/TableDihedralForceGPU.cu b/hoomd/md/TableDihedralForceGPU.cu index 90a8356bd6..f121371eaf 100644 --- a/hoomd/md/TableDihedralForceGPU.cu +++ 
b/hoomd/md/TableDihedralForceGPU.cu @@ -18,10 +18,6 @@ \brief Defines GPU kernel code for calculating the table dihedral forces. Used by TableDihedralForceComputeGPU. */ - -//! Texture for reading table values -scalar2_tex_t tables_tex; - /*! This kernel is called to calculate the table dihedral forces on all triples this is defined or \param d_force Device memory to write computed forces @@ -39,10 +35,6 @@ scalar2_tex_t tables_tex; \param delta_phi dihedral delta of the table See TableDihedralForceCompute for information on the memory layout. - - \b Details: - * Table entries are read from tables_tex. Note that currently this is bound to a 1D memory region. Performance tests - at a later date may result in this changing. */ __global__ void gpu_compute_table_dihedral_forces_kernel(Scalar4* d_force, Scalar* d_virial, @@ -206,8 +198,8 @@ __global__ void gpu_compute_table_dihedral_forces_kernel(Scalar4* d_force, // compute index into the table and read in values unsigned int value_i = value_f; - Scalar2 VT0 = texFetchScalar2(d_tables, tables_tex, table_value(value_i, cur_dihedral_type)); - Scalar2 VT1 = texFetchScalar2(d_tables, tables_tex, table_value(value_i+1, cur_dihedral_type)); + Scalar2 VT0 = __ldg(d_tables + table_value(value_i, cur_dihedral_type)); + Scalar2 VT1 = __ldg(d_tables + table_value(value_i+1, cur_dihedral_type)); // unpack the data Scalar V0 = VT0.x; Scalar V1 = VT1.x; @@ -318,8 +310,7 @@ cudaError_t gpu_compute_table_dihedral_forces(Scalar4* d_force, const Scalar2 *d_tables, const unsigned int table_width, const Index2D &table_value, - const unsigned int block_size, - const unsigned int compute_capability) + const unsigned int block_size) { assert(d_tables); assert(table_width > 1); @@ -341,16 +332,6 @@ cudaError_t gpu_compute_table_dihedral_forces(Scalar4* d_force, dim3 grid( N / run_block_size + 1, 1, 1); dim3 threads(run_block_size, 1, 1); - // bind the tables texture on pre sm35 devices - if (compute_capability < 350) - { - tables_tex.normalized = false; - tables_tex.filterMode = cudaFilterModePoint; - cudaError_t error = cudaBindTexture(0, tables_tex, d_tables, sizeof(Scalar2) * table_value.getNumElements()); - if (error != cudaSuccess) - return error; - } - Scalar delta_phi = Scalar(2.0*M_PI)/(Scalar)(table_width - 1); gpu_compute_table_dihedral_forces_kernel<<< grid, threads>>> diff --git a/hoomd/md/TableDihedralForceGPU.cuh b/hoomd/md/TableDihedralForceGPU.cuh index 8bf50161e7..bcab1e48db 100644 --- a/hoomd/md/TableDihedralForceGPU.cuh +++ b/hoomd/md/TableDihedralForceGPU.cuh @@ -30,7 +30,6 @@ cudaError_t gpu_compute_table_dihedral_forces(Scalar4* d_force, const Scalar2 *d_tables, const unsigned int table_width, const Index2D &table_value, - const unsigned int block_size, - const unsigned int compute_capability); + const unsigned int block_size); #endif diff --git a/hoomd/md/TablePotentialGPU.cc b/hoomd/md/TablePotentialGPU.cc index 459b8bd8bc..4da517e897 100644 --- a/hoomd/md/TablePotentialGPU.cc +++ b/hoomd/md/TablePotentialGPU.cc @@ -96,8 +96,6 @@ void TablePotentialGPU::computeForces(unsigned int timestep) m_ntypes, m_table_width, m_tuner->getParam(), - m_exec_conf->getComputeCapability(), - m_exec_conf->dev_prop.maxTexture1DLinear, m_pdata->getGPUPartition()); if(m_exec_conf->isCUDAErrorCheckingEnabled()) diff --git a/hoomd/md/TablePotentialGPU.cu b/hoomd/md/TablePotentialGPU.cu index 75dbed0e81..33bc2697a3 100644 --- a/hoomd/md/TablePotentialGPU.cu +++ b/hoomd/md/TablePotentialGPU.cu @@ -15,15 +15,6 @@ \brief Defines GPU kernel code for calculating the table 
pair forces. Used by TablePotentialGPU. */ -//! Texture for reading particle positions -scalar4_tex_t pdata_pos_tex; - -//! Texture for reading the neighborlist -texture nlist_tex; - -//! Texture for reading table values -scalar2_tex_t tables_tex; - /*! This kernel is called to calculate the table pair forces on all N particles \param d_force Device memory to write computed forces @@ -44,12 +35,7 @@ scalar2_tex_t tables_tex; \tparam use_gmem_nlist When non-zero, the neighbor list is read out of global memory. When zero, textures or __ldg is used depending on architecture. - - \b Details: - * Table entries are read from tables_tex. Note that currently this is bound to a 1D memory region. Performance tests - at a later date may result in this changing. */ -template __global__ void gpu_compute_table_forces_kernel(Scalar4* d_force, Scalar* d_virial, const unsigned virial_pitch, @@ -92,7 +78,7 @@ __global__ void gpu_compute_table_forces_kernel(Scalar4* d_force, const unsigned int head_idx = d_head_list[idx]; // read in the position of our particle. Texture reads of Scalar4's are faster than global reads on compute 1.0 hardware - Scalar4 postype = texFetchScalar4(d_pos, pdata_pos_tex, idx); + Scalar4 postype = __ldg(d_pos + idx); Scalar3 pos = make_scalar3(postype.x, postype.y, postype.z); unsigned int typei = __scalar_as_int(postype.w); @@ -108,14 +94,7 @@ __global__ void gpu_compute_table_forces_kernel(Scalar4* d_force, // prefetch neighbor index unsigned int cur_neigh = 0; unsigned int next_neigh(0); - if (use_gmem_nlist) - { - next_neigh = d_nlist[head_idx]; - } - else - { - next_neigh = texFetchUint(d_nlist, nlist_tex, head_idx); - } + next_neigh = __ldg(d_nlist + head_idx); // loop over neighbors for (int neigh_idx = 0; neigh_idx < n_neigh; neigh_idx++) @@ -123,17 +102,10 @@ __global__ void gpu_compute_table_forces_kernel(Scalar4* d_force, // read the current neighbor index // prefetch the next value and set the current one cur_neigh = next_neigh; - if (use_gmem_nlist) - { - next_neigh = d_nlist[head_idx + neigh_idx + 1]; - } - else - { - next_neigh = texFetchUint(d_nlist, nlist_tex, head_idx + neigh_idx+1); - } + next_neigh = __ldg(d_nlist + head_idx + neigh_idx+1); // get the neighbor's position - Scalar4 neigh_postype = texFetchScalar4(d_pos, pdata_pos_tex, cur_neigh); + Scalar4 neigh_postype = __ldg(d_pos + cur_neigh); Scalar3 neigh_pos = make_scalar3(neigh_postype.x, neigh_postype.y, neigh_postype.z); // calculate dr (with periodic boundary conditions) @@ -161,8 +133,8 @@ __global__ void gpu_compute_table_forces_kernel(Scalar4* d_force, // compute index into the table and read in values unsigned int value_i = floor(value_f); - Scalar2 VF0 = texFetchScalar2(d_tables, tables_tex, table_value(value_i, cur_table_index)); - Scalar2 VF1 = texFetchScalar2(d_tables, tables_tex, table_value(value_i+1, cur_table_index)); + Scalar2 VF0 = __ldg(d_tables + table_value(value_i, cur_table_index)); + Scalar2 VF1 = __ldg(d_tables + table_value(value_i+1, cur_table_index)); // unpack the data Scalar V0 = VF0.x; @@ -227,8 +199,6 @@ __global__ void gpu_compute_table_forces_kernel(Scalar4* d_force, \param ntypes Number of particle types in the system \param table_width Number of points in each table \param block_size Block size at which to run the kernel - \param compute_capability Compute capability of the device (200, 300, 350) - \param max_tex1d_width Maximum width of a linear 1d texture \note This is just a kernel driver. See gpu_compute_table_forces_kernel for full documentation. 
*/ @@ -248,8 +218,6 @@ cudaError_t gpu_compute_table_forces(Scalar4* d_force, const unsigned int ntypes, const unsigned int table_width, const unsigned int block_size, - const unsigned int compute_capability, - const unsigned int max_tex1d_width, const GPUPartition& gpu_partition) { assert(d_params); @@ -265,99 +233,37 @@ cudaError_t gpu_compute_table_forces(Scalar4* d_force, { auto range = gpu_partition.getRangeAndSetGPU(idev); - // texture bind - if (compute_capability < 350) + static unsigned int max_block_size = UINT_MAX; + if (max_block_size == UINT_MAX) { - // bind the pdata position texture - pdata_pos_tex.normalized = false; - pdata_pos_tex.filterMode = cudaFilterModePoint; - cudaError_t error = cudaBindTexture(0, pdata_pos_tex, d_pos, sizeof(Scalar4) * (N+n_ghost)); - if (error != cudaSuccess) - return error; - - if (size_nlist <= max_tex1d_width) - { - nlist_tex.normalized = false; - nlist_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, nlist_tex, d_nlist, sizeof(unsigned int)*size_nlist); - if (error != cudaSuccess) - return error; - } - - // bind the tables texture - tables_tex.normalized = false; - tables_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, tables_tex, d_tables, sizeof(Scalar2) * table_width * table_index.getNumElements()); - if (error != cudaSuccess) - return error; + cudaFuncAttributes attr; + cudaFuncGetAttributes(&attr, gpu_compute_table_forces_kernel); + max_block_size = attr.maxThreadsPerBlock; } - if (compute_capability < 350 && size_nlist > max_tex1d_width) - { // use global memory when the neighbor list must be texture bound, but exceeds the max size of a texture - static unsigned int max_block_size = UINT_MAX; - if (max_block_size == UINT_MAX) - { - cudaFuncAttributes attr; - cudaFuncGetAttributes(&attr, gpu_compute_table_forces_kernel<1>); - max_block_size = attr.maxThreadsPerBlock; - } - - unsigned int run_block_size = min(block_size, max_block_size); - - // setup the grid to run the kernel - dim3 grid( (range.second-range.first) / run_block_size + 1, 1, 1); - dim3 threads(run_block_size, 1, 1); - - gpu_compute_table_forces_kernel<1><<< grid, threads, sizeof(Scalar4)*table_index.getNumElements() >>>(d_force, - d_virial, - virial_pitch, - range.second-range.first, - d_pos, - box, - d_n_neigh, - d_nlist, - d_head_list, - d_tables, - d_params, - ntypes, - table_width, - range.first - ); - } - else - { - static unsigned int max_block_size = UINT_MAX; - if (max_block_size == UINT_MAX) - { - cudaFuncAttributes attr; - cudaFuncGetAttributes(&attr, gpu_compute_table_forces_kernel<0>); - max_block_size = attr.maxThreadsPerBlock; - } - - unsigned int run_block_size = min(block_size, max_block_size); - - // index calculation helper - Index2DUpperTriangular table_index(ntypes); - - // setup the grid to run the kernel - dim3 grid( (range.second-range.first) / run_block_size + 1, 1, 1); - dim3 threads(run_block_size, 1, 1); - - gpu_compute_table_forces_kernel<0><<< grid, threads, sizeof(Scalar4)*table_index.getNumElements() >>>(d_force, - d_virial, - virial_pitch, - range.second-range.first, - d_pos, - box, - d_n_neigh, - d_nlist, - d_head_list, - d_tables, - d_params, - ntypes, - table_width, - range.first); - } + unsigned int run_block_size = min(block_size, max_block_size); + + // index calculation helper + Index2DUpperTriangular table_index(ntypes); + + // setup the grid to run the kernel + dim3 grid( (range.second-range.first) / run_block_size + 1, 1, 1); + dim3 threads(run_block_size, 1, 1); + + 
gpu_compute_table_forces_kernel<<< grid, threads, sizeof(Scalar4)*table_index.getNumElements() >>>(d_force, + d_virial, + virial_pitch, + range.second-range.first, + d_pos, + box, + d_n_neigh, + d_nlist, + d_head_list, + d_tables, + d_params, + ntypes, + table_width, + range.first); } return cudaSuccess; } diff --git a/hoomd/md/TablePotentialGPU.cuh b/hoomd/md/TablePotentialGPU.cuh index 28beeda291..f33fa1bb74 100644 --- a/hoomd/md/TablePotentialGPU.cuh +++ b/hoomd/md/TablePotentialGPU.cuh @@ -33,8 +33,6 @@ cudaError_t gpu_compute_table_forces(Scalar4* d_force, const unsigned int ntypes, const unsigned int table_width, const unsigned int block_size, - const unsigned int compute_capability, - const unsigned int max_tex1d_width, const GPUPartition& gpu_partition); #endif diff --git a/hoomd/metal/EAMForceComputeGPU.cc b/hoomd/metal/EAMForceComputeGPU.cc index 059210fe78..cc6824788a 100644 --- a/hoomd/metal/EAMForceComputeGPU.cc +++ b/hoomd/metal/EAMForceComputeGPU.cc @@ -102,8 +102,7 @@ void EAMForceComputeGPU::computeForces(unsigned int timestep) eam_data.block_size = m_tuner->getParam(); gpu_compute_eam_tex_inter_forces(d_force.data, d_virial.data, m_virial.getPitch(), m_pdata->getN(), d_pos.data, box, d_n_neigh.data, d_nlist.data, d_head_list.data, this->m_nlist->getNListArray().getPitch(), eam_data, - d_dFdP.data, d_F.data, d_rho.data, d_rphi.data, d_dF.data, d_drho.data, d_drphi.data, - m_exec_conf->getComputeCapability() / 10, m_exec_conf->dev_prop.maxTexture1DLinear); + d_dFdP.data, d_F.data, d_rho.data, d_rphi.data, d_dF.data, d_drho.data, d_drphi.data); if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR(); diff --git a/hoomd/metal/EAMForceGPU.cu b/hoomd/metal/EAMForceGPU.cu index ddfaca2e9d..b4cd7c31e0 100644 --- a/hoomd/metal/EAMForceGPU.cu +++ b/hoomd/metal/EAMForceGPU.cu @@ -13,25 +13,10 @@ \brief Defines GPU kernel code for calculating the EAM forces. Used by EAMForceComputeGPU. */ -//! Texture for reading particle positions -scalar4_tex_t pdata_pos_tex; -//! Texture for reading the neighbor list -texture nlist_tex; -//! Texture for reading potential -scalar4_tex_t tex_F; -scalar4_tex_t tex_rho; -scalar4_tex_t tex_rphi; -scalar4_tex_t tex_dF; -scalar4_tex_t tex_drho; -scalar4_tex_t tex_drphi; -//! Texture for dF/dP -scalar_tex_t tex_dFdP; - //! Storage space for EAM parameters on the GPU __constant__ EAMTexInterData eam_data_ti; //! Kernel for computing EAM forces on the GPU -template __global__ void gpu_kernel_1(Scalar4 *d_force, Scalar *d_virial, const unsigned int virial_pitch, const unsigned int N, const Scalar4 *d_pos, BoxDim box, const unsigned int *d_n_neigh, const unsigned int *d_nlist, const unsigned int *d_head_list, const Scalar4 *d_F, const Scalar4 *d_rho, const Scalar4 *d_rphi, @@ -49,7 +34,7 @@ __global__ void gpu_kernel_1(Scalar4 *d_force, Scalar *d_virial, const unsigned const unsigned int head_idx = d_head_list[idx]; // read in the position of our particle. 
- Scalar4 postype = texFetchScalar4(d_pos, pdata_pos_tex, idx); + Scalar4 postype = __ldg(d_pos + idx); Scalar3 pos = make_scalar3(postype.x, postype.y, postype.z); // index and remainder @@ -65,14 +50,8 @@ __global__ void gpu_kernel_1(Scalar4 *d_force, Scalar *d_virial, const unsigned // prefetch neighbor index int cur_neigh = 0; int next_neigh(0); - if (use_gmem_nlist) - { - next_neigh = d_nlist[head_idx]; - } - else - { - next_neigh = texFetchUint(d_nlist, nlist_tex, head_idx); - } + next_neigh = __ldg(d_nlist + head_idx); + int typei = __scalar_as_int(postype.w); // loop over neighbors @@ -89,17 +68,10 @@ __global__ void gpu_kernel_1(Scalar4 *d_force, Scalar *d_virial, const unsigned // read the current neighbor index // prefetch the next value and set the current one cur_neigh = next_neigh; - if (use_gmem_nlist) - { - next_neigh = d_nlist[head_idx + neigh_idx + 1]; - } - else - { - next_neigh = texFetchUint(d_nlist, nlist_tex, head_idx + neigh_idx + 1); - } + next_neigh = __ldg(d_nlist + head_idx + neigh_idx + 1); // get the neighbor's position - Scalar4 neigh_postype = texFetchScalar4(d_pos, pdata_pos_tex, cur_neigh); + Scalar4 neigh_postype = __ldg(d_pos + cur_neigh); Scalar3 neigh_pos = make_scalar3(neigh_postype.x, neigh_postype.y, neigh_postype.z); // calculate dr (with periodic boundary conditions) @@ -120,7 +92,7 @@ __global__ void gpu_kernel_1(Scalar4 *d_force, Scalar *d_virial, const unsigned remainder = position - int_position; // calculate P = sum{rho} idxs = int_position + nr * (typej * ntypes + typei); - v = texFetchScalar4(d_rho, tex_rho, idxs); + v = __ldg(d_rho + idxs); atomElectronDensity += v.w + v.z * remainder + v.y * remainder * remainder + v.x * remainder * remainder * remainder; } @@ -133,8 +105,8 @@ __global__ void gpu_kernel_1(Scalar4 *d_force, Scalar *d_virial, const unsigned remainder = position - int_position; idxs = int_position + typei * nrho; - dv = texFetchScalar4(d_dF, tex_dF, idxs); - v = texFetchScalar4(d_F, tex_F, idxs); + dv = __ldg(d_dF + idxs); + v = __ldg(d_F + idxs); // compute dF / dP d_dFdP[idx] = dv.z + dv.y * remainder + dv.x * remainder * remainder; // compute embedded energy F(P), sum up each particle @@ -145,7 +117,6 @@ __global__ void gpu_kernel_1(Scalar4 *d_force, Scalar *d_virial, const unsigned } //! Second stage kernel for computing EAM forces on the GPU -template __global__ void gpu_kernel_2(Scalar4 *d_force, Scalar *d_virial, const unsigned int virial_pitch, const unsigned int N, const Scalar4 *d_pos, BoxDim box, const unsigned int *d_n_neigh, const unsigned int *d_nlist, const unsigned int *d_head_list, const Scalar4 *d_F, const Scalar4 *d_rho, const Scalar4 *d_rphi, @@ -163,7 +134,7 @@ __global__ void gpu_kernel_2(Scalar4 *d_force, Scalar *d_virial, const unsigned const unsigned int head_idx = d_head_list[idx]; // read in the position of our particle. 
Texture reads of Scalar4's are faster than global reads - Scalar4 postype = texFetchScalar4(d_pos, pdata_pos_tex, idx); + Scalar4 postype = __ldg(d_pos + idx); Scalar3 pos = make_scalar3(postype.x, postype.y, postype.z); int typei = __scalar_as_int(postype.w); @@ -177,14 +148,8 @@ __global__ void gpu_kernel_2(Scalar4 *d_force, Scalar *d_virial, const unsigned // prefetch neighbor index int cur_neigh = 0; int next_neigh(0); - if (use_gmem_nlist) - { - next_neigh = d_nlist[head_idx]; - } - else - { - next_neigh = texFetchUint(d_nlist, nlist_tex, head_idx); - } + next_neigh = __ldg(d_nlist + head_idx); + //Scalar4 force = force_data.force[idx]; Scalar4 force = make_scalar4(Scalar(0.0), Scalar(0.0), Scalar(0.0), Scalar(0.0)); //force.w = force_data.force[idx].w; @@ -202,21 +167,14 @@ __global__ void gpu_kernel_2(Scalar4 *d_force, Scalar *d_virial, const unsigned int nr = eam_data_ti.nr; Scalar rdr = eam_data_ti.rdr; Scalar r_cutsq = eam_data_ti.r_cutsq; - Scalar d_dFdPidx = texFetchScalar(d_dFdP, tex_dFdP, idx); + Scalar d_dFdPidx = __ldg(d_dFdP + idx); for (int neigh_idx = 0; neigh_idx < n_neigh; neigh_idx++) { cur_neigh = next_neigh; - if (use_gmem_nlist) - { - next_neigh = d_nlist[head_idx + neigh_idx + 1]; - } - else - { - next_neigh = texFetchUint(d_nlist, nlist_tex, head_idx + neigh_idx + 1); - } + next_neigh = __ldg(d_nlist + head_idx + neigh_idx + 1); // get the neighbor's position - Scalar4 neigh_postype = texFetchScalar4(d_pos, pdata_pos_tex, cur_neigh); + Scalar4 neigh_postype = __ldg(d_pos + cur_neigh); Scalar3 neigh_pos = make_scalar3(neigh_postype.x, neigh_postype.y, neigh_postype.z); // calculate dr (with periodic boundary conditions) @@ -245,8 +203,8 @@ __global__ void gpu_kernel_2(Scalar4 *d_force, Scalar *d_virial, const unsigned (int) (0.5 * (2 * ntypes - typei - 1) * typei + typej) * nr; idxs = int_position + shift; - v = texFetchScalar4(d_rphi, tex_rphi, idxs); - dv = texFetchScalar4(d_drphi, tex_drphi, idxs); + v = __ldg(d_rphi + idxs); + dv = __ldg(d_drphi + idxs); // aspair_potential = r * phi Scalar aspair_potential = v.w + v.z * remainder + v.y * remainder * remainder + v.x * remainder * remainder * remainder; @@ -258,14 +216,14 @@ __global__ void gpu_kernel_2(Scalar4 *d_force, Scalar *d_virial, const unsigned Scalar derivativePhi = (derivative_pair_potential - pair_eng) * inverseR; // derivativeRhoI = drho / dr of i idxs = int_position + typei * ntypes * nr + typej * nr; - dv = texFetchScalar4(d_drho, tex_drho, idxs); + dv = __ldg(d_drho + idxs); Scalar derivativeRhoI = dv.z + dv.y * remainder + dv.x * remainder * remainder; // derivativeRhoJ = drho / dr of j idxs = int_position + typej * ntypes * nr + typei * nr; - dv = texFetchScalar4(d_drho, tex_drho, idxs); + dv = __ldg(d_drho + idxs); Scalar derivativeRhoJ = dv.z + dv.y * remainder + dv.x * remainder * remainder; // fullDerivativePhi = dF/dP * drho / dr for j + dF/dP * drho / dr for j + phi - Scalar d_dFdPcur = texFetchScalar(d_dFdP, tex_dFdP, cur_neigh); + Scalar d_dFdPcur = __ldg(d_dFdP + cur_neigh); Scalar fullDerivativePhi = d_dFdPidx * derivativeRhoJ + d_dFdPcur * derivativeRhoI + derivativePhi; // compute forces pairForce = -fullDerivativePhi * inverseR; @@ -301,141 +259,39 @@ cudaError_t gpu_compute_eam_tex_inter_forces(Scalar4 *d_force, Scalar *d_virial, const unsigned int N, const Scalar4 *d_pos, const BoxDim &box, const unsigned int *d_n_neigh, const unsigned int *d_nlist, const unsigned int *d_head_list, const unsigned int size_nlist, const EAMTexInterData &eam_data, Scalar *d_dFdP, const Scalar4 
*d_F, const Scalar4 *d_rho, - const Scalar4 *d_rphi, const Scalar4 *d_dF, const Scalar4 *d_drho, const Scalar4 *d_drphi, - const unsigned int compute_capability, const unsigned int max_tex1d_width) + const Scalar4 *d_rphi, const Scalar4 *d_dF, const Scalar4 *d_drho, const Scalar4 *d_drphi) { - cudaError_t error; - - // bind the texture - if (compute_capability < 350 && size_nlist <= max_tex1d_width) - { - nlist_tex.normalized = false; - nlist_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, nlist_tex, d_nlist, sizeof(unsigned int) * size_nlist); - if (error != cudaSuccess) - return error; - } - - if (compute_capability < 350) - { - tex_F.normalized = false; - tex_F.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, tex_F, d_F, sizeof(Scalar4) * eam_data.nrho * eam_data.ntypes); - if (error != cudaSuccess) - return error; - - tex_dF.normalized = false; - tex_dF.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, tex_dF, d_dF, sizeof(Scalar4) * eam_data.nrho * eam_data.ntypes); - if (error != cudaSuccess) - return error; - - tex_rho.normalized = false; - tex_rho.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, tex_rho, d_rho, sizeof(Scalar4) * eam_data.nrho * eam_data.ntypes * eam_data.ntypes); - if (error != cudaSuccess) - return error; - - tex_drho.normalized = false; - tex_drho.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, tex_drho, d_drho, - sizeof(Scalar4) * eam_data.nrho * eam_data.ntypes * eam_data.ntypes); - if (error != cudaSuccess) - return error; - - tex_rphi.normalized = false; - tex_rphi.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, tex_rphi, d_rphi, - sizeof(Scalar4) * (int) (0.5 * eam_data.nr * (eam_data.ntypes + 1) * eam_data.ntypes)); - if (error != cudaSuccess) - return error; - - tex_drphi.normalized = false; - tex_drphi.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, tex_drphi, d_drphi, - sizeof(Scalar4) * (int) (0.5 * eam_data.nr * (eam_data.ntypes + 1) * eam_data.ntypes)); - if (error != cudaSuccess) - return error; - } - - pdata_pos_tex.normalized = false; - pdata_pos_tex.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, pdata_pos_tex, d_pos, sizeof(Scalar4) * N); - if (error != cudaSuccess) - return error; - - tex_dFdP.normalized = false; - tex_dFdP.filterMode = cudaFilterModePoint; - error = cudaBindTexture(0, tex_dFdP, d_dFdP, sizeof(Scalar) * N); - if (error != cudaSuccess) - return error; - // run the kernel cudaMemcpyToSymbol(eam_data_ti, &eam_data, sizeof(EAMTexInterData)); - if (compute_capability < 350 && size_nlist > max_tex1d_width) - { - - static unsigned int max_block_size_1 = UINT_MAX; - static unsigned int max_block_size_2 = UINT_MAX; + static unsigned int max_block_size_1 = UINT_MAX; + static unsigned int max_block_size_2 = UINT_MAX; - cudaFuncAttributes attr1; - cudaFuncGetAttributes(&attr1, gpu_kernel_1<1>); + cudaFuncAttributes attr1; + cudaFuncGetAttributes(&attr1, gpu_kernel_1); - cudaFuncAttributes attr2; - cudaFuncGetAttributes(&attr2, gpu_kernel_2<1>); + cudaFuncAttributes attr2; + cudaFuncGetAttributes(&attr2, gpu_kernel_2); - max_block_size_1 = attr1.maxThreadsPerBlock; - max_block_size_2 = attr2.maxThreadsPerBlock; + max_block_size_1 = attr1.maxThreadsPerBlock; + max_block_size_2 = attr2.maxThreadsPerBlock; - unsigned int run_block_size_1 = min(eam_data.block_size, max_block_size_1); - unsigned int run_block_size_2 = min(eam_data.block_size, max_block_size_2); + unsigned int run_block_size_1 = min(eam_data.block_size, 
max_block_size_1); + unsigned int run_block_size_2 = min(eam_data.block_size, max_block_size_2); - // setup the grid to run the kernel + // setup the grid to run the kernel - dim3 grid_1((int) ceil((double) N / (double) run_block_size_1), 1, 1); - dim3 threads_1(run_block_size_1, 1, 1); + dim3 grid_1((int) ceil((double) N / (double) run_block_size_1), 1, 1); + dim3 threads_1(run_block_size_1, 1, 1); - dim3 grid_2((int) ceil((double) N / (double) run_block_size_2), 1, 1); - dim3 threads_2(run_block_size_2, 1, 1); + dim3 grid_2((int) ceil((double) N / (double) run_block_size_2), 1, 1); + dim3 threads_2(run_block_size_2, 1, 1); - gpu_kernel_1<1> <<>>(d_force, d_virial, virial_pitch, N, d_pos, box, d_n_neigh, d_nlist, - d_head_list, d_F, d_rho, d_rphi, d_dF, d_drho, d_drphi, d_dFdP); - gpu_kernel_2<1> <<>>(d_force, d_virial, virial_pitch, N, d_pos, box, d_n_neigh, d_nlist, - d_head_list, d_F, d_rho, d_rphi, d_dF, d_drho, d_drphi, d_dFdP); - } - else - { - - static unsigned int max_block_size_1 = UINT_MAX; - static unsigned int max_block_size_2 = UINT_MAX; - - cudaFuncAttributes attr1; - cudaFuncGetAttributes(&attr1, gpu_kernel_1<0>); - - cudaFuncAttributes attr2; - cudaFuncGetAttributes(&attr2, gpu_kernel_2<0>); - - max_block_size_1 = attr1.maxThreadsPerBlock; - max_block_size_2 = attr2.maxThreadsPerBlock; - - unsigned int run_block_size_1 = min(eam_data.block_size, max_block_size_1); - unsigned int run_block_size_2 = min(eam_data.block_size, max_block_size_2); - - // setup the grid to run the kernel - - dim3 grid_1((int) ceil((double) N / (double) run_block_size_1), 1, 1); - dim3 threads_1(run_block_size_1, 1, 1); - - dim3 grid_2((int) ceil((double) N / (double) run_block_size_2), 1, 1); - dim3 threads_2(run_block_size_2, 1, 1); - - gpu_kernel_1<0> <<>>(d_force, d_virial, virial_pitch, N, d_pos, box, d_n_neigh, d_nlist, - d_head_list, d_F, d_rho, d_rphi, d_dF, d_drho, d_drphi, d_dFdP); - gpu_kernel_2<0> <<>>(d_force, d_virial, virial_pitch, N, d_pos, box, d_n_neigh, d_nlist, - d_head_list, d_F, d_rho, d_rphi, d_dF, d_drho, d_drphi, d_dFdP); - } + gpu_kernel_1<<>>(d_force, d_virial, virial_pitch, N, d_pos, box, d_n_neigh, d_nlist, + d_head_list, d_F, d_rho, d_rphi, d_dF, d_drho, d_drphi, d_dFdP); + gpu_kernel_2<<>>(d_force, d_virial, virial_pitch, N, d_pos, box, d_n_neigh, d_nlist, + d_head_list, d_F, d_rho, d_rphi, d_dF, d_drho, d_drphi, d_dFdP); return cudaSuccess; } diff --git a/hoomd/metal/EAMForceGPU.cuh b/hoomd/metal/EAMForceGPU.cuh index 6303691fcf..d76744b1a3 100644 --- a/hoomd/metal/EAMForceGPU.cuh +++ b/hoomd/metal/EAMForceGPU.cuh @@ -35,7 +35,6 @@ cudaError_t gpu_compute_eam_tex_inter_forces(Scalar4* d_force, Scalar* d_virial, const unsigned int N, const Scalar4 *d_pos, const BoxDim& box, const unsigned int *d_n_neigh, const unsigned int *d_nlist, const unsigned int *d_head_list, const unsigned int size_nlist, const EAMTexInterData& eam_data, Scalar *d_dFdP, const Scalar4 *d_F, const Scalar4 *d_rho, - const Scalar4 *d_rphi, const Scalar4 *d_dF, const Scalar4 *d_drho, const Scalar4 *d_drphi, - const unsigned int compute_capability, const unsigned int max_tex1d_width); + const Scalar4 *d_rphi, const Scalar4 *d_dF, const Scalar4 *d_drho, const Scalar4 *d_drphi); #endif
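Note (not part of the patch itself): every hunk above follows the same conversion — drop the cudaBindTexture()/tex1Dfetch()/texFetch*() plumbing and the compute_capability/max_tex1d_width arguments that supported it, and read the device arrays directly through __ldg(), which is always available now that sm_35 is the minimum supported architecture. Below is a minimal, self-contained sketch of that read pattern; the kernel and array names (toy_gather_kernel, d_out, ...) are illustrative only and are not taken from HOOMD.

// Minimal sketch of the __ldg() read pattern this patch standardizes on.
// Compile for sm_35 or newer, e.g. nvcc -arch=sm_35 -c toy_gather.cu
#include <cuda_runtime.h>

__global__ void toy_gather_kernel(float4* d_out,
                                  const float4* d_pos,           // per-particle positions (read-only)
                                  const unsigned int* d_nlist,   // flattened neighbor list (read-only)
                                  const unsigned int* d_n_neigh, // number of neighbors per particle
                                  const unsigned int* d_head_list, // start index of each particle's neighbors
                                  unsigned int N)
    {
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= N)
        return;

    // __ldg() issues the load through the read-only data cache on sm_35+,
    // with no host-side texture binding or unbinding required
    float4 pos_i = __ldg(d_pos + idx);
    unsigned int n_neigh = __ldg(d_n_neigh + idx);
    unsigned int head = __ldg(d_head_list + idx);

    float4 acc = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
    for (unsigned int k = 0; k < n_neigh; ++k)
        {
        unsigned int j = __ldg(d_nlist + head + k); // neighbor index
        float4 pos_j = __ldg(d_pos + j);            // neighbor position
        acc.x += pos_j.x - pos_i.x;
        acc.y += pos_j.y - pos_i.y;
        acc.z += pos_j.z - pos_i.z;
        }
    d_out[idx] = acc;
    }

Because the kernels consume the raw device pointers directly, the host-side launchers no longer need to bind or unbind anything, which is what lets the compute_capability and max_tex1d_width parameters disappear from the driver signatures changed above.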