diff --git a/core/src/main/scala/org/apache/spark/ColumnPartitionData.scala b/core/src/main/scala/org/apache/spark/ColumnPartitionData.scala
index 476c2ece45..5c53f0a4fe 100644
--- a/core/src/main/scala/org/apache/spark/ColumnPartitionData.scala
+++ b/core/src/main/scala/org/apache/spark/ColumnPartitionData.scala
@@ -17,8 +17,6 @@
 
 package org.apache.spark
 
-import jcuda.driver.CUmodule
-import jcuda.runtime.{cudaStream_t, cudaMemcpyKind, JCuda}
 import org.apache.spark.storage.{RDDBlockId, BlockId}
 
 import math._
@@ -31,11 +29,9 @@ import java.nio.{ByteBuffer, ByteOrder}
 import java.io.{ObjectInputStream, ObjectOutputStream}
 
 import org.apache.spark.annotation.{DeveloperApi, Experimental}
-import org.apache.spark.unsafe.memory.MemoryBlock
 import org.apache.spark.util.IteratorFunctions._
 import org.apache.spark.util.Utils
-
-import jcuda.Pointer
+import org.apache.spark.unsafe.memory.Pointer
 
 import org.slf4j.Logger
 import org.slf4j.LoggerFactory
@@ -61,6 +57,7 @@ class ColumnPartitionData[T](
   private var refCounter = 1
 
   // TODO blockId can never be NULL, modify the testcase to pass valid blockId and remove (0,0).
+  val cudaManager = SparkEnv.get.cudaManager
   var blockId : Option[BlockId] = Some(RDDBlockId(0, 0))
   def rddId : Int = blockId.getOrElse(RDDBlockId(0, 0)).asRDDId.get.rddId
   def cachedGPUPointers : HashMap[String, Pointer] =
@@ -194,7 +191,7 @@ class ColumnPartitionData[T](
     order.map(columnsByAccessors(_))
   }
 
-  private[spark] def orderedGPUPointers(order: Seq[String], stream : cudaStream_t):
+  private[spark] def orderedGPUPointers(order: Seq[String], devIx: Int):
     Vector[Pointer] = {
     var gpuPtrs = Vector[Pointer]()
     var gpuBlobs = Vector[Pointer]()
@@ -204,7 +201,7 @@
     val inPointers = orderedPointers(order)
     for ((col, name, cpuPtr) <- (inColumns, order, inPointers).zipped) {
       gpuPtrs = gpuPtrs :+ cachedGPUPointers.getOrElseUpdate(blockId.get + name, {
-        val gpuPtr = SparkEnv.get.cudaManager.allocGPUMemory(col.memoryUsage(size))
+        val gpuPtr = cudaManager.allocGPUMemory(col.memoryUsage(size))
         memCpys = memCpys :+ (gpuPtr, cpuPtr, col.memoryUsage(size))
         gpuPtr
       })
@@ -215,7 +212,7 @@
     for ((blob, name, cpuPtr) <-
         (inBlobBuffers, (1 to inBlobBuffers.length).map(_.toString), inBlobs).zipped) {
       gpuBlobs = gpuBlobs :+ cachedGPUPointers.getOrElseUpdate(blockId.get + name, {
-        val gpuPtr = SparkEnv.get.cudaManager.allocGPUMemory(blob.capacity())
+        val gpuPtr = cudaManager.allocGPUMemory(blob.capacity())
         memCpys = memCpys :+ (gpuPtr, cpuPtr, blob.capacity().toLong)
         gpuPtr
       })
@@ -233,8 +230,7 @@
      */
     for ((gpuPtr, cpuPtr, length) <- memCpys) {
-      JCuda.cudaMemcpyAsync(gpuPtr, cpuPtr, length,
-        cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
+      cudaManager.memcpyH2DASync(gpuPtr, cpuPtr, length, devIx)
     }
 
     gpuPtrs ++ gpuBlobs
   }
@@ -244,7 +240,7 @@ if (!gpuCache) {
     for ((name, ptr) <- cachedGPUPointers) {
       if (name.startsWith(blockId.get.toString)) {
-        SparkEnv.get.cudaManager.freeGPUMemory(ptr)
+        cudaManager.freeGPUMemory(ptr)
         cachedGPUPointers.remove(name)
       }
     }
   }
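Note (not part of the patch): orderedGPUPointers now takes a plain device index instead of a jcuda stream, and reuses device allocations through cachedGPUPointers, keyed by blockId plus column name, so a gpuCache'd partition skips re-upload. A minimal standalone sketch of that keying pattern, with a hypothetical Long standing in for a real device pointer:

    import scala.collection.mutable.HashMap

    object GpuPtrCacheSketch {
      type DevPtr = Long                      // stand-in for a real device pointer
      private val cached = HashMap[String, DevPtr]()
      private var next = 0L
      private def alloc(size: Long): DevPtr = { next += size; next }  // hypothetical allocator

      // Allocate once per (block, column); later calls with the same key reuse the pointer.
      def pointerFor(blockId: String, column: String, size: Long): DevPtr =
        cached.getOrElseUpdate(blockId + column, alloc(size))
    }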
diff --git a/core/src/main/scala/org/apache/spark/cuda/CUDAFunction.scala b/core/src/main/scala/org/apache/spark/cuda/CUDAFunction.scala
index 28d7211fb4..13a75077ab 100644
--- a/core/src/main/scala/org/apache/spark/cuda/CUDAFunction.scala
+++ b/core/src/main/scala/org/apache/spark/cuda/CUDAFunction.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
+
 package org.apache.spark.cuda
 
 import org.apache.spark.storage.BlockId
 
@@ -25,20 +26,14 @@ import java.net.URL
 import java.nio.ByteBuffer
 import java.nio.file.{Files, Paths}
 
-import jcuda.Pointer
 import jcuda.driver.CUfunction
-import jcuda.driver.CUmodule
-import jcuda.driver.CUstream
-import jcuda.driver.JCudaDriver
-import jcuda.runtime.cudaStream_t
-import jcuda.runtime.cudaMemcpyKind
-import jcuda.runtime.JCuda
 import org.apache.commons.io.IOUtils
 
 import org.apache.spark.rdd.ExternalFunction
 import org.apache.spark.{PartitionData, ColumnPartitionData, ColumnPartitionSchema, SparkEnv, SparkException}
 import org.apache.spark.util.Utils
+import org.apache.spark.unsafe.memory.Pointer
 
 /**
  * A CUDA kernel wrapper. Contains CUDA module, information how to extract CUDA kernel from it and
@@ -86,28 +81,20 @@ class CUDAFunction(
       outputArraySizes: Seq[Long] = null,
       inputFreeVariables: Seq[Any] = null,
       blockId : Option[BlockId] = None): ColumnPartitionData[U] = {
+    val cudaManager = SparkEnv.get.cudaManager
     val outputSchema = ColumnPartitionSchema.schemaFor[U]
 
     // TODO add array size
     val memoryUsage = (if (in.gpuCached) 0 else in.memoryUsage) + outputSchema.memoryUsage(in.size)
-    val streamDevIx = SparkEnv.get.cudaManager.getStream(memoryUsage, in.gpuDevIx)
-    val stream = streamDevIx._1
+    val devIx = cudaManager.getDevice(memoryUsage, in.gpuDevIx)
     if (in.gpuCache) {
-      in.gpuDevIx = streamDevIx._2
+      in.gpuDevIx = devIx
     }
 
    // TODO cache the function if there is a chance that after a deserialization kernel gets called
    // multiple times - but only if no synchronization is needed for that
-    val module = resource match {
-      case url: URL =>
-        SparkEnv.get.cudaManager.cachedLoadModule(Left(url))
-      case (name: String, ptx: String) =>
-        SparkEnv.get.cudaManager.cachedLoadModule(Right(name, ptx))
-      case _ => throw new SparkException("Unsupported resource type for CUDAFunction")
-    }
-    val function = new CUfunction
-    JCudaDriver.cuModuleGetFunction(function, module, kernelSignature)
+    val function = cudaManager.moduleGetFunction(resource, kernelSignature)
 
     val actualOutputSize = outputSize.getOrElse(in.size)
     val out = if (outputArraySizes == null) {
@@ -127,8 +114,8 @@ class CUDAFunction(
     val outColumns = out.schema.orderedColumns(outputColumnsOrder)
     for (col <- outColumns) {
       val size = col.memoryUsage(out.size)
-      val gpuPtr = SparkEnv.get.cudaManager.allocGPUMemory(size)
-      JCuda.cudaMemsetAsync(gpuPtr, 0, size, stream)
+      val gpuPtr = cudaManager.allocGPUMemory(size)
+      cudaManager.memsetASync(gpuPtr, 0, size, devIx)
       gpuOutputPtrs = gpuOutputPtrs :+ gpuPtr
     }
 
@@ -144,31 +131,31 @@ class CUDAFunction(
       case v: Array[Byte] =>
         val len = v.length
         cpuInputFreeVars = cpuInputFreeVars :+ (Pointer.to(v), len)
-        SparkEnv.get.cudaManager.allocGPUMemory(len)
+        cudaManager.allocGPUMemory(len)
      case v: Array[Char] =>
        val len = v.length * 2
        cpuInputFreeVars = cpuInputFreeVars :+ (Pointer.to(v), len)
-        SparkEnv.get.cudaManager.allocGPUMemory(len)
+        cudaManager.allocGPUMemory(len)
      case v: Array[Short] =>
        val len = v.length * 2
        cpuInputFreeVars = cpuInputFreeVars :+ (Pointer.to(v), len)
-        SparkEnv.get.cudaManager.allocGPUMemory(len)
+        cudaManager.allocGPUMemory(len)
      case v: Array[Int] =>
        val len = v.length * 4
        cpuInputFreeVars = cpuInputFreeVars :+ (Pointer.to(v), len)
-        SparkEnv.get.cudaManager.allocGPUMemory(len)
+        cudaManager.allocGPUMemory(len)
      case v: Array[Long] =>
        val len = v.length * 8
        cpuInputFreeVars = cpuInputFreeVars :+ (Pointer.to(v), len)
-        SparkEnv.get.cudaManager.allocGPUMemory(len)
+        cudaManager.allocGPUMemory(len)
      case v: Array[Float] =>
        val len = v.length * 4
        cpuInputFreeVars = cpuInputFreeVars :+ (Pointer.to(v), len)
-        SparkEnv.get.cudaManager.allocGPUMemory(len)
+        cudaManager.allocGPUMemory(len)
      case v: Array[Double] =>
        val len = v.length * 8
        cpuInputFreeVars = cpuInputFreeVars :+ (Pointer.to(v), len)
-        SparkEnv.get.cudaManager.allocGPUMemory(len)
+        cudaManager.allocGPUMemory(len)
      case _ => throw new SparkException("Unsupported type passed to kernel "
        + "as a free variable argument")
    }
@@ -180,17 +167,16 @@ class CUDAFunction(
       if (out.blobBuffers != null) {out.blobBuffers} else {Array[ByteBuffer]()}
     for (blob <- outBlobBuffers) {
       val size = blob.capacity()
-      val gpuPtr = SparkEnv.get.cudaManager.allocGPUMemory(size)
-      JCuda.cudaMemsetAsync(gpuPtr, 0, size, stream)
+      val gpuPtr = cudaManager.allocGPUMemory(size)
+      cudaManager.memsetASync(gpuPtr, 0, size, devIx)
       gpuOutputBlobs = gpuOutputBlobs :+ gpuPtr
     }
 
     // perform allocGPUMemory and cudaMemcpyAsync
-    val gpuInputPtrs = in.orderedGPUPointers(inputColumnsOrder, stream)
+    val gpuInputPtrs = in.orderedGPUPointers(inputColumnsOrder, devIx)
 
     for (((cpuPtr, size), gpuPtr) <- (cpuInputFreeVars zip inputFreeVarPtrs)) {
-      JCuda.cudaMemcpyAsync(gpuPtr, cpuPtr, size,
-        cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
+      cudaManager.memcpyH2DASync(gpuPtr, cpuPtr, size, devIx)
     }
 
     val gpuPtrParams = (gpuInputPtrs ++
@@ -208,8 +194,6 @@ class CUDAFunction(
         + "argument")
     }
 
-    val wrappedStream = new CUstream(stream)
-
     stagesCount match {
       // normal launch, no stages, suitable for map
       case None =>
@@ -220,16 +204,16 @@
 
         val (gpuGridSize, gpuBlockSize) = dimensions match {
           case Some(computeDim) => computeDim(in.size, 1)
-          case None => SparkEnv.get.cudaManager.computeDimensions(in.size)
+          case None => cudaManager.computeDimensions(in.size)
         }
 
-        JCudaDriver.cuLaunchKernel(
+        cudaManager.launchKernel(
           function,
           gpuGridSize, 1, 1,
           gpuBlockSize, 1, 1,
           0,
-          wrappedStream,
-          kernelParameters, null)
+          devIx,
+          kernelParameters)
 
       // launch kernel multiple times (multiple stages), suitable for reduce
       case Some(totalStagesFun) =>
@@ -253,39 +237,37 @@
            throw new SparkException("Dimensions must be provided for multi-stage kernels")
          }
 
-          JCudaDriver.cuLaunchKernel(
+          cudaManager.launchKernel(
            function,
            gpuGridSize, 1, 1,
            gpuBlockSize, 1, 1,
            0,
-            wrappedStream,
-            kernelParameters, null)
+            devIx,
+            kernelParameters)
        }
    }
 
     val outPointers = out.orderedPointers(outputColumnsOrder)
     for ((cpuPtr, gpuPtr, col) <- (outPointers, gpuOutputPtrs, outColumns).zipped) {
-      JCuda.cudaMemcpyAsync(cpuPtr, gpuPtr, col.memoryUsage(out.size),
-        cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
+      cudaManager.memcpyD2HASync(cpuPtr, gpuPtr, col.memoryUsage(out.size), devIx)
     }
 
     for ((cpuPtr, gpuPtr, blob) <- (outBlobs, gpuOutputBlobs, outBlobBuffers).zipped) {
-      JCuda.cudaMemcpyAsync(cpuPtr, gpuPtr, blob.capacity(),
-        cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
+      cudaManager.memcpyD2HASync(cpuPtr, gpuPtr, blob.capacity(), devIx)
     }
 
     if (!in.gpuCache || ((gpuOutputPtrs.size + gpuOutputBlobs.size) > 0)) {
-      JCuda.cudaStreamSynchronize(stream)
+      cudaManager.streamSynchronize(devIx)
     }
 
       out.blockId = blockId
       out
     } {
       in.freeGPUPointers()
       for (ptr <- gpuOutputPtrs) {
-        SparkEnv.get.cudaManager.freeGPUMemory(ptr)
+        cudaManager.freeGPUMemory(ptr)
       }
       for (ptr <- gpuOutputBlobs) {
-        SparkEnv.get.cudaManager.freeGPUMemory(ptr)
+        cudaManager.freeGPUMemory(ptr)
       }
     }
   } catch {
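Note (not part of the patch): with this change CUDAFunction no longer imports any jcuda type except CUfunction; streams, memcpy kinds, and kernel launches are hidden behind CUDAManager and addressed by an Int device index. A hedged sketch of the shape of that indirection (GpuBackend and upload are illustrative names only, with Longs standing in for pointers):

    // Callers compile against plain Ints and this trait, with no jcuda imports.
    trait GpuBackend {
      def memcpyH2DASync(dst: Long, src: Long, length: Long, devIx: Int): Unit
      def streamSynchronize(devIx: Int): Unit
    }

    def upload(gpu: GpuBackend, dst: Long, src: Long, n: Long, devIx: Int): Unit = {
      gpu.memcpyH2DASync(dst, src, n, devIx)  // async; ordered on the devIx stream
      gpu.streamSynchronize(devIx)            // host waits only at the end
    }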
diff --git a/core/src/main/scala/org/apache/spark/cuda/CUDAManager.scala b/core/src/main/scala/org/apache/spark/cuda/CUDAManager.scala
index 2cd6960895..657d0ee818 100644
--- a/core/src/main/scala/org/apache/spark/cuda/CUDAManager.scala
+++ b/core/src/main/scala/org/apache/spark/cuda/CUDAManager.scala
@@ -25,19 +25,20 @@ import java.nio.ByteBuffer
 import java.nio.file.{Files, Paths}
 
 import jcuda.CudaException;
-import jcuda.Pointer
 import jcuda.driver.CUcontext
 import jcuda.driver.CUdevice
 import jcuda.driver.CUdevice_attribute
 import jcuda.driver.CUfunction
 import jcuda.driver.CUmodule
-import jcuda.driver.CUresult;
+import jcuda.driver.CUresult
+import jcuda.driver.CUstream
 import jcuda.driver.JCudaDriver
-import jcuda.runtime.cudaStream_t
+import jcuda.runtime.{cudaMemcpyKind, cudaStream_t}
 import jcuda.runtime.JCuda
 import org.apache.commons.io.IOUtils
 
 import org.apache.spark.SparkException
+import org.apache.spark.unsafe.memory.Pointer
 
 import org.slf4j.Logger
 import org.slf4j.LoggerFactory
@@ -77,13 +78,14 @@ class CUDAManager {
   }
 
   private val allStreams = MutableList[Array[cudaStream_t]]()
+
   private def allocateThreadStreams: Array[cudaStream_t] = {
     val threadStreams = (0 to deviceCount - 1).map { devIx =>
       JCuda.cudaSetDevice(devIx)
       val stream = new cudaStream_t
       JCuda.cudaStreamCreateWithFlags(stream, JCuda.cudaStreamNonBlocking)
       stream
-    } .toArray
+    }.toArray
     synchronized {
       allStreams += threadStreams
     }
@@ -100,13 +102,17 @@
     }
   }
 
+  private def getStream(devIx: Int): cudaStream_t = {
+    return streams.get.apply(devIx)
+  }
+
   /**
-  * Chooses a device to work on and returns a stream for it.
-  */
+   * Chooses a device to work on and returns a stream for it.
+   */
   // TODO make sure only specified amount of tasks at once uses GPU
   // TODO make sure that amount of conversions is minimized by giving GPU to appropriate tasks,
   // task context might be required for that
-  private[spark] def getStream(memoryUsage: Long, gpuDevIx: Int): (cudaStream_t, Int) = {
+  private[spark] def getDevice(memoryUsage: Long, gpuDevIx: Int): Int = {
     if (deviceCount == 0) {
       throw new SparkException("No available CUDA devices to create a stream")
     }
@@ -139,7 +145,7 @@
       // allocating it now?)
       // TODO GPU memory pooling - no need to reallocate, since usually exact same sizes of memory
       // chunks will be required
-      return (streams.get.apply(devIx), devIx)
+      return devIx
     }
   }
 
@@ -147,7 +153,7 @@
       s"($memoryUsage bytes needed)")
   }
 
-  private[spark] def cachedLoadModule(resource: Either[URL, (String, String)]): CUmodule = {
+  private def cachedLoadModule(resource: Either[URL, (String, String)]): CUmodule = {
     var resourceURL: URL = null
     var key: String = null
     var ptxString: String = null
@@ -181,7 +187,7 @@
     }
 
     val moduleBinaryData0 = new Array[Byte](moduleBinaryData.length + 1)
-    System.arraycopy(moduleBinaryData, 0, moduleBinaryData0, 0, moduleBinaryData.length)
+    System.arraycopy(moduleBinaryData, 0, moduleBinaryData0, 0, moduleBinaryData.length)
     moduleBinaryData0(moduleBinaryData.length) = 0
     val module = new CUmodule
     JCudaDriver.cuModuleLoadData(module, moduleBinaryData0)
@@ -192,7 +198,7 @@
 
   private[spark] def allocGPUMemory(size: Long): Pointer = {
     require(size >= 0)
-    val ptr = new Pointer
+    val ptr = new jcuda.Pointer
     if (CUDAManager.logger.isDebugEnabled()) {
       CUDAManager.logger.debug(s"Allocating ${size}B of GPU memory (Thread ID " +
         s"${Thread.currentThread.getId})");
@@ -201,12 +207,55 @@
     if (result != CUresult.CUDA_SUCCESS) {
       throw new CudaException("Cannot allocate GPU memory: " + JCuda.cudaGetErrorString(result));
     }
-    assert(size == 0 || ptr != new Pointer())
-    ptr
+    assert(size == 0 || ptr != new jcuda.Pointer())
+    new Pointer(ptr)
   }
 
   private[spark] def freeGPUMemory(ptr: Pointer) {
-    JCuda.cudaFree(ptr)
+    JCuda.cudaFree(ptr.getJPointer())
+  }
+
+  private[spark] def memcpyH2DASync(gpuPtr: Pointer, cpuPtr: Pointer, length: Long, devIx: Int) {
+    JCuda.cudaMemcpyAsync(gpuPtr.getJPointer(), cpuPtr.getJPointer(), length,
+      cudaMemcpyKind.cudaMemcpyHostToDevice, getStream(devIx))
+  }
+
+  private[spark] def memcpyD2HASync(cpuPtr: Pointer, gpuPtr: Pointer, length: Long, devIx: Int) {
+    JCuda.cudaMemcpyAsync(cpuPtr.getJPointer(), gpuPtr.getJPointer(), length,
+      cudaMemcpyKind.cudaMemcpyDeviceToHost, getStream(devIx))
+  }
+
+  private[spark] def memsetASync(gpuPtr: Pointer, value: Byte, length: Long, devIx: Int) {
+    JCuda.cudaMemsetAsync(gpuPtr.getJPointer(), value, length, getStream(devIx))
+  }
+
+  private[spark] def streamSynchronize(devIx: Int) {
+    JCuda.cudaStreamSynchronize(getStream(devIx))
+  }
+
+  private[spark] def moduleGetFunction(resource: Any, kernelSignature: String): CUfunction = {
+    val module = resource match {
+      case url: URL => cachedLoadModule(Left(url))
+      case (name: String, ptx: String) => cachedLoadModule(Right(name, ptx))
+      case _ => throw new SparkException("Unsupported resource type for CUDAFunction")
+    }
+    val function = new CUfunction
+    JCudaDriver.cuModuleGetFunction(function, module, kernelSignature)
+    function
+  }
+
+  private[spark] def launchKernel(f: CUfunction,
+      gridDimX: Int, gridDimY: Int, gridDimZ: Int,
+      blockDimX: Int, blockDimY: Int, blockDimZ: Int,
+      sharedMemBytes: Int, devIx: Int,
+      kernelParams: Pointer) {
+    val stream = getStream(devIx)
+    val wrappedStream = new CUstream(stream)
+    JCudaDriver.cuLaunchKernel(f,
+      gridDimX, gridDimY, gridDimZ,
+      blockDimX, blockDimY, blockDimZ,
+      sharedMemBytes, wrappedStream,
+      kernelParams.getJPointer(), null)
   }
 
   private[spark] def computeDimensions(size: Long): (Int, Int) = {
@@ -228,11 +277,8 @@
   private[spark] def stop() {
     allStreams.flatten.foreach(JCuda.cudaStreamDestroy(_))
   }
-
 }
 
 object CUDAManager {
-
   private final val logger: Logger = LoggerFactory.getLogger(classOf[CUDAManager])
-
 }
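Note (not part of the patch): taken together, the new facade covers the whole host-to-device round trip. A sketch of the intended call sequence, modeled on CUDAManagerSuite (assumes at least one CUDA device; kernel launch elided):

    val manager = SparkEnv.get.cudaManager
    val data = Array.fill[Byte](1024)(42)
    val devIx = manager.getDevice(1024, -1)          // -1: no device cached yet
    val gpuPtr = manager.allocGPUMemory(1024)
    try {
      manager.memcpyH2DASync(gpuPtr, Pointer.to(data), 1024, devIx)
      manager.streamSynchronize(devIx)               // copies are async until here
    } finally {
      manager.freeGPUMemory(gpuPtr)
    }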
diff --git a/core/src/main/scala/org/apache/spark/cuda/GPUMemoryManager.scala b/core/src/main/scala/org/apache/spark/cuda/GPUMemoryManager.scala
index e2887958ad..c321eb65f5 100644
--- a/core/src/main/scala/org/apache/spark/cuda/GPUMemoryManager.scala
+++ b/core/src/main/scala/org/apache/spark/cuda/GPUMemoryManager.scala
@@ -17,11 +17,10 @@
 
 package org.apache.spark.cuda
 
-import jcuda.Pointer
 import org.apache.spark.{SparkEnv, SparkException}
 import org.apache.spark.rpc.{RpcEndpointRef, RpcCallContext, RpcEnv, ThreadSafeRpcEndpoint}
+import org.apache.spark.unsafe.memory.Pointer
 
-import scala.collection.mutable
 import scala.collection.mutable.{ListBuffer, HashMap}
 
diff --git a/core/src/test/scala/org/apache/spark/cuda/CUDAManagerSuite.scala b/core/src/test/scala/org/apache/spark/cuda/CUDAManagerSuite.scala
index d24ed64ffa..02aec9e02b 100644
--- a/core/src/test/scala/org/apache/spark/cuda/CUDAManagerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/cuda/CUDAManagerSuite.scala
@@ -19,12 +19,9 @@ package org.apache.spark.cuda
 
 import java.nio.ByteBuffer
 
-import jcuda.Pointer
-import jcuda.runtime.cudaMemcpyKind
-import jcuda.runtime.JCuda
-
 import org.apache.spark._
 import org.apache.spark.util.Utils
+import org.apache.spark.unsafe.memory.Pointer
 
 class CUDAManagerSuite extends SparkFunSuite with LocalSparkContext {
 
@@ -34,19 +31,17 @@ class CUDAManagerSuite extends SparkFunSuite with LocalSparkContext {
     sc = new SparkContext("local", "test", conf)
     val manager = SparkEnv.get.cudaManager
     if (manager.deviceCount > 0) {
-      val stream = manager.getStream(1024, -1)
+      val devIx = manager.getDevice(1024, -1)
       val gpuPtr = manager.allocGPUMemory(1024)
       Utils.tryWithSafeFinally {
-        JCuda.cudaMemcpy(gpuPtr, Pointer.to(Array.fill[Byte](1024)(42)), 1024,
-          cudaMemcpyKind.cudaMemcpyHostToDevice)
+        manager.memcpyH2DASync(gpuPtr, Pointer.to(Array.fill[Byte](1024)(42)), 1024, devIx)
         val arr = new Array[Byte](1024)
-        JCuda.cudaMemcpy(Pointer.to(ByteBuffer.wrap(arr)), gpuPtr, 1024,
-          cudaMemcpyKind.cudaMemcpyDeviceToHost)
-
+        manager.memcpyD2HASync(Pointer.to(ByteBuffer.wrap(arr)), gpuPtr, 1024, devIx)
+        manager.streamSynchronize(devIx)
         assert(arr.forall(_ == 42))
       } {
-        JCuda.cudaFree(gpuPtr)
+        manager.freeGPUMemory(gpuPtr)
       }
     } else {
       info("No CUDA devices, so skipping the test.")
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java b/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java
index f0f28f8830..33f9b467be 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java
@@ -19,6 +19,7 @@
 
 import javax.annotation.concurrent.GuardedBy;
 import java.lang.ref.WeakReference;
+import java.nio.ByteBuffer;
 import java.util.HashMap;
 import java.util.LinkedList;
 import java.util.Map;
@@ -26,14 +27,10 @@
 
 import org.apache.spark.unsafe.Platform;
 
-import jcuda.Pointer;
-import jcuda.driver.CUresult;
-import jcuda.runtime.JCuda;
-import jcuda.CudaException;
-
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+
 /**
 * A simple {@link MemoryAllocator} that can allocate up to 16GB using a JVM long primitive array.
 */
@@ -46,7 +43,7 @@ public class HeapMemoryAllocator implements MemoryAllocator {
 
  @GuardedBy("this")
  private final Map<Long, LinkedList<WeakReference<MemoryBlock>>> bufferPoolsBySize =
-    new HashMap<>();
+      new HashMap<>();
 
  // TODO instead of allocating pinned memory allocate normal page-aligned memory (valloc with
  // JNI?) and pin it in CUDAManager
@@ -57,7 +54,7 @@ public class HeapMemoryAllocator implements MemoryAllocator {
  // Pointer content won't release itself, so no need for WeakReference
  @GuardedBy("this")
  private final Map<Long, LinkedList<Pointer>> pinnedMemoryBySize =
-    new HashMap<Long, LinkedList<Pointer>>();
+      new HashMap<Long, LinkedList<Pointer>>();
 
  private final Map<Pointer, Long> pinnedMemorySizes = new HashMap<Pointer, Long>();
 
@@ -139,7 +136,7 @@ public Pointer allocatePinnedMemory(long size) {
    synchronized (this) {
      final LinkedList<Pointer> pool = pinnedMemoryBySize.get(size);
      if (pool != null) {
-        assert(!pool.isEmpty());
+        assert (!pool.isEmpty());
        final Pointer ptr = pool.pop();
        if (pool.isEmpty()) {
          pinnedMemoryBySize.remove(size);
@@ -152,24 +149,17 @@ public Pointer allocatePinnedMemory(long size) {
      // TODO might be better to start from LRU size, currently freeing in arbitrary order
      if (maxPinnedMemory >= 0 && allocatedPinnedMemory + size > maxPinnedMemory) {
        Iterator<Map.Entry<Long, LinkedList<Pointer>>> it =
-          pinnedMemoryBySize.entrySet().iterator();
+            pinnedMemoryBySize.entrySet().iterator();
        while (it.hasNext() && allocatedPinnedMemory + size > maxPinnedMemory) {
          Map.Entry<Long, LinkedList<Pointer>> sizeAndList = it.next();
-          assert(!sizeAndList.getValue().isEmpty());
+          assert (!sizeAndList.getValue().isEmpty());
 
          Iterator<Pointer> listIt = sizeAndList.getValue().iterator();
          do {
            Pointer ptr = listIt.next();
-            try {
-              int result = JCuda.cudaFreeHost(ptr);
-              if (result != CUresult.CUDA_SUCCESS) {
-                throw new CudaException(JCuda.cudaGetErrorString(result));
-              }
-            } catch (CudaException ex) {
-              throw new OutOfMemoryError("Could not free pinned memory: " + ex.getMessage());
-            }
+            freeMemory(ptr);
            listIt.remove();
            pinnedMemorySizes.remove(ptr);
            allocatedPinnedMemory -= sizeAndList.getKey();
@@ -185,15 +175,7 @@ public Pointer allocatePinnedMemory(long size) {
      }
    }
 
-    Pointer ptr = new Pointer();
-    try {
-      int result = JCuda.cudaHostAlloc(ptr, size, JCuda.cudaHostAllocPortable);
-      if (result != CUresult.CUDA_SUCCESS) {
-        throw new CudaException(JCuda.cudaGetErrorString(result));
-      }
-    } catch (Exception ex) {
-      throw new OutOfMemoryError("Could not alloc pinned memory: " + ex.getMessage());
-    }
+    Pointer ptr = allocateMemory(size);
    pinnedMemorySizes.put(ptr, size);
    allocatedPinnedMemory += size;
    return ptr;
@@ -223,15 +205,8 @@ protected void finalize() {
    // Deallocating off-heap pinned memory pool
    for (Map.Entry<Long, LinkedList<Pointer>> sizeAndList : pinnedMemoryBySize.entrySet()) {
      for (Pointer ptr : sizeAndList.getValue()) {
-        try {
-          int result = JCuda.cudaFreeHost(ptr);
-          if (result != CUresult.CUDA_SUCCESS) {
-            throw new CudaException(JCuda.cudaGetErrorString(result));
-          }
-          allocatedPinnedMemory -= sizeAndList.getKey();
-        } catch (CudaException ex) {
-          throw new OutOfMemoryError("Could not free pinned memory: " + ex.getMessage());
-        }
+        freeMemory(ptr);
+        allocatedPinnedMemory -= sizeAndList.getKey();
      }
    }
 
@@ -263,4 +238,30 @@ long getUsedAllocatedPinnedMemorySize() {
      return size;
    }
  }
+
+
+  private Pointer allocateMemory(long size) {
+    jcuda.Pointer ptr = new jcuda.Pointer();
+    try {
+      int result = jcuda.runtime.JCuda.cudaHostAlloc(ptr, size,
+          jcuda.runtime.JCuda.cudaHostAllocPortable);
+      if (result != jcuda.driver.CUresult.CUDA_SUCCESS) {
+        throw new jcuda.CudaException(jcuda.runtime.JCuda.cudaGetErrorString(result));
+      }
+    } catch (jcuda.CudaException ex) {
+      throw new OutOfMemoryError("Could not alloc pinned memory: " + ex.getMessage());
+    }
+    return new Pointer(ptr);
+  }
+
+  private void freeMemory(Pointer ptr) {
+    try {
+      int result = jcuda.runtime.JCuda.cudaFreeHost(ptr.getJPointer());
+      if (result != jcuda.driver.CUresult.CUDA_SUCCESS) {
+        throw new jcuda.CudaException(jcuda.runtime.JCuda.cudaGetErrorString(result));
+      }
+    } catch (jcuda.CudaException ex) {
+      throw new OutOfMemoryError("Could not free pinned memory: " + ex.getMessage());
+    }
+  }
 }
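Note (not part of the patch): allocatePinnedMemory recycles host buffers only on an exact size match, which fits the TODO above about workloads that repeatedly request identical chunk sizes. A simplified sketch of that size-bucketed strategy (PinnedPoolSketch is an illustrative name; eviction under maxPinnedMemory is omitted):

    import scala.collection.mutable

    class PinnedPoolSketch[P](alloc: Long => P) {
      private val bySize = mutable.HashMap[Long, mutable.ListBuffer[P]]()

      def acquire(size: Long): P = bySize.get(size) match {
        case Some(pool) =>
          val p = pool.remove(0)
          if (pool.isEmpty) bySize.remove(size)  // mirrors pinnedMemoryBySize.remove(size)
          p
        case None => alloc(size)                 // pool miss: allocate fresh
      }

      def release(size: Long, p: P): Unit =
        bySize.getOrElseUpdate(size, mutable.ListBuffer[P]()) += p
    }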
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/memory/Pointer.java b/unsafe/src/main/java/org/apache/spark/unsafe/memory/Pointer.java
new file mode 100644
index 0000000000..5e9578e382
--- /dev/null
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/memory/Pointer.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.unsafe.memory;
+
+import java.nio.Buffer;
+import java.nio.ByteBuffer;
+
+
+public final class Pointer {
+  private jcuda.Pointer jcudaPointer;
+  public Pointer(jcuda.Pointer ptr) {
+    jcudaPointer = ptr;
+  }
+  public jcuda.Pointer getJPointer() {
+    return jcudaPointer;
+  }
+  public ByteBuffer getByteBuffer(long offset, long size) {
+    return jcudaPointer.getByteBuffer(offset, size);
+  }
+  public static Pointer to(Buffer buffer) { return new Pointer(jcuda.Pointer.to(buffer)); }
+  public static Pointer to(byte[] values) { return new Pointer(jcuda.Pointer.to(values)); }
+  public static Pointer to(char[] values) { return new Pointer(jcuda.Pointer.to(values)); }
+  public static Pointer to(double[] values) { return new Pointer(jcuda.Pointer.to(values)); }
+  public static Pointer to(float[] values) { return new Pointer(jcuda.Pointer.to(values)); }
+  public static Pointer to(int[] values) { return new Pointer(jcuda.Pointer.to(values)); }
+  public static Pointer to(long[] values) { return new Pointer(jcuda.Pointer.to(values)); }
+  public static Pointer to(short[] values) { return new Pointer(jcuda.Pointer.to(values)); }
+  public static Pointer to(Pointer... pointers) {
+    int len = pointers.length;
+    jcuda.Pointer[] jpointers = new jcuda.Pointer[len];
+    for (int i = 0; i < len; i++) { jpointers[i] = pointers[i].getJPointer(); }
+    return new Pointer(jcuda.Pointer.to(jpointers));
+  }
+  public static Pointer to(jcuda.NativePointerObject... pointers) {
+    return new Pointer(jcuda.Pointer.to(pointers));
+  }
+}
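Note (not part of the patch): the wrapper keeps the underlying jcuda.Pointer private behind getJPointer(), and the varargs to(Pointer...) overload re-wraps nested pointers, so kernel parameter lists can still be composed without importing jcuda. An illustrative fragment (function, grid, and block values elided):

    import org.apache.spark.unsafe.memory.Pointer

    val xs = Array(1, 2, 3)
    val n = Array(xs.length)
    // Each argument slot is itself a pointer, as cuLaunchKernel expects.
    val kernelParameters: Pointer = Pointer.to(Pointer.to(n), Pointer.to(xs))
    // manager.launchKernel(function, gridDim, 1, 1, blockDim, 1, 1, 0, devIx, kernelParameters)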
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/memory/HeapMemoryAllocatorSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/memory/HeapMemoryAllocatorSuite.java
index 13ba6784f2..63a457ccc5 100644
--- a/unsafe/src/test/java/org/apache/spark/unsafe/memory/HeapMemoryAllocatorSuite.java
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/memory/HeapMemoryAllocatorSuite.java
@@ -17,10 +17,6 @@
 
 package org.apache.spark.unsafe.memory;
 
-import jcuda.Pointer;
-import jcuda.driver.JCudaDriver;
-import jcuda.runtime.JCuda;
-
 import org.junit.Assert;
 import org.junit.BeforeClass;
 import org.junit.Test;
@@ -30,9 +26,12 @@ public class HeapMemoryAllocatorSuite {
  @BeforeClass
  public static void setUp() {
    // normally it's set by CUDAManager
-    JCudaDriver.setExceptionsEnabled(true);
-    JCudaDriver.cuInit(0);
-    JCuda.cudaSetDevice(0);
+    /*
+    @@@CUDA
+    jcuda.driver.JCudaDriver.setExceptionsEnabled(true);
+    jcuda.driver.JCudaDriver.cuInit(0);
+    jcuda.runtime.JCuda.cudaSetDevice(0);
+    */
  }
 
  @Test
@@ -44,7 +43,7 @@ public void allocatedAndFreedMemoryIsPooled() {
    Pointer ptr1 = manager.allocatePinnedMemory(2048);
 
    // this is a way of checking that ptr1 is not null internally (native address is private)
-    Assert.assertTrue(!ptr1.equals(new Pointer()));
+    //Assert.assertTrue(!ptr1.equals(new Pointer()));
 
    Assert.assertEquals(2048, manager.getAllocatedPinnedMemorySize());
    Assert.assertEquals(2048, manager.getUsedAllocatedPinnedMemorySize());