
Commit 682dde6

Scott Gray authored and apark263 committed
generalized fast transpose/dimshuffle
1 parent e110534 commit 682dde6

File tree

3 files changed: +223 -1 lines changed

neon/backends/convolution.py (+181)
@@ -22,6 +22,11 @@
 from neon.backends import kernel_specs
 from neon.backends.cuda_templates import _common_round, _ew_types
 from math import ceil
+from operator import mul
+import sys
+
+if sys.version_info >= (3, 0):
+    from functools import reduce
 
 
 class KernelGroup(object):
@@ -970,3 +975,179 @@ def _get_shuffle_kernel(dtype):
     kernel = module.get_function("dimShuffle")
     kernel.prepare("PPIIIIIIIIIIIIIIII")
     return kernel
+
+
+@context_dependent_memoize
+def _get_copy_transpose_kernel(dtype, shape, axes=None):
+
+    src = range(len(shape))
+    dst = list(axes)
+
+    src_contig = src[-1]
+    dst_contig = dst[-1]
+
+    assert src_contig != dst_contig, "Inner dimension must change (for now)"
+
+    dim_params = []
+    dim_values = []
+    in_offset = []
+    out_offset = []
+    magic_params = []
+    magic_values = []
+    magic = ""
+
+    for i, s in enumerate(src):
+
+        idx = "".join(str(x) for x in src[i+1:])
+        val = reduce(mul, (shape[x] for x in src[i+1:]), 1)
+
+        if s == dst_contig:
+            in_dim_j = "dim_%s" % idx
+        elif idx:
+            in_offset.append("idx_%d*dim_%s" % (s, idx))
+        else:
+            in_offset.append("idx_%d" % s)
+
+        if idx:
+            dim_params.append("int dim_%s" % idx)
+            dim_values.append(val)
+
+    for i, d in enumerate(dst):
+
+        idx = "".join(str(x) for x in dst[i+1:])
+        val = reduce(mul, (shape[x] for x in dst[i+1:]), 1)
+
+        if d == src_contig:
+            out_dim_j = "dim_%s" % idx
+
+        if idx:
+            dim_params.append("int dim_%s" % idx)
+            dim_values.append(val)
+
+            out_offset.append("idx_%d*dim_%s" % (d, idx))
+        else:
+            out_offset.append("idx_%d" % d)
+
+    src2 = list(src)
+    src2[dst_contig:dst_contig+1] = ()
+
+    blk = compound_idx = "".join(str(x) for x in src2)
+
+    grid_shape = list(shape)
+    grid_shape[src_contig] = _ceil_div(shape[src_contig], 32)
+    grid_shape[dst_contig] = _ceil_div(shape[dst_contig], 32)
+
+    while len(src2) > 1:
+
+        idx1 = src2[0]
+        src2[0:1] = ()
+        idx2 = "".join(str(i) for i in src2)
+        div = reduce(mul, (grid_shape[i] for i in src2), 1)
+
+        magic_params.append("int magic_%s, int shift_%s, int div_%s" % (idx2, idx2, idx2))
+        magic_values.append(_magic64(div))
+        magic_values.append(div)
+
+        magic += r"""
+    int idx_{1} = div64(idx_{0}, magic_{2}, shift_{2});
+    int idx_{2} = idx_{0} - idx_{1}*div_{2};
+""".format(compound_idx, idx1, idx2)
+
+        compound_idx = idx2
+
+    params = _flatten([dim_params, magic_params])
+    values = _flatten([dim_values, magic_values])
+
+    shuffle_kernel = r"""
+__device__ __forceinline__ int div64(int value, int magic, int shift)
+{
+    // if the divisor is a power of 2 the magic will be 1 and it's just a simple right shift
+    // Otherwise multiply by magic and right shift just the high bits
+    int result;
+    asm("{\n\t"
+        ".reg .pred p;\n\t"
+        ".reg .u64 res64;\n\t"
+        ".reg .u32 lo32, hi32;\n\t"
+        "setp.ne.s32 p, %%2, 1;\n\t"
+        "mul.wide.u32 res64, %%1, %%2;\n\t"
+        "mov.b64 {lo32, hi32}, res64;\n\t"
+        "selp.u32 hi32, hi32, %%1, p;\n\t"
+        "shr.u32 %%0, hi32, %%3;\n\t"
+        "}" : "=r"(result) : "r"(value), "r"(magic), "r"(shift));
+    return result;
+}
+
+__global__ void copy_transpose(%(type)s* out, const %(type)s* in, %(params)s)
+{
+    __shared__ %(type)s tile[32][33];
+
+    int tid_x = threadIdx.x;
+    int tid_y = threadIdx.y;
+    int idx_%(blk)s = blockIdx.x;
+    int idx_%(dst)s = blockIdx.y;
+
+    %(magic)s
+
+    idx_%(src)s = (idx_%(src)s << 5) + tid_x;
+    idx_%(dst)s = (idx_%(dst)s << 5) + tid_y;
+
+    int offset = %(in_offset)s;
+
+    #pragma unroll
+    for (int j = 0; j < 32; j += 8)
+    {
+        int idx_%(dst)sj = idx_%(dst)s + j;
+        if (idx_%(dst)sj < dim_%(dst)s && idx_%(src)s < dim_%(src)s)
+            tile[tid_y + j][tid_x] = in[idx_%(dst)sj*%(in_dim_j)s + offset];
+    }
+    __syncthreads();
+
+    %(type)s val00 = tile[tid_x][tid_y + 0];
+    %(type)s val08 = tile[tid_x][tid_y + 8];
+    %(type)s val16 = tile[tid_x][tid_y + 16];
+    %(type)s val24 = tile[tid_x][tid_y + 24];
+
+    idx_%(src)s += tid_y - tid_x;
+    idx_%(dst)s += tid_x - tid_y;
+
+    bool b%(dst)s = idx_%(dst)s < dim_%(dst)s;
+
+    %(type)s* out00 = out + %(out_offset)s;
+    %(type)s* out08 = out00 + %(out_dim_j)s*8;
+    %(type)s* out16 = out08 + %(out_dim_j)s*8;
+    %(type)s* out24 = out16 + %(out_dim_j)s*8;
+
+    if (idx_%(src)s + 0 < dim_%(src)s && b%(dst)s) *out00 = val00;
+    if (idx_%(src)s + 8 < dim_%(src)s && b%(dst)s) *out08 = val08;
+    if (idx_%(src)s + 16 < dim_%(src)s && b%(dst)s) *out16 = val16;
+    if (idx_%(src)s + 24 < dim_%(src)s && b%(dst)s) *out24 = val24;
+}
+"""
+    code = shuffle_kernel % dict(
+        type=_ew_types[dtype[1:]]["type"],
+        params=", ".join(params),
+        blk=blk,
+        src=src_contig,
+        dst=dst_contig,
+        magic=magic,
+        in_offset=" + ".join(in_offset),
+        out_offset=" + ".join(out_offset),
+        in_dim_j=in_dim_j,
+        out_dim_j=out_dim_j
+    )
+    module = SourceModule(code)
+    kernel = module.get_function("copy_transpose")
+    kernel.prepare("PP" + "I"*len(values))
+
+    grid_x = grid_shape[src_contig]
+    grid_y = grid_shape[dst_contig]
+    for s in src:
+        if s not in (src_contig, dst_contig):
+            grid_x *= grid_shape[s]
+
+    return dict(
+        kernel=kernel,
+        grid=(grid_x, grid_y, 1),
+        block=(32, 8, 1),
+        args=values
+    )
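Note (not part of the commit): the generated kernel packs every dimension except the output-contiguous one into blockIdx.x, then unpacks that compound index with div64() and the precomputed (magic, shift, div) triples, since hardware integer division is slow on the GPU. The sketch below is an illustration of that magic-number division idea only; magic_u32 and the Python div64 are hypothetical stand-ins, not the committed _magic64/_magic32 helpers.

def magic_u32(nmax, d):
    # Find (magic, shift) so that n // d == (n * magic) >> (32 + shift)
    # for all 0 <= n <= nmax (classic unsigned magic-number division).
    if d & (d - 1) == 0:                     # power of two: magic == 1, plain shift
        return 1, d.bit_length() - 1
    nc = ((nmax + 1) // d) * d - 1
    for p in range(32, 64):
        if 2 ** p > nc * (d - 1 - (2 ** p - 1) % d):
            return (2 ** p + d - 1 - (2 ** p - 1) % d) // d, p - 32
    raise ValueError("no magic number for divisor %d" % d)

def div64(value, magic, shift):
    # Mirrors the PTX in the kernel: magic == 1 means the divisor was a power
    # of two and a right shift suffices; otherwise keep the high 32 bits of
    # the 64-bit product and shift those.
    if magic == 1:
        return value >> shift
    return ((value * magic) >> 32) >> shift

# Sanity check against ordinary integer division.
for d in (3, 7, 32, 96, 100):
    magic, shift = magic_u32(0x7fffffff, d)
    assert all(div64(n, magic, shift) == n // d for n in range(0, 1 << 20, 37))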

neon/backends/nervanagpu.py (+41)
@@ -2231,6 +2231,47 @@ def cublas_dot(self, A, B, C, alpha=1.0, beta=0.0):
         else:
             raise TypeError("Unsupported type for cublas gemm")
 
+    def copy_transpose(self, a, out, axes=None, repeat=1):
+        """
+        Function to perform a fast copy transpose/dimshuffle operation.
+        Works just like numpy.transpose, but requires an output tensor argument.
+        """
+        assert a.dtype == out.dtype
+        assert a.size == out.size
+        assert a.gpudata != out.gpudata
+
+        if axes is None:
+            axes = tuple(range(len(a.shape)-1, -1, -1))
+        elif type(axes) is not tuple:
+            axes = tuple(axes)
+
+        assert all(out.shape[i] == a.shape[x] for i, x in enumerate(axes))
+
+        from neon.backends.convolution import _get_copy_transpose_kernel
+
+        kernel_data = _get_copy_transpose_kernel(a.dtype.str, a.shape, axes)
+
+        # Warmup
+        if repeat > 1:
+            for r in range(max(repeat // 10, 1)):
+                kernel_data["kernel"].prepared_async_call(kernel_data["grid"], kernel_data["block"],
+                                                          self.stream, out.gpudata, a.gpudata, *kernel_data["args"])
+
+        if self.bench > 1 or repeat > 1:
+            start, end = _get_events()
+            start.record(self.stream)
+
+        for r in range(repeat):
+            kernel_data["kernel"].prepared_async_call(kernel_data["grid"], kernel_data["block"],
+                                                      self.stream, out.gpudata, a.gpudata, *kernel_data["args"])
+
+        if self.bench > 1 or repeat > 1:
+            end.record(self.stream)
+            end.synchronize()
+            msecs = end.time_since(start) / repeat
+            bandwidth = a.nbytes*2 / (msecs * 1024 * 1024)
+            print("%7.3f msecs %4.0f GBps copy_transpose" % (msecs, bandwidth))
+
     def init_mark(self):
         """
         Generate a timing mark object
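Not part of the commit: a minimal usage sketch of the new method, checked against numpy.transpose. It assumes the usual NervanaGPU tensor helpers (array, empty, get) behave as they do elsewhere in the backend.

import numpy as np
from neon.backends.nervanagpu import NervanaGPU

ng = NervanaGPU()

axes = (2, 0, 1)
a_host = np.random.rand(64, 32, 128).astype(np.float32)

a = ng.array(a_host)
out = ng.empty(tuple(a_host.shape[i] for i in axes), dtype=np.float32)

ng.copy_transpose(a, out, axes=axes)           # fast dimshuffle on the GPU
assert np.allclose(out.get(), np.transpose(a_host, axes))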

neon/data/imageloader.py (+1 -1)

@@ -111,7 +111,7 @@ def __init__(self, repo_dir, inner_size, scale_range, do_transforms=True,
 
         # View for subtracting the mean.
         # Find a shape that's fast for ew broadcast
-        image_dim = self.data.reshape((ishape[0],-1)).shape[1]
+        image_dim = self.data.reshape((ishape[0], -1)).shape[1]
         fast_dim = [i for i in range(1, 257) if image_dim % i == 0][-1]
         self.data_view = self.data.reshape((ishape[0], image_dim//fast_dim, fast_dim))
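Aside (illustration only, not in the commit): fast_dim is simply the largest divisor of image_dim that does not exceed 256, so the mean-subtraction view keeps a reasonably wide contiguous inner axis for elementwise broadcast. For a hypothetical image_dim:

image_dim = 224 * 224                                           # hypothetical value
fast_dim = [i for i in range(1, 257) if image_dim % i == 0][-1]
print(fast_dim, image_dim // fast_dim)                          # -> 256 196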
