diff --git a/src/nnlib.jl b/src/nnlib.jl
index ef3e85b..bd92079 100644
--- a/src/nnlib.jl
+++ b/src/nnlib.jl
@@ -2,18 +2,31 @@ using NNlib
 using NNlib: expand
 using NNlib: PoolDims
 
-import NNlib: conv
+import NNlib: conv, depthwiseconv
 
-function NNlib.conv(x::Tensor{xT, N}, w::Tensor, b::Tensor{T}, cdims::DenseConvDims{M,K,C_in,C_out,S,P,D,F};
-                    stride = 1, pad = 0, dilation = 1) where {T,N, xT,  M,K,C_in,C_out,S,P,D,F}
-  
-  op = conv2d(x, w, b, stride = collect(S), padding = [P[1];P[3]], dilation = collect(dilation))
+function NNlib.conv(x::Tensor{xT, N}, w::Tensor, b::Tensor{T},  # convolution with an explicit bias tensor
+                    cdims::DenseConvDims{M,K,C_in,C_out,S,P,D,F}) where {T,N,xT,M,K,C_in,C_out,S,P,D,F}
+  op = conv2d(x, w, b, stride = collect(S), padding = [P[1];P[3]], dilation = collect(D))  # stride, padding and dilation all come from cdims' type parameters; only P[1] and P[3] are used, which presumably assumes symmetric 2-D padding (confirm)
   op
 end
 
-function NNlib.conv(x::Tensor, w::Tensor, cdims::DenseConvDims; stride = 1, pad = 0, dilation = 1)
+function NNlib.conv(x::Tensor, w::Tensor{T}, cdims::DenseConvDims) where {T}  # convolution without a bias term
-  b = zeros(Tensor{Float32}, size(w)[end], dev = on(w))
+  b = zeros(Tensor{T}, size(w)[end], dev = on(w))  # zero bias in w's element type (was hard-coded Float32), sized by w's last dimension, created with dev = on(w)
-  op = conv(x, w, b, cdims, stride = stride, pad = pad, dilation = dilation)
+  op = conv(x, w, b, cdims)  # delegate to the method that takes a bias
+  op
+end
+
+function NNlib.depthwiseconv(x::Tensor{xT, N}, w::Tensor, b::Tensor{T};  # depthwise convolution with an explicit bias tensor
+                             stride = 1, pad = 0, dilation = 1) where {T, N, xT}
+  op = _depthwise_conv2d(x, w, b, stride = collect(stride), padding = collect(pad),  # keyword values are materialised with collect before being handed on
+                         dilation = collect(dilation))
+  op
+end
+
+function NNlib.depthwiseconv(x::Tensor, w::Tensor{T}; stride = 1, pad = 0, dilation = 1) where {T}  # depthwise convolution without a bias term
+  b = zeros(Tensor{T}, size(w)[end], dev = on(w))  # zero bias in w's element type (was hard-coded Float32), sized by w's last dimension, created with dev = on(w)
+  op = depthwiseconv(x, w, b, stride = stride, pad = pad,  # forwarded as given; the biased method applies collect itself
+                     dilation = dilation)
   op
 end
 
@@ -38,8 +51,15 @@ function NNlib.sigmoid(t::Tensor{T,N}) where {T,N}
   Tensor{T,N}(ptr[], on(t))
 end
 
+function NNlib.tanh(t::Tensor{T,N}) where {T,N}  # tanh of a Tensor, computed by the atg_tanh binding
+  ptr = Ref(Ptr{Cvoid}())  # slot passed to atg_tanh and read back below as the result handle
+
+  atg_tanh(ptr, t.ptr)
+  Tensor{T,N}(ptr[], on(t))  # wrap the handle with the input's T and N, and on(t)
+end
+
 function NNlib.softmax(t::Tensor{T,N}; dims = 1) where {T,N}
-  _softmax(t, N - dims, options[T])
+  _softmax(t, dims, options[T])  # dims is forwarded unchanged; _softmax performs the N - dims conversion itself
 end
 
 function NNlib.∇softmax(Δ, xs::Tensor; dims = 1)
@@ -48,15 +68,20 @@ function NNlib.∇softmax(Δ, xs::Tensor; dims = 1)
   sf .* (t .- sum(t .* sf, dims = dims))
 end
 
-function NNlib.meanpool(t::Tensor, pdims::PoolDims{N,K,S,P,D}; kw...) where {N,K,S,P,D}
+function NNlib.meanpool(t::Tensor, pdims::PoolDims{N,K,S,P,D}) where {N,K,S,P,D}  # mean pooling configured entirely by pdims
   ks = collect(NNlib.kernel_size(pdims))
   stride = collect(S)
-  pad = [P[1];P[3]]
-  op_sz = NNlib.output_size(pdims)
+  padding = [P[1];P[3]]  # only P[1] and P[3] are used; presumably assumes symmetric 2-D padding (confirm)
+  # NNlib.output_size(pdims) is not computed here: the _meanpool call below takes no output size
 
-  _meanpool(t, ks, stride, pad, op_sz)
+  _meanpool(t, ks, stride=stride, padding=padding)
 end
 
 function NNlib.maxpool(t::Tensor, pdims::PoolDims{N,K,S,P,D}) where {N,K,S,P,D}
-  _maxpool(t, pdims)
+  ks = collect(NNlib.kernel_size(pdims))  # kernel size as a vector
+  stride = collect(S)  # stride taken from the PoolDims type parameter S
+  padding = [P[1];P[3]]  # only P[1] and P[3] are used; presumably assumes symmetric 2-D padding (confirm)
+  dilation = collect(D)  # dilation taken from the PoolDims type parameter D
+
+  _maxpool(t, ks, stride=stride, padding=padding, dilation=dilation)  # calls the kernel-size method of _maxpool rather than the PoolDims one
 end
diff --git a/src/ops.jl b/src/ops.jl
index 855b00a..09f71b4 100644
--- a/src/ops.jl
+++ b/src/ops.jl
@@ -82,39 +82,102 @@ end
 
 # TODO: Use a macro to generate wrappers
 function conv2d(input::Tensor{T}, filter::Tensor{T,N}, bias::Tensor{T};
-		stride = [1],
-		padding = [0],
-		dilation = [1],
-		groups = 1) where {T,N}
+                stride = [1],
+                padding = [0],
+                dilation = [1],
+                groups = 1) where {T,N}
 
   ptr = Ref(Ptr{Cvoid}())
 
   atg_conv2d(ptr, input.ptr, filter.ptr, bias.ptr,
-                stride, length(stride),
-                padding, length(padding),
-                dilation, length(dilation),
-                groups)
+             reverse(stride), length(stride),  # each per-dimension vector is reversed before the call,
+             reverse(padding), length(padding),  # presumably because the native library orders dimensions
+             reverse(dilation), length(dilation),  # opposite to Julia (confirm)
+             groups)
 
   Tensor{T,N}(ptr[], on(input))
 end
 
+function conv_transpose_2d(input::Tensor{T}, filter::Tensor{T,N}, bias::Tensor{T};  # wrapper around atg_conv_transpose2d
+        stride = [1],
+        padding = [0],
+        output_padding = [0],
+        dilation = [1],
+        groups = 1) where {T,N}
+
+  ptr = Ref(Ptr{Cvoid}())  # slot passed to the native call and read back below as the result handle
+
+  atg_conv_transpose2d(ptr, input.ptr, filter.ptr, bias.ptr,
+                       reverse(stride), length(stride),
+                       reverse(padding), length(padding),
+                       reverse(output_padding), length(output_padding),
+                       groups,  # note: groups is passed before dilation in this call
+                       reverse(dilation), length(dilation))
+
+  Tensor{T,N}(ptr[], on(input))  # result is typed with the filter's N and created with on(input)
+end
+
+function _depthwise_conv2d(input::Tensor{T}, filter::Tensor{T,N}, bias::Tensor{T};  # grouped atg_conv2d call with groups == input channels
+                           stride = [1],
+                           padding = [0],
+                           dilation = [1]) where {T,N}
+
+    # When groups == in_channels and out_channels == K * in_channels, where K is a positive integer,
+    # this operation is also termed in literature as depthwise convolution.
+
+    c_in = size(input)[end - 1]  # number of input channels
+    c_out = size(filter)[end]    # number of output channels
+    @assert mod(c_out, c_in) == 0 "Invalid kernel size for depthwise convolution"  # out_channels must be a whole multiple K of in_channels
+
+    groups = c_in  # one group per input channel
+    ptr = Ref(Ptr{Cvoid}())  # slot passed to atg_conv2d and read back below as the result handle
+
+    atg_conv2d(ptr, input.ptr, filter.ptr, bias.ptr,
+               reverse(stride), length(stride),
+               reverse(padding), length(padding),
+               reverse(dilation), length(dilation),
+               groups)
+
+    Tensor{T,N}(ptr[], on(input))
+end
+
 function _softmax(input::Tensor{T,N}, dims = 1, dtype = options[T]) where {T,N}
   ptr = Ref(Ptr{Cvoid}())
 
-  atg_softmax(ptr, input.ptr, N - dims - 1, dtype)
+  atg_softmax(ptr, input.ptr, N - dims, dtype)  # dims is converted to N - dims; presumably a 0-based index in the reversed native dimension order (confirm)
   Tensor{T,N}(ptr[], on(input))
 end
 
-function _meanpool(t::Tensor{T,N}, k, s, p, op_sz) where {T,N}
+function _meanpool(t::Tensor{T,N}, kernel_size; stride = [1] , padding = [0]) where {T,N}  # wrapper around atg_avg_pool2d
+  k = collect(kernel_size)  # collect lets callers pass tuples or vectors
+  s = collect(stride)
+  p = collect(padding)
   ptr = Ref(Ptr{Cvoid}())
 
   atg_avg_pool2d(ptr, t.ptr,
-                 k, length(k),
-                 s, length(s),
-                 p, length(p),
-                 0,                # ceil_mode
-                 1,                # count_include_pad
-                 1                 # divisor_override
+                 reverse(k), length(k),
+                 reverse(s), length(s),
+                 reverse(p), length(p),
+                 0,  # ceil_mode
+                 1,  # count_include_pad
+                 prod(k)  # divisor_override: the number of elements in one kernel window
+  )
+  Tensor{T,N}(ptr[], on(t))
+end
+
+function _maxpool(t::Tensor{T,N}, kernel_size; stride = [1], padding = [0], dilation = [1]) where {T,N}  # wrapper around atg_max_pool2d taking explicit sizes
+  k = collect(kernel_size)  # collect lets callers pass tuples or vectors
+  s = collect(stride)
+  p = collect(padding)
+  d = collect(dilation)
+  ptr = Ref(Ptr{Cvoid}())  # slot passed to atg_max_pool2d and read back below as the result handle
+
+  atg_max_pool2d(ptr, t.ptr,
+                 reverse(k), length(k),
+                 reverse(s), length(s),
+                 reverse(p), length(p),
+                 reverse(d), length(d),
+                 0,  # ceil_mode
   )
   Tensor{T,N}(ptr[], on(t))
 end
@@ -129,10 +192,10 @@ function _maxpool(t::Tensor{T,M}, pdims::PoolDims{N,K,S,P,D};
   ptr = Ref(Ptr{Cvoid}())
 
   atg_max_pool2d(ptr, t.ptr,
-                 k, length(k),
-                 s, length(s),
-                 p, length(p),
-                 d, length(d),
+                 reverse(k), length(k),
+                 reverse(s), length(s),
+                 reverse(p), length(p),
+                 reverse(d), length(d),
                  ceil_mode,                # ceil_mode
   )
 
@@ -149,16 +212,67 @@ function _maxpool_with_inds(t::Tensor{T,M}, pdims::PoolDims{N,K,S,P,D};
   ptr = [Ptr{Cvoid}(), Ptr{Cvoid}()]
 
   atg_max_pool2d_with_indices(ptr, t.ptr,
-                 k, length(k),
-                 s, length(s),
-                 p, length(p),
-                 d, length(d),
-                 ceil_mode,                # ceil_mode
+                              reverse(k), length(k),
+                              reverse(s), length(s),
+                              reverse(p), length(p),
+                              reverse(d), length(d),
+                              ceil_mode,
   )
 
   Tensor{T,M}(ptr[1], on(t)), Tensor{T,M}(ptr[2], on(t))
 end
 
+function _upsample_nearest2d(t::Tensor{T,N}, output_size) where {T,N}  # wrapper around atg_upsample_nearest2d
+  ptr = Ref(Ptr{Cvoid}())  # slot passed to the native call and read back below as the result handle
+
+  atg_upsample_nearest2d(ptr, t.ptr,
+                         reverse(output_size), length(output_size),  # output_size is reversed before the call
+  )
+  Tensor{T,N}(ptr[], on(t))
+end
+
+function _upsample_bilinear2d(t::Tensor{T,N}, output_size, align_corners = true) where {T,N}  # wrapper around atg_upsample_bilinear2d
+  ptr = Ref(Ptr{Cvoid}())  # slot passed to the native call and read back below as the result handle
+
+  atg_upsample_bilinear2d(ptr, t.ptr,
+                         reverse(output_size), length(output_size),  # output_size is reversed before the call
+                         align_corners,  # forwarded as given; defaults to true
+  )
+  Tensor{T,N}(ptr[], on(t))
+end
+
+function _upsample_bicubic2d(t::Tensor{T,N}, output_size, align_corners = true) where {T,N}  # wrapper around atg_upsample_bicubic2d
+  ptr = Ref(Ptr{Cvoid}())  # slot passed to the native call and read back below as the result handle
+
+  atg_upsample_bicubic2d(ptr, t.ptr,
+                         reverse(output_size), length(output_size),  # output_size is reversed before the call
+                         align_corners,  # forwarded as given; defaults to true
+  )
+  Tensor{T,N}(ptr[], on(t))
+end
+
+function upsample(t::Tensor{T,N}, output_size, mode) where {T,N}  # dispatch on mode (:NEAREST, :LINEAR, :CUBIC) to the matching 2-D helper
+    if mode == :NEAREST
+        _upsample_nearest2d(t, output_size)
+    elseif mode == :LINEAR
+        _upsample_bilinear2d(t, output_size)  # align_corners is left at the helper's default
+    elseif mode == :CUBIC
+        _upsample_bicubic2d(t, output_size)  # align_corners is left at the helper's default
+    else
+       error("Unsupported mode $(mode).")  # any other mode raises an error
+    end
+end
+
+function pad(t::Tensor{T,N}, padding) where {T,N}  # wrapper around atg_constant_pad_nd
+  ptr = Ref(Ptr{Cvoid}())  # slot passed to the native call and read back below as the result handle
+  p = collect(padding)  # collect lets callers pass tuples or vectors
+
+  atg_constant_pad_nd(ptr, t.ptr,
+                      p, length(p),  # NOTE(review): unlike the conv/pool wrappers, p is passed without reverse — confirm this ordering is intended
+  )
+  Tensor{T,N}(ptr[], on(t))
+end
+
 function _chunk(t::Tensor{T,N}, chunks=2, dims=1) where {T,N}
   ts = [Ptr{Cvoid}() for _ in 1:chunks]
   atg_chunk(ts, t.ptr, chunks, N - dims)
diff --git a/test/test_nnlib.jl b/test/test_nnlib.jl
new file mode 100644
index 0000000..db9eeff
--- /dev/null
+++ b/test/test_nnlib.jl
@@ -0,0 +1,203 @@
+using Test
+using NNlib
+using Torch: tensor
+
+
+@testset "DepthwiseConv" begin  # compares depthwiseconv on Tensors with the NNlib Array result
+    for kernel_width in [1, 3, 5],
+        kernel_height in [1, 2, 4],
+        in_channels in [1, 2],
+        out_channels in [1, 2]  # NOTE(review): out_channels is never used in the body, so every case runs twice
+
+        kernel = rand(-9.0f0:9.0f0, kernel_height, kernel_width, 1, in_channels)  # third dimension fixed at 1
+
+        for height in [5, 6],
+            width in [5, 7]
+
+            test_input = rand(-9.0f0:9.0f0, height, width, in_channels, 1)
+            x = tensor(test_input, dev = 0)
+            w = tensor(kernel, dev = 0)
+
+            expected_output = NNlib.depthwiseconv(test_input, kernel, pad = (0,0), stride = (1,1 ), dilation = (1, 1), flipped = true)  # reference result on plain Arrays
+            test_output = NNlib.depthwiseconv(x, w, pad = (0,0), stride = (1,1 ), dilation = (1, 1))  # same call on Tensors
+
+            test_output = Array(test_output)  # convert back to an Array for comparison
+            @test maximum(abs.(test_output - expected_output)) < 10 * eps(Float32)
+        end
+    end
+end
+
+
+@testset "Conv with padding" begin  # compares conv on Tensors with the NNlib Array result under non-zero padding
+    for kernel_width in [1, 2, 3, 5],
+        kernel_height in [1, 2, 3, 5],
+        in_channels in [1, 2],
+        out_channels in [1, 2]
+
+        num_coefficients = (kernel_width * kernel_height * in_channels * out_channels)
+        kernel = reshape(1.0f0:num_coefficients, kernel_height, kernel_width, in_channels, out_channels)  # distinct value per coefficient
+        kernel = collect(kernel)
+        pad = size(kernel)[1:2] .÷ 2  # half the kernel extent in each spatial dimension
+
+        for height in [1, 2, 3, 4],
+            width in [1, 2, 3, 5]
+
+            test_input = zeros(Float32, height, width, in_channels, 1)
+            test_input[(height + 1) ÷ 2, (width + 1) ÷ 2, 1, 1] = 1  # single unit impulse near the centre of channel 1
+            x = tensor(test_input, dev = 0)
+            w = tensor(kernel, dev = 0)
+
+            cdims = NNlib.DenseConvDims(size(test_input),
+                                        size(kernel),
+                                        stride=(1, 1),
+                                        padding=pad,
+                                        dilation=(1, 1),
+                                        flipkernel = true)
+
+            expected_output = NNlib.conv(test_input, kernel, cdims)  # reference result on plain Arrays
+            test_output     = NNlib.conv(x,          w,      cdims)  # same call on Tensors
+
+            test_output = Array(test_output)  # convert back to an Array for comparison
+            @test maximum(abs.(test_output - expected_output)) < 10 * eps(Float32)
+        end
+    end
+end
+
+
+@testset "Conv with stride" begin  # compares conv on Tensors with the NNlib Array result under varying strides
+    for kernel_width in [1, 3, 4],
+        kernel_height in [1, 2, 5],
+        in_channels in [1],
+        out_channels in [1],
+        row_stride in [1, 2, 4],
+        column_stride in [1, 3, 5]
+
+        kernel = fill(1.0f0, kernel_height, kernel_width, in_channels, out_channels)  # all-ones kernel
+        kernel = collect(kernel)
+
+        for height in 13:(13 + row_stride - 1),  # one input size per residue modulo the stride
+            width in 15:(15 + column_stride - 1)
+
+            sz_in = [height, width, in_channels, 1]
+            test_input = reshape(1.0f0:prod(sz_in), height, width, in_channels, 1)  # distinct value per element
+            test_input = collect(test_input)
+            x = tensor(test_input, dev = 0)
+            w = tensor(kernel, dev = 0)
+
+            cdims = NNlib.DenseConvDims(size(test_input),
+                                        size(kernel),
+                                        stride=(row_stride, column_stride),
+                                        padding=(0, 0),
+                                        dilation=(1, 1),
+                                        flipkernel = true)
+
+            expected_output = NNlib.conv(test_input, kernel, cdims)  # reference result on plain Arrays
+            test_output     = NNlib.conv(x,          w,      cdims)  # same call on Tensors
+
+            test_output = Array(test_output)  # convert back to an Array for comparison
+            @test maximum(abs.(test_output - expected_output)) < 10 * eps(Float32)
+        end
+    end
+end
+
+
+@testset "Conv with dilation" begin  # compares conv on Tensors with the NNlib Array result under varying dilation rates
+    for kernel_width in 1,
+        kernel_height in 1:9,
+        in_channels in 1,
+        out_channels in 1,
+        row_stride in 1:11,
+        column_stride in 1,
+        row_rate in 1:4,
+        column_rate in 1
+
+        if kernel_height * row_rate > 13  # skip kernel/rate combinations above the smallest input height (13)
+            continue
+        end
+
+        kernel = fill(1.0f0, kernel_height, kernel_width, in_channels, out_channels)  # all-ones kernel
+        kernel = collect(kernel)
+
+        for height in 13:(13 + row_stride - 1),  # one input size per residue modulo the stride
+            width in [1]
+
+            sz_in = [height, width, in_channels, 1]
+            test_input = reshape(1.0f0:prod(sz_in), height, width, in_channels, 1)  # distinct value per element
+            test_input = collect(test_input)
+            x = tensor(test_input, dev = 0)
+            w = tensor(kernel, dev = 0)
+
+            cdims = NNlib.DenseConvDims(size(test_input),
+                                        size(kernel),
+                                        stride=(row_stride, column_stride),
+                                        padding=(0, 0),
+                                        dilation=(row_rate, column_rate),  # use the looped rates; a fixed (1, 1) never exercised dilation
+                                        flipkernel = true)
+
+            expected_output = NNlib.conv(test_input, kernel, cdims)  # reference result on plain Arrays
+            test_output     = NNlib.conv(x,          w,      cdims)  # same call on Tensors
+
+            test_output = Array(test_output)  # convert back to an Array for comparison
+            @test maximum(abs.(test_output - expected_output)) < 10 * eps(Float32)
+        end
+    end
+end
+
+
+@testset "Pooling" begin  # compares maxpool and meanpool on Tensors with the NNlib Array results
+    for fn in (NNlib.maxpool, NNlib.meanpool),
+        column_span in 1:3,
+        row_span in 1:3,
+        column_stride in 1:3,
+        row_stride in 1:3,
+        pad in (false, true)
+
+        if pad
+            padding = (row_span, column_span) .÷ 2  # half the window extent in each dimension
+        else
+            padding = (0, 0)
+        end
+
+        for height in (1:2) * row_span * row_stride,  # input sizes are 1x and 2x span * stride
+            width in (1:2) * column_span * column_stride,
+            channels in 1:2
+
+            test_input = rand(0.0f0:9.0f0, height, width, channels, 1)
+            x = tensor(test_input, dev = 0)
+
+            pdims = NNlib.PoolDims(size(test_input),
+                                   (row_span, column_span),
+                                   padding=padding,
+                                   stride=(row_stride, column_stride))
+
+            expected_output = fn(test_input, pdims)  # reference result on plain Arrays
+            test_output     = fn(x,          pdims)  # same call on Tensors
+
+            test_output = Array(test_output)  # convert back to an Array for comparison
+            @test maximum(abs.(test_output - expected_output)) < 10 * eps(Float32)
+        end
+    end
+end
+
+
+@testset "Activations" begin  # compares activation functions on Tensors with their Array results
+    for fn in (NNlib.relu, NNlib.tanh, NNlib.sigmoid, NNlib.leakyrelu, NNlib.softmax),
+        height in [1, 2, 3, 4, 7],
+        width in [1, 2, 3, 5, 6],
+        channels in 1:3
+
+        test_input = rand(-9.0f0:9.0f0, height, width, channels, 1)
+        x = tensor(test_input, dev = 0)
+
+        if fn == NNlib.softmax
+            expected_output = fn(test_input, dims = 3)  # softmax is called on the whole array along dimension 3
+            test_output     = fn(x, dims = 3)
+        else
+            expected_output = fn.(test_input)  # reference is broadcast element-wise over the Array
+            test_output     = fn(x)  # the Tensor is passed whole, without broadcasting
+        end
+
+        test_output = Array(test_output)  # convert back to an Array for comparison
+        @test maximum(abs.(test_output - expected_output)) < 10 * eps(Float32)
+    end
+end