diff --git a/docs/api/python/gluon/contrib.md b/docs/api/python/gluon/contrib.md
index 790f6b496516..a940f697de69 100644
--- a/docs/api/python/gluon/contrib.md
+++ b/docs/api/python/gluon/contrib.md
@@ -59,6 +59,17 @@ In the rest of this document, we list routines provided by the `gluon.contrib` p
     PixelShuffle3D
 ```
 
+### Convolutional neural network
+
+```eval_rst
+.. currentmodule:: mxnet.gluon.contrib.cnn
+
+.. autosummary::
+    :nosignatures:
+    
+    DeformableConvolution
+```
+
 ### Recurrent neural network
 
 ```eval_rst
@@ -116,6 +127,10 @@ In the rest of this document, we list routines provided by the `gluon.contrib` p
 .. automodule:: mxnet.gluon.contrib.nn
     :members:
     :imported-members:
+    
+.. automodule:: mxnet.gluon.contrib.cnn
+    :members:
+    :imported-members:
 
 .. automodule:: mxnet.gluon.contrib.rnn
     :members:
diff --git a/python/mxnet/gluon/contrib/__init__.py b/python/mxnet/gluon/contrib/__init__.py
index f708fb900227..83be8a39ba32 100644
--- a/python/mxnet/gluon/contrib/__init__.py
+++ b/python/mxnet/gluon/contrib/__init__.py
@@ -22,4 +22,6 @@
 
 from . import rnn
 
+from . import cnn
+
 from . import data
diff --git a/python/mxnet/gluon/contrib/cnn/__init__.py b/python/mxnet/gluon/contrib/cnn/__init__.py
new file mode 100644
index 000000000000..501b9ea829b8
--- /dev/null
+++ b/python/mxnet/gluon/contrib/cnn/__init__.py
@@ -0,0 +1,25 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# coding: utf-8
+# pylint: disable=wildcard-import
+"""Contrib convolutional neural network module."""
+
+from . import conv_layers
+from .conv_layers import *
+
+__all__ = conv_layers.__all__  # the package's public names are exactly those of conv_layers
diff --git a/python/mxnet/gluon/contrib/cnn/conv_layers.py b/python/mxnet/gluon/contrib/cnn/conv_layers.py
new file mode 100644
index 000000000000..9dd208702932
--- /dev/null
+++ b/python/mxnet/gluon/contrib/cnn/conv_layers.py
@@ -0,0 +1,221 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# coding: utf-8
+# pylint: disable= arguments-differ
+"""Custom convolutional neural network layers in Gluon contrib."""
+
+__all__ = ['DeformableConvolution']
+
+from .... import symbol
+from ...block import HybridBlock
+from ....base import numeric_types
+from ...nn import Activation
+
+class DeformableConvolution(HybridBlock):
+    """2-D Deformable Convolution v_1 (Dai, 2017).
+    Normal Convolution uses sampling points in a regular grid, while the sampling
+    points of Deformable Convolution can be offset. The offset is learned with a
+    separate convolution layer during the training. Both the convolution layer for
+    generating the output features and the offsets are included in this gluon layer.
+
+    Parameters
+    ----------
+    channels : int,
+        The dimensionality of the output space
+        i.e. the number of output channels in the convolution.
+    kernel_size : int or tuple/list of 2 ints, (Default value = (1,1))
+        Specifies the dimensions of the convolution window.
+    strides : int or tuple/list of 2 ints, (Default value = (1,1))
+        Specifies the strides of the convolution.
+    padding : int or tuple/list of 2 ints, (Default value = (0,0))
+        If padding is non-zero, then the input is implicitly zero-padded
+        on both sides for padding number of points.
+    dilation : int or tuple/list of 2 ints, (Default value = (1,1))
+        Specifies the dilation rate to use for dilated convolution.
+    groups : int, (Default value = 1)
+        Controls the connections between inputs and outputs.
+        At groups=1, all inputs are convolved to all outputs.
+        At groups=2, the operation becomes equivalent to having two convolution
+        layers side by side, each seeing half the input channels, and producing
+        half the output channels, and both subsequently concatenated.
+    num_deformable_group : int, (Default value = 1)
+        Number of deformable group partitions.
+    layout : str, (Default value = NCHW)
+        Dimension ordering of data and weight. Only 'NCHW' and 'NHWC' are
+        accepted for now. 'N', 'C', 'H', 'W' stands for batch, channel,
+        height and width dimensions respectively.
+        Convolution is performed over 'H' and 'W' dimensions.
+    use_bias : bool, (Default value = True)
+        Whether the layer for generating the output features uses a bias vector.
+    in_channels : int, (Default value = 0)
+        The number of input channels to this layer. If not specified,
+        initialization will be deferred to the first time `forward` is called
+        and input channels will be inferred from the shape of input data.
+    activation : str, (Default value = None)
+        Activation function to use. See :func:`~mxnet.ndarray.Activation`.
+        If you don't specify anything, no activation is applied
+        (i.e. "linear" activation: `a(x) = x`).
+    weight_initializer : str or `Initializer`, (Default value = None)
+        Initializer for the `weight` weights matrix for the convolution layer
+        for generating the output features.
+    bias_initializer : str or `Initializer`, (Default value = zeros)
+        Initializer for the bias vector for the convolution layer
+        for generating the output features.
+    offset_weight_initializer : str or `Initializer`, (Default value = zeros)
+        Initializer for the `weight` weights matrix for the convolution layer
+        for generating the offset.
+    offset_bias_initializer : str or `Initializer`, (Default value = zeros)
+        Initializer for the bias vector for the convolution layer
+        for generating the offset.
+    offset_use_bias : bool, (Default value = True)
+        Whether the layer for generating the offset uses a bias vector.
+
+    Inputs:
+        - **data**: 4D input tensor with shape
+          `(batch_size, in_channels, height, width)` when `layout` is `NCHW`.
+          For other layouts shape is permuted accordingly.
+
+    Outputs:
+        - **out**: 4D output tensor with shape
+          `(batch_size, channels, out_height, out_width)` when `layout` is `NCHW`.
+          out_height and out_width are calculated as::
+
+              out_height = floor((height+2*padding[0]-dilation[0]*(kernel_size[0]-1)-1)/stride[0])+1
+              out_width = floor((width+2*padding[1]-dilation[1]*(kernel_size[1]-1)-1)/stride[1])+1
+    """
+
+    def __init__(self, channels, kernel_size=(1, 1), strides=(1, 1), padding=(0, 0), dilation=(1, 1), groups=1,
+                 num_deformable_group=1, layout='NCHW', use_bias=True, in_channels=0, activation=None,
+                 weight_initializer=None, bias_initializer='zeros',
+                 offset_weight_initializer='zeros', offset_bias_initializer='zeros', offset_use_bias=True,
+                 op_name='DeformableConvolution', adj=None, prefix=None, params=None):
+        super(DeformableConvolution, self).__init__(prefix=prefix, params=params)
+        with self.name_scope():
+            self._channels = channels
+            self._in_channels = in_channels
+
+            assert layout in ('NCHW', 'NHWC'), "Only supports 'NCHW' and 'NHWC' layout for now"
+            if isinstance(kernel_size, numeric_types):  # a scalar is expanded to the same size on both axes
+                kernel_size = (kernel_size,) * 2
+            if isinstance(strides, numeric_types):
+                strides = (strides,) * len(kernel_size)
+            if isinstance(padding, numeric_types):
+                padding = (padding,) * len(kernel_size)
+            if isinstance(dilation, numeric_types):
+                dilation = (dilation,) * len(kernel_size)
+            self._op_name = op_name  # stored only; not read anywhere else in this class
+
+            offset_channels = 2 * kernel_size[0] * kernel_size[1] * num_deformable_group
+            self._kwargs_offset = {  # arguments of the plain Convolution that produces the offsets
+                'kernel': kernel_size, 'stride': strides, 'dilate': dilation,
+                'pad': padding, 'num_filter': offset_channels, 'num_group': groups,
+                'no_bias': not offset_use_bias, 'layout': layout}
+
+            self._kwargs_deformable_conv = {  # arguments of contrib.DeformableConvolution
+                'kernel': kernel_size, 'stride': strides, 'dilate': dilation,
+                'pad': padding, 'num_filter': channels, 'num_group': groups,
+                'num_deformable_group': num_deformable_group,
+                'no_bias': not use_bias, 'layout': layout}
+
+            if adj:  # optional extra operator argument, copied into both kwargs dicts when truthy
+                self._kwargs_offset['adj'] = adj
+                self._kwargs_deformable_conv['adj'] = adj
+
+            dshape = [0] * (len(kernel_size) + 2)  # placeholder input shape; only N and C are filled in
+            dshape[layout.find('N')] = 1
+            dshape[layout.find('C')] = in_channels
+
+            op = getattr(symbol, 'Convolution')
+            offset = op(symbol.var('data', shape=dshape), **self._kwargs_offset)
+
+            offsetshapes = offset.infer_shape_partial()[0]  # entries [1] and [2] size the offset weight/bias
+
+            self.offset_weight = self.params.get('offset_weight', shape=offsetshapes[1],
+                                                 init=offset_weight_initializer,
+                                                 allow_deferred_init=True)
+
+            if offset_use_bias:
+                self.offset_bias = self.params.get('offset_bias', shape=offsetshapes[2],
+                                                   init=offset_bias_initializer,
+                                                   allow_deferred_init=True)
+            else:
+                self.offset_bias = None
+
+            deformable_conv_weight_shape = [0] * (len(kernel_size) + 2)  # axis 1 is left as 0 here
+            deformable_conv_weight_shape[0] = channels
+            deformable_conv_weight_shape[2] = kernel_size[0]  # NOTE(review): axes 2/3 ignore `layout` - confirm for 'NHWC'
+            deformable_conv_weight_shape[3] = kernel_size[1]
+
+            self.deformable_conv_weight = self.params.get('deformable_conv_weight',
+                                                          shape=deformable_conv_weight_shape,
+                                                          init=weight_initializer,
+                                                          allow_deferred_init=True)
+
+            if use_bias:
+                self.deformable_conv_bias = self.params.get('deformable_conv_bias', shape=(channels,),
+                                                            init=bias_initializer,
+                                                            allow_deferred_init=True)
+            else:
+                self.deformable_conv_bias = None
+
+            if activation:
+                self.act = Activation(activation, prefix=activation + '_')
+            else:
+                self.act = None
+
+    def hybrid_forward(self, F, x, offset_weight, deformable_conv_weight, offset_bias=None, deformable_conv_bias=None):
+        if offset_bias is None:  # no bias supplied for the offset convolution
+            offset = F.Convolution(x, offset_weight, cudnn_off=True, **self._kwargs_offset)
+        else:
+            offset = F.Convolution(x, offset_weight, offset_bias, cudnn_off=True, **self._kwargs_offset)
+
+        if deformable_conv_bias is None:  # no bias supplied for the deformable convolution
+            act = F.contrib.DeformableConvolution(data=x, offset=offset, weight=deformable_conv_weight,
+                                                  name='fwd', **self._kwargs_deformable_conv)
+        else:
+            act = F.contrib.DeformableConvolution(data=x, offset=offset, weight=deformable_conv_weight,
+                                                  bias=deformable_conv_bias, name='fwd',
+                                                  **self._kwargs_deformable_conv)
+
+        if self.act:
+            act = self.act(act)
+        return act
+
+    def _alias(self):
+        return 'deformable_conv'  # presumably the base of auto-generated name prefixes - confirm
+
+    def __repr__(self):
+        s = '{name}({mapping}, kernel_size={kernel}, stride={stride}'  # filled from the kwargs dict below
+        len_kernel_size = len(self._kwargs_deformable_conv['kernel'])
+        if self._kwargs_deformable_conv['pad'] != (0,) * len_kernel_size:
+            s += ', padding={pad}'
+        if self._kwargs_deformable_conv['dilate'] != (1,) * len_kernel_size:
+            s += ', dilation={dilate}'
+        if hasattr(self, 'out_pad') and self.out_pad != (0,) * len_kernel_size:  # out_pad is never set in this class
+            s += ', output_padding={out_pad}'.format(out_pad=self.out_pad)
+        if self._kwargs_deformable_conv['num_group'] != 1:
+            s += ', groups={num_group}'
+        if self.deformable_conv_bias is None:
+            s += ', bias=False'
+        if self.act:
+            s += ', {}'.format(self.act)
+        s += ')'
+        shape = self.deformable_conv_weight.shape
+        return s.format(name=self.__class__.__name__,
+                        mapping='{0} -> {1}'.format(shape[1] if shape[1] else None, shape[0]),
+                        **self._kwargs_deformable_conv)
diff --git a/tests/python/gpu/test_gluon_contrib_gpu.py b/tests/python/gpu/test_gluon_contrib_gpu.py
new file mode 100644
index 000000000000..1d19d850dd8e
--- /dev/null
+++ b/tests/python/gpu/test_gluon_contrib_gpu.py
@@ -0,0 +1,63 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Tests of the contrib APIs in Gluon only with gpu"""
+
+from __future__ import print_function
+import mxnet as mx
+from mxnet.gluon import nn
+from mxnet.gluon import contrib
+from mxnet.gluon.contrib.cnn import DeformableConvolution
+
+
+def test_DeformableConvolution():
+    """Forward/backward smoke test of stacked DeformableConvolution layers on GPU.
+
+    The stack covers several combinations of constructor arguments; when no GPU
+    context can be used, a notice is printed and the test returns early.
+    """
+    shared = {'kernel_size': (3, 2), 'strides': 1, 'padding': 0}
+    layer_specs = [
+        (10, dict(kernel_size=(3, 3), strides=1, padding=0)),
+        (10, dict(shared, activation='relu', offset_use_bias=False, use_bias=False)),
+        (10, dict(shared, activation='relu', offset_use_bias=False)),
+        (10, dict(shared, activation='relu', use_bias=False)),
+        (10, dict(shared, offset_use_bias=False, use_bias=False)),
+        (10, dict(shared, offset_use_bias=False)),
+        (12, dict(shared, use_bias=False)),
+        (12, dict(shared, use_bias=False, num_deformable_group=4)),
+    ]
+    net = nn.HybridSequential()
+    net.add(*[DeformableConvolution(out_channels, **layer_kwargs) for out_channels, layer_kwargs in layer_specs])
+
+    try:
+        ctx = mx.gpu()
+        _ = mx.nd.array([0], ctx=ctx)
+    except mx.base.MXNetError:
+        print("deformable_convolution only supports GPU")
+        return
+
+    net.initialize(force_reinit=True, ctx=ctx)
+    net.hybridize()
+    inputs = mx.nd.random.uniform(shape=(8, 5, 30, 31), ctx=ctx)
+    with mx.autograd.record():
+        outputs = net(inputs)
+        outputs.backward()
+
+
+if __name__ == '__main__':
+    import nose
+    nose.runmodule()  # run the tests of this module when the file is executed as a script