diff --git a/docs/static_site/src/pages/api/faq/env_var.md b/docs/static_site/src/pages/api/faq/env_var.md
index 364fd1d7de6a..55e5f38ffa59 100644
--- a/docs/static_site/src/pages/api/faq/env_var.md
+++ b/docs/static_site/src/pages/api/faq/env_var.md
@@ -59,9 +59,6 @@ $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
 * MXNET_CPU_PRIORITY_NTHREADS
   - Values: Int ```(default=4)```
   - The number of threads given to prioritized CPU jobs.
-* MXNET_CPU_NNPACK_NTHREADS
-  - Values: Int ```(default=4)```
-  - The number of threads used for NNPACK. The NNPACK package aims to provide high-performance implementations of some layers for multi-core CPUs. Check out [NNPACK]({{'/api/faq/nnpack'|relative_url}}) to learn more about it.
 * MXNET_MP_WORKER_NTHREADS
   - Values: Int ```(default=1)```
   - The number of scheduling threads on CPU given to multiprocess workers. Enlarging this number allows more operators to run in parallel in individual workers, but please consider reducing the overall `num_workers` to avoid thread contention (not available on Windows).
diff --git a/docs/static_site/src/pages/api/faq/nnpack.md b/docs/static_site/src/pages/api/faq/nnpack.md
deleted file mode 100644
index 84bedee6052c..000000000000
--- a/docs/static_site/src/pages/api/faq/nnpack.md
+++ /dev/null
@@ -1,162 +0,0 @@
----
-layout: page_category
-title: NNPACK for Multi-Core CPU Support in MXNet
-category: faq
-faq_c: Speed
-question: Can I use nnpack to improve the CPU performance of MXNet?
-permalink: /api/faq/nnpack
----
-
-<!--- Licensed to the Apache Software Foundation (ASF) under one
-or more contributor license agreements.  See the NOTICE file
-distributed with this work for additional information
-regarding copyright ownership.  The ASF licenses this file
-to you under the Apache License, Version 2.0 (the
-"License"); you may not use this file except in compliance
-with the License.  You may obtain a copy of the License at
-
-  http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing,
-software distributed under the License is distributed on an
-"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-KIND, either express or implied.  See the License for the
-specific language governing permissions and limitations
-under the License. -->
-
-### NNPACK for Multi-Core CPU Support in MXNet
-[NNPACK](https://github.com/Maratyszcza/NNPACK) is an acceleration package
-for neural network computations, which can run on x86-64, ARMv7, or ARM64 architecture CPUs.
-Using NNPACK, higher-level libraries like _MXNet_ can speed up
-the execution on multi-core CPU computers, including laptops and mobile devices.
-
-_MXNet_ supports NNPACK for forward propagation (inference only) in convolution, max-pooling, and fully-connected layers.
-In this document, we give a high-level overview of how to use NNPACK with _MXNet_.
-
-
-### Conditions
-The underlying implementation of NNPACK utilizes several acceleration methods,
-including [FFT](https://arxiv.org/abs/1312.5851) and [Winograd](https://arxiv.org/abs/1509.09308).
-These algorithms work better with some particular `batch size`, `kernel size`, and `stride` settings than with others,
-so depending on the context, not all convolution, max-pooling, or fully-connected layers can be powered by NNPACK.
-When the conditions for running NNPACK are not met,
-_MXNet_ falls back to the default implementation automatically.
-
-NNPACK only supports Linux and OS X systems. Windows is not supported at present.
-The following table explains under which conditions NNPACK will work.
-
-| operation | conditions |
-|:--------- |:---------- |
-|convolution |2d convolution `and` no-bias=False `and` dilate=(1,1) `and` num_group=1 `and` (batch-size = 1 `or` batch-size > 1 && stride = (1,1))|
-|pooling | max-pooling `and` kernel=(2,2) `and` stride=(2,2) `and` pooling_convention=full |
-|fully-connected| without any restrictions |
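The convolution row is the intricate one. Read as code, it amounts to the following predicate — a standalone sketch with hypothetical names; the operators removed later in this diff perform the equivalent checks inline before dispatching to NNPACK:

```cpp
#include <cstddef>

// Hypothetical parameter bundle; fields mirror the table above.
struct Conv2DParams {
  bool is_2d;                  // 2d convolution
  bool no_bias;                // table requires no-bias=False, i.e. bias present
  size_t dilate_h, dilate_w;
  size_t num_group;
  size_t batch_size;
  size_t stride_h, stride_w;
};

// True when NNPACK's convolution path applies: 2-D, with bias, no dilation,
// a single group, and either batch 1 (any stride) or batch > 1 with stride 1x1.
bool NNPACKSupportsConvolution(const Conv2DParams& p) {
  if (!p.is_2d || p.no_bias) return false;
  if (p.dilate_h != 1 || p.dilate_w != 1) return false;
  if (p.num_group != 1) return false;
  return p.batch_size == 1 ||
         (p.batch_size > 1 && p.stride_h == 1 && p.stride_w == 1);
}
```

Max-pooling is the analogous check against kernel=(2,2), stride=(2,2), and pooling_convention=full; fully-connected layers need no check.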
-### Build/Install NNPACK with MXNet
-
-If your trained model meets the conditions above,
-you can build MXNet with NNPACK support.
-Follow these simple steps:
-* Build the NNPACK shared library with the following commands. _MXNet_ will link NNPACK dynamically.
-
-Note: The following NNPACK installation instructions have been tested on Ubuntu 14.04 and 16.04.
-
-```bash
-# Install pip
-$ sudo apt-get update
-$ sudo apt-get install -y python-pip
-$ sudo pip install --upgrade pip
-
-# Install PeachPy
-$ git clone https://github.com/Maratyszcza/PeachPy.git
-$ cd PeachPy
-$ sudo pip install --upgrade -r requirements.txt
-$ python setup.py generate
-$ sudo pip install --upgrade .
-
-# Install the Ninja build system
-$ sudo apt-get install ninja-build
-$ pip install ninja-syntax
-
-# Build the NNPACK shared library
-$ cd ~
-$ git clone --recursive https://github.com/Maratyszcza/NNPACK.git
-$ cd NNPACK
-# The latest NNPACK does not support building as a shared library using the --enable-shared flag.
-# Reset to a commit that supports it.
-$ git reset --hard 9c6747d7b80051b40e6f92d6828e2ed997529cd2
-$ git submodule init && git submodule update --recursive
-$ python ./configure.py --enable-shared
-$ ninja
-$ cd ~
-```
-
-* Set the NNPACK library path in the environment, e.g. `export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$YOUR_NNPACK_INSTALL_PATH/lib`.
-* Add the include directories of NNPACK and its third-party pthreadpool to `ADD_CFLAGS` in config.mk, e.g. `ADD_CFLAGS = -I$(YOUR_NNPACK_INSTALL_PATH)/include/ -I$(YOUR_NNPACK_INSTALL_PATH)/third-party/pthreadpool/include/`.
-* Set `USE_NNPACK = 1` in config.mk.
-* Build MXNet from source following the [install guide]({{'/get_started'|relative_url}}).
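Before rebuilding all of MXNet, it can be worth confirming that the freshly built library links and runs on your CPU. A minimal standalone check — not part of the original guide, and the compile flags below are an assumption based on the install paths above:

```cpp
// check_nnpack.cc -- compile with something like:
//   g++ check_nnpack.cc -I$YOUR_NNPACK_INSTALL_PATH/include \
//       -L$YOUR_NNPACK_INSTALL_PATH/lib -lnnpack -o check_nnpack
#include <cstdio>
#include <nnpack.h>

int main() {
  nnp_status status = nnp_initialize();
  if (status != nnp_status_success) {
    // nnp_status_unsupported_hardware typically means the CPU lacks the
    // SIMD extensions NNPACK requires.
    std::printf("nnp_initialize failed, status=%d\n", static_cast<int>(status));
    return 1;
  }
  std::printf("NNPACK initialized successfully\n");
  nnp_deinitialize();
  return 0;
}
```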
-### NNPACK Performance
-
-Though not all convolutional, pooling, and fully-connected layers can make full use of NNPACK,
-for some popular models it provides significant speedups. These include the most popular image recognition networks: AlexNet, VGG, and Inception-BN.
-
-To benchmark NNPACK, we use `example/image-classification/benchmark_score.py` (modified to cover a wider range of batch sizes). We use an Intel Xeon E5-2670 CPU and MXNET_CPU_NNPACK_NTHREADS=4.
-
-Building MXNet without NNPACK, the log is:
-```
-INFO:root:network: alexnet
-INFO:root:device: cpu(0)
-INFO:root:batch size  1, image/sec: 6.389429
-INFO:root:batch size  2, image/sec: 7.961457
-INFO:root:batch size  4, image/sec: 8.950112
-INFO:root:batch size  8, image/sec: 9.578176
-INFO:root:batch size 16, image/sec: 9.701248
-INFO:root:batch size 32, image/sec: 9.839940
-INFO:root:batch size 64, image/sec: 10.075369
-INFO:root:batch size 128, image/sec: 10.053556
-INFO:root:batch size 256, image/sec: 9.972228
-INFO:root:network: vgg
-INFO:root:device: cpu(0)
-INFO:root:batch size  1, image/sec: 1.223822
-INFO:root:batch size  2, image/sec: 1.322814
-INFO:root:batch size  4, image/sec: 1.383586
-INFO:root:batch size  8, image/sec: 1.402376
-INFO:root:batch size 16, image/sec: 1.415972
-INFO:root:batch size 32, image/sec: 1.428377
-INFO:root:batch size 64, image/sec: 1.443987
-INFO:root:batch size 128, image/sec: 1.427531
-INFO:root:batch size 256, image/sec: 1.435279
-```
-
-Building MXNet with NNPACK, the log is:
-
-```
-INFO:root:network: alexnet
-INFO:root:device: cpu(0)
-INFO:root:batch size  1, image/sec: 19.027215
-INFO:root:batch size  2, image/sec: 12.879975
-INFO:root:batch size  4, image/sec: 17.424076
-INFO:root:batch size  8, image/sec: 21.283966
-INFO:root:batch size 16, image/sec: 24.469325
-INFO:root:batch size 32, image/sec: 25.910348
-INFO:root:batch size 64, image/sec: 27.441672
-INFO:root:batch size 128, image/sec: 28.009156
-INFO:root:batch size 256, image/sec: 28.918950
-INFO:root:network: vgg
-INFO:root:device: cpu(0)
-INFO:root:batch size  1, image/sec: 3.980907
-INFO:root:batch size  2, image/sec: 2.392069
-INFO:root:batch size  4, image/sec: 3.610553
-INFO:root:batch size  8, image/sec: 4.994450
-INFO:root:batch size 16, image/sec: 6.396612
-INFO:root:batch size 32, image/sec: 7.614288
-INFO:root:batch size 64, image/sec: 8.826084
-INFO:root:batch size 128, image/sec: 9.193653
-INFO:root:batch size 256, image/sec: 9.991472
-```
-
-The results show that NNPACK can deliver a speedup of about 2X~7X compared to the default _MXNet_ CPU implementation.
-
-### Tips
-
-NNPACK aims to provide high-performance implementations of some layers for multi-core CPUs. You can set the number of threads through the environment variable `MXNET_CPU_NNPACK_NTHREADS`. However, we found that performance is not proportional to the number of threads, and we suggest using 4~8 threads when using NNPACK.
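The thread count is read once, during static initialization of the NNPACK thread pool (see `nnpack_util.h` further down in this diff), so `MXNET_CPU_NNPACK_NTHREADS` must be set before the process starts. A condensed sketch of that mechanism:

```cpp
#include <dmlc/parameter.h>  // dmlc::GetEnv
#include <pthreadpool.h>     // bundled with NNPACK

// Condensed from the NNPACKInitialize class removed below: read the env var
// once (defaulting to 4) and build the shared thread pool all NNPACK calls use.
pthreadpool_t CreateNNPACKThreadpool() {
  int num_threads = dmlc::GetEnv("MXNET_CPU_NNPACK_NTHREADS", 4);
  return pthreadpool_create(num_threads);
}
```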
diff --git a/src/operator/convolution_v1.cc b/src/operator/convolution_v1.cc
index 723dc867f52f..5d1ce3108a3f 100644
--- a/src/operator/convolution_v1.cc
+++ b/src/operator/convolution_v1.cc
@@ -25,10 +25,6 @@
 */
 #include "./convolution_v1-inl.h"
-#if MXNET_USE_NNPACK == 1
-#include "./nnpack/nnpack_convolution-inl.h"
-#endif  // MXNET_USE_NNPACK
-
 namespace mxnet {
 namespace op {
 DMLC_REGISTER_PARAMETER(ConvolutionV1Param);
diff --git a/src/operator/nn/convolution.cc b/src/operator/nn/convolution.cc
index 3ebb67ad0aa0..05d4cb74318b 100644
--- a/src/operator/nn/convolution.cc
+++ b/src/operator/nn/convolution.cc
@@ -27,9 +27,6 @@
 #include "./convolution-inl.h"
 #include "../elemwise_op_common.h"
 #include "../operator_common.h"
-#if MXNET_USE_NNPACK == 1
-#include "../nnpack/nnpack_pooling-inl.h"
-#endif  // MXNET_USE_NNPACK
 #if MXNET_USE_MKLDNN == 1
 #include "./mkldnn/mkldnn_base-inl.h"
 #include "./mkldnn/mkldnn_ops-inl.h"
diff --git a/src/operator/nn/fully_connected.cc b/src/operator/nn/fully_connected.cc
index 1322c86f9e47..7b243f1b2eb2 100644
--- a/src/operator/nn/fully_connected.cc
+++ b/src/operator/nn/fully_connected.cc
@@ -25,9 +25,6 @@
 #include "./fully_connected-inl.h"
 #include "./mkldnn/mkldnn_ops-inl.h"
 #include "./mkldnn/mkldnn_base-inl.h"
-#if MXNET_USE_NNPACK == 1
-#include "../nnpack/nnpack_fully_connected-inl.h"
-#endif  // MXNET_USE_NNPACK
 namespace mxnet {
 namespace op {
diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc
index 56edf74ee67a..4c66f2c90eec 100644
--- a/src/operator/nn/pooling.cc
+++ b/src/operator/nn/pooling.cc
@@ -25,9 +25,6 @@
 */
 #include "../elemwise_op_common.h"
 #include "./pooling-inl.h"
-#if MXNET_USE_NNPACK == 1
-#include "../nnpack/nnpack_pooling-inl.h"
-#endif  // MXNET_USE_NNPACK
 #if MXNET_USE_MKLDNN == 1
 #include "./mkldnn/mkldnn_pooling-inl.h"
 #include "./mkldnn/mkldnn_base-inl.h"
diff --git a/src/operator/nnpack/nnpack_convolution-inl.h b/src/operator/nnpack/nnpack_convolution-inl.h
deleted file mode 100644
index 0e2c73693d15..000000000000
--- a/src/operator/nnpack/nnpack_convolution-inl.h
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * Copyright (c) 2016 by Contributors
- * \file nnpack_convolution-inl.h
- * \brief
- * \author Carwin
-*/
-#ifndef MXNET_OPERATOR_NNPACK_NNPACK_CONVOLUTION_INL_H_
-#define MXNET_OPERATOR_NNPACK_NNPACK_CONVOLUTION_INL_H_
-
-#include <dmlc/logging.h>
-#include <dmlc/parameter.h>
-#include <mxnet/operator.h>
-#include <algorithm>
-#include <map>
-#include <vector>
-#include <string>
-#include <utility>
-#include "../convolution-inl.h"
-#include "nnpack.h"
-#include "nnpack_util.h"
-
-namespace mxnet {
-namespace op {
-
-template<typename DType>
-class NNPACKConvolutionOp : public ConvolutionOp<cpu, DType> {
- private:
-  ConvolutionParam param_;
-
- public:
-  explicit NNPACKConvolutionOp(ConvolutionParam p)
-      : ConvolutionOp<cpu, DType>(p) {
-    this->param_ = p;
-  }
-
- public:
-  virtual void Forward(const OpContext &ctx, const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    Stream<cpu> *s = ctx.get_stream<cpu>();
-    Tensor<cpu, 4, DType> data = in_data[conv::kData].get<cpu, 4, DType>(s);
-    const size_t batch_size = data.shape_[0];
-    const size_t input_c = data.shape_[1];
-    const size_t input_h = data.shape_[2];
-    const size_t input_w = data.shape_[3];
-    Shape<3> wmat_shape =
-        Shape3(param_.num_group, param_.num_filter / param_.num_group,
-               input_c / param_.num_group * param_.kernel[0] *
-                   param_.kernel[1]);
-    Tensor<cpu, 3, DType> wmat =
-        in_data[conv::kWeight].get_with_shape<cpu, 3, DType>(wmat_shape, s);
-    Tensor<cpu, 4, DType> out = out_data[conv::kOut].get<cpu, 4, DType>(s);
-    nnp_size input_size = {input_w, input_h};
-    nnp_padding input_padding = {param_.pad[0], param_.pad[1], param_.pad[0],
-                                 param_.pad[1]};
-    nnp_size kernel_size = {param_.kernel[1], param_.kernel[0]};
-    nnp_size output_subsampling = {param_.stride[1], param_.stride[0]};
-    Tensor<cpu, 1, DType> bias = in_data[conv::kBias].get<cpu, 1, DType>(s);
-
-    nnp_convolution_algorithm algorithm = nnp_convolution_algorithm_auto;
-    nnp_convolution_transform_strategy kts = nnp_convolution_transform_strategy_tuple_based;
-    nnp_status status = nnp_status_success;
-    if (batch_size == 1) {
-      status = nnp_convolution_inference(
-          algorithm,                    // enum nnp_convolution_algorithm,
-          kts,                          // enum nnp_convolution_transform_strategy,
-          input_c,                      // size_t input_channels,
-          param_.num_filter,            // size_t output_channels,
-          input_size,                   // struct nnp_size input_size,
-          input_padding,                // struct nnp_padding input_padding,
-          kernel_size,                  // struct nnp_size kernel_size,
-          output_subsampling,           // struct nnp_size output_subsampling,
-          data.dptr_,                   // const float input[],
-          wmat.dptr_,                   // const float kernel[],
-          bias.dptr_,                   // const float bias[],
-          out.dptr_,                    // float output[],
-          nnpackinitialize.threadpool,  // pthreadpool_t threadpool,
-          nullptr);
-    } else {
-      status = nnp_convolution_output(
-          algorithm,                    // enum nnp_convolution_algorithm algorithm,
-          batch_size,                   // size_t batch size of input tensor
-          input_c,                      // size_t input_channels,
-          param_.num_filter,            // size_t output_channels,
-          input_size,                   // struct nnp_size input_size,
-          input_padding,                // struct nnp_padding input_padding,
-          kernel_size,                  // struct nnp_size kernel_size,
-          data.dptr_,                   // const float input[],
-          wmat.dptr_,                   // const float kernel[],
-          bias.dptr_,                   // const float bias[],
-          out.dptr_,                    // float output[],
-          nnpackinitialize.threadpool,  // pthreadpool_t threadpool,
-          nullptr);
-    }
-    if (nnp_status_success != status) {
-      LOG(FATAL) << "nnpack convolution feedforward failed status=" << status;
-    }
-  }
-};  // class NNPACKConvolutionOp
-}  // namespace op
-}  // namespace mxnet
-#endif  // MXNET_OPERATOR_NNPACK_NNPACK_CONVOLUTION_INL_H_
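The operator above is mostly MXNet plumbing around two NNPACK entry points: `nnp_convolution_inference` for batch size 1 and `nnp_convolution_output` for batches — the batched API has no stride argument, which is why batch > 1 requires stride (1,1). A stripped-down sketch of the single-image path, following the same call signature as the deleted code (valid against the pinned NNPACK commit above; later NNPACK revisions changed this API). Sizes and names here are illustrative:

```cpp
#include <vector>
#include <nnpack.h>

// 3x3, stride-1, pad-1 convolution over one 32x32 RGB image with 8 output
// channels. Assumes nnp_initialize() already succeeded and `pool` came from
// pthreadpool_create(); buffers are zero-filled for brevity.
void ConvolveOnce(pthreadpool_t pool) {
  const size_t in_c = 3, out_c = 8, h = 32, w = 32, k = 3;
  std::vector<float> input(in_c * h * w), output(out_c * h * w);
  std::vector<float> kernel(out_c * in_c * k * k), bias(out_c);

  nnp_size input_size = {w, h};
  nnp_padding pad = {1, 1, 1, 1};  // top, right, bottom, left
  nnp_size kernel_size = {k, k};
  nnp_size stride = {1, 1};

  nnp_status status = nnp_convolution_inference(
      nnp_convolution_algorithm_auto,
      nnp_convolution_transform_strategy_tuple_based,
      in_c, out_c, input_size, pad, kernel_size, stride,
      input.data(), kernel.data(), bias.data(), output.data(),
      pool, nullptr /* profile */);
  if (status != nnp_status_success) {
    // MXNet would fall back to its default convolution here.
  }
}
```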
diff --git a/src/operator/nnpack/nnpack_fully_connected-inl.h b/src/operator/nnpack/nnpack_fully_connected-inl.h
deleted file mode 100644
index 422334949c48..000000000000
--- a/src/operator/nnpack/nnpack_fully_connected-inl.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * Copyright (c) 2016 by Contributors
- * \file nnpack_fully_connected-inl.h
- * \brief
- * \author Wei Wu
-*/
-#ifndef MXNET_OPERATOR_NNPACK_NNPACK_FULLY_CONNECTED_INL_H_
-#define MXNET_OPERATOR_NNPACK_NNPACK_FULLY_CONNECTED_INL_H_
-
-#include <dmlc/logging.h>
-#include <dmlc/parameter.h>
-#include <mxnet/operator.h>
-#include <algorithm>
-#include <map>
-#include <vector>
-#include <string>
-#include <utility>
-#include "../nn/fully_connected-inl.h"
-#include "nnpack.h"
-#include "nnpack_util.h"
-
-namespace mxnet {
-namespace op {
-
-template<typename DType>
-class NNPACKFullyConnectedOp : public FullyConnectedOp<DType> {
- private:
-  FullyConnectedParam param_;
-
- public:
-  explicit NNPACKFullyConnectedOp(FullyConnectedParam p)
-      : FullyConnectedOp<DType>(p) {
-    this->param_ = p;
-  }
-
- public:
-  virtual void Forward(const OpContext &ctx, const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    if (req[fullc::kOut] == kNullOp) return;
-    CHECK_EQ(req[fullc::kOut], kWriteTo);
-    size_t expected = param_.no_bias ? 2 : 3;
-    CHECK_EQ(in_data.size(), expected);
-    CHECK_EQ(out_data.size(), 1);
-    const mxnet::TShape& ishape = in_data[fullc::kData].shape_;
-    const mxnet::TShape& oshape = out_data[fullc::kOut].shape_;
-    Stream<cpu> *s = ctx.get_stream<cpu>();
-    Tensor<cpu, 2, DType> data = in_data[fullc::kData].get_with_shape<cpu, 2, DType>(
-        Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s);
-    Tensor<cpu, 2, DType> wmat = in_data[fullc::kWeight].get<cpu, 2, DType>(s);
-    Tensor<cpu, 2, DType> out = out_data[fullc::kOut].get_with_shape<cpu, 2, DType>(
-        Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim())), s);
-    const size_t batch_size = data.shape_[0];
-    const size_t input_c = data.shape_[1];
-    nnp_status status = nnp_status_success;
-    if (batch_size == 1) {
-      status = nnp_fully_connected_inference(
-          input_c,                       // size_t input_channels,
-          param_.num_hidden,             // size_t output_channels,
-          data.dptr_,                    // const float input[],
-          wmat.dptr_,                    // const float kernel[],
-          out.dptr_,                     // float output[],
-          nnpackinitialize.threadpool);  // pthreadpool_t threadpool,
-    } else {
-      status = nnp_fully_connected_output(
-          batch_size,                   // size_t batch size of input tensor
-          input_c,                      // size_t input_channels,
-          param_.num_hidden,            // size_t output_channels,
-          data.dptr_,                   // const float input[],
-          wmat.dptr_,                   // const float kernel[],
-          out.dptr_,                    // float output[],
-          nnpackinitialize.threadpool,  // pthreadpool_t threadpool,
-          nullptr);
-    }
-    if (nnp_status_success != status) {
-      LOG(FATAL) << "nnpack fully connected feedforward failed status=" << status;
-    }
-    if (!param_.no_bias) {
-      Tensor<cpu, 1, DType> bias = in_data[fullc::kBias].get<cpu, 1, DType>(s);
-      out += repmat(bias, data.size(0));
-    }
-  }
-};  // class NNPACKFullyConnectedOp
-}  // namespace op
-}  // namespace mxnet
-#endif  // MXNET_OPERATOR_NNPACK_NNPACK_FULLY_CONNECTED_INL_H_
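One detail worth flagging in the code above: the NNPACK fully-connected entry points take no bias argument, so the operator applies bias itself after the NNPACK call — the `out += repmat(bias, data.size(0))` step broadcasts the bias vector across the batch. In plain loops, that step is equivalent to:

```cpp
#include <cstddef>

// Broadcast-add a per-output-channel bias after the NNPACK GEMM,
// mirroring `out += repmat(bias, batch_size)` in the operator above.
void AddBias(float* out, const float* bias,
             std::size_t batch_size, std::size_t num_hidden) {
  for (std::size_t n = 0; n < batch_size; ++n)
    for (std::size_t j = 0; j < num_hidden; ++j)
      out[n * num_hidden + j] += bias[j];
}
```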
diff --git a/src/operator/nnpack/nnpack_pooling-inl.h b/src/operator/nnpack/nnpack_pooling-inl.h
deleted file mode 100644
index 3fad77024a9a..000000000000
--- a/src/operator/nnpack/nnpack_pooling-inl.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * Copyright (c) 2016 by Contributors
- * \file nnpack_pooling-inl.h
- * \brief
- * \author Wei Wu
-*/
-#ifndef MXNET_OPERATOR_NNPACK_NNPACK_POOLING_INL_H_
-#define MXNET_OPERATOR_NNPACK_NNPACK_POOLING_INL_H_
-
-#include <dmlc/logging.h>
-#include <dmlc/parameter.h>
-#include <mxnet/operator.h>
-#include <algorithm>
-#include <map>
-#include <vector>
-#include <string>
-#include <utility>
-#include "../nn/pooling-inl.h"
-#include "nnpack.h"
-#include "nnpack_util.h"
-
-namespace mxnet {
-namespace op {
-
-template<typename DType>
-class NNPACKPoolingOp : public PoolingOp<DType> {
- private:
-  PoolingParam param_;
-
- public:
-  explicit NNPACKPoolingOp(PoolingParam p)
-      : PoolingOp<DType>(p) {
-    this->param_ = p;
-  }
-
- public:
-  virtual void Forward(const OpContext &ctx, const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    Stream<cpu> *s = ctx.get_stream<cpu>();
-    Tensor<cpu, 4, DType> data = in_data[pool_enum::kData].get<cpu, 4, DType>(s);
-    const size_t batch_size = data.shape_[0];
-    const size_t input_c = data.shape_[1];
-    const size_t input_h = data.shape_[2];
-    const size_t input_w = data.shape_[3];
-    Tensor<cpu, 4, DType> out = out_data[pool_enum::kOut].get<cpu, 4, DType>(s);
-    nnp_size input_size = {input_w, input_h};
-    nnp_padding input_padding = {param_.pad[0], param_.pad[1], param_.pad[0],
-                                 param_.pad[1]};
-    nnp_size kernel_size = {param_.kernel[1], param_.kernel[0]};
-    nnp_size output_subsampling = {param_.stride[1], param_.stride[0]};
-    nnp_status status = nnp_max_pooling_output(
-        batch_size,                    // size_t batch size of input tensor
-        input_c,                       // size_t input_channels,
-        input_size,                    // struct nnp_size input_size,
-        input_padding,                 // struct nnp_padding input_padding,
-        kernel_size,                   // struct nnp_size kernel_size,
-        output_subsampling,            // struct nnp_size output_subsampling,
-        data.dptr_,                    // const float input[],
-        out.dptr_,                     // float output[],
-        nnpackinitialize.threadpool);  // pthreadpool_t threadpool,
-    if (nnp_status_success != status) {
-      LOG(FATAL) << "nnpack max pooling feedforward failed status=" << status;
-    }
-  }
-};  // class NNPACKPoolingOp
-}  // namespace op
-}  // namespace mxnet
-#endif  // MXNET_OPERATOR_NNPACK_NNPACK_POOLING_INL_H_
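As with convolution, the pooling operator reduces to a single NNPACK call. A condensed standalone view of it, for the one configuration the conditions table permits (2x2 max pooling, stride 2, pooling_convention=full); buffer sizes and names are illustrative:

```cpp
#include <vector>
#include <nnpack.h>

// 2x2 stride-2 max pooling over a batch of 4 single-channel 8x8 maps.
// Assumes nnp_initialize() succeeded and `pool` is a live pthreadpool_t.
void MaxPool2x2(pthreadpool_t pool) {
  const size_t batch = 4, channels = 1, h = 8, w = 8;
  std::vector<float> input(batch * channels * h * w);
  std::vector<float> output(batch * channels * (h / 2) * (w / 2));

  nnp_size input_size = {w, h};
  nnp_padding no_pad = {0, 0, 0, 0};
  nnp_size kernel = {2, 2};
  nnp_size stride = {2, 2};

  nnp_status status = nnp_max_pooling_output(
      batch, channels, input_size, no_pad, kernel, stride,
      input.data(), output.data(), pool);
  if (status != nnp_status_success) {
    // MXNet would fall back to its default pooling here.
  }
}
```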
diff --git a/src/operator/nnpack/nnpack_util.cc b/src/operator/nnpack/nnpack_util.cc
deleted file mode 100644
index 7d075e0554ba..000000000000
--- a/src/operator/nnpack/nnpack_util.cc
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * Copyright (c) 2016 by Contributors
- * \file nnpack_util.cc
- * \brief
- * \author Wei Wu
-*/
-
-#if MXNET_USE_NNPACK == 1
-#include "nnpack_util.h"
-
-namespace mxnet {
-namespace op {
-
-NNPACKInitialize nnpackinitialize;
-
-}  // namespace op
-}  // namespace mxnet
-#endif  // MXNET_USE_NNPACK
diff --git a/src/operator/nnpack/nnpack_util.h b/src/operator/nnpack/nnpack_util.h
deleted file mode 100644
index 2edfb79ad46e..000000000000
--- a/src/operator/nnpack/nnpack_util.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * Copyright (c) 2016 by Contributors
- * \file nnpack_util.h
- * \brief
- * \author Carwin
-*/
-#ifndef MXNET_OPERATOR_NNPACK_NNPACK_UTIL_H_
-#define MXNET_OPERATOR_NNPACK_NNPACK_UTIL_H_
-
-#include <dmlc/logging.h>
-#include <dmlc/parameter.h>
-#include <nnpack.h>
-
-namespace mxnet {
-namespace op {
-
-class NNPACKInitialize {
- public:
-  pthreadpool_t threadpool;
-
- public:
-  NNPACKInitialize() {
-    nnp_status status = nnp_initialize();
-    if (nnp_status_success != status) {
-      LOG(FATAL) << "nnp_initialize failed status=" << status;
-    }
-    int num_threads = dmlc::GetEnv("MXNET_CPU_NNPACK_NTHREADS", 4);
-    this->threadpool = pthreadpool_create(num_threads);
-  }
-  virtual ~NNPACKInitialize() {
-    nnp_status status = nnp_deinitialize();
-    if (nnp_status_success != status) {
-      LOG(FATAL) << "nnp_deinitialize failed status=" << status;
-    }
-    pthreadpool_destroy(threadpool);
-  }
-};
-
-// nnpackinitialize will be used in all other nnpack ops
-extern NNPACKInitialize nnpackinitialize;
-
-}  // namespace op
-}  // namespace mxnet
-
-#endif  // MXNET_OPERATOR_NNPACK_NNPACK_UTIL_H_
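A closing note on `nnpack_util.{h,cc}`: they rely on the classic C++ global-initializer idiom — one definition of `nnpackinitialize` in the `.cc`, an `extern` declaration in the header — so `nnp_initialize()` runs and the shared thread pool is built once, before `main()`, and every NNPACK operator reuses it. A minimal, NNPACK-free sketch of the idiom:

```cpp
#include <cstdio>

// The constructor runs during static initialization (before main), the
// destructor after main returns -- the same points at which NNPACKInitialize
// above called nnp_initialize()/pthreadpool_create() and tore them down.
struct GlobalResource {
  GlobalResource()  { std::puts("acquired before main()"); }
  ~GlobalResource() { std::puts("released after main()"); }
};

GlobalResource resource;  // counterpart of `NNPACKInitialize nnpackinitialize;`

int main() {
  std::puts("main() uses the already-initialized resource");
  return 0;
}
```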