diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc
index d73f86d7542..f7d26b62311 100644
--- a/src/nnet3/nnet-component-itf.cc
+++ b/src/nnet3/nnet-component-itf.cc
@@ -90,8 +90,6 @@ Component* Component::NewComponentOfType(const std::string &component_type) {
     ans = new ClipGradientComponent();
   } else if (component_type == "ElementwiseProductComponent") {
     ans = new ElementwiseProductComponent();
-  } else if (component_type == "Convolutional1dComponent") {
-    ans = new Convolutional1dComponent();
   } else if (component_type == "ConvolutionComponent") {
     ans = new ConvolutionComponent();
   } else if (component_type == "MaxpoolingComponent") {
diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc
index c66d0235a93..3d7c56824e1 100644
--- a/src/nnet3/nnet-simple-component.cc
+++ b/src/nnet3/nnet-simple-component.cc
@@ -3552,645 +3552,176 @@ void ConvolutionComponent::UnVectorize(const VectorBase<BaseFloat> &params) {
   bias_params_.CopyFromVec(params.Range(num_filter_params, bias_params_.Dim()));
 }
 
-Convolutional1dComponent::Convolutional1dComponent():
-    UpdatableComponent(),
-    patch_dim_(0), patch_step_(0), patch_stride_(0), is_gradient_(false) {}
-
-Convolutional1dComponent::Convolutional1dComponent(const Convolutional1dComponent &component):
-    UpdatableComponent(component),
-    filter_params_(component.filter_params_),
-    bias_params_(component.bias_params_),
-    is_gradient_(component.is_gradient_) {}
-
-Convolutional1dComponent::Convolutional1dComponent(const CuMatrixBase<BaseFloat> &filter_params,
-                                                   const CuVectorBase<BaseFloat> &bias_params,
-                                                   BaseFloat learning_rate):
-    filter_params_(filter_params),
-    bias_params_(bias_params) {
-  SetLearningRate(learning_rate);
-  KALDI_ASSERT(filter_params.NumRows() == bias_params.Dim() &&
-               bias_params.Dim() != 0);
-  is_gradient_ = false;
-}
-
 // acquire input dim
-int32 Convolutional1dComponent::InputDim() const {
-  int32 filter_dim = filter_params_.NumCols();
-  int32 num_splice = filter_dim / patch_dim_;
-  return patch_stride_ * num_splice;
+int32 MaxpoolingComponent::InputDim() const {
+  return input_x_dim_ * input_y_dim_ * input_z_dim_;
 }
 
 // acquire output dim
-int32 Convolutional1dComponent::OutputDim() const {
-  int32 num_filters = filter_params_.NumRows();
-  int32 num_patches = 1 + (patch_stride_ - patch_dim_) / patch_step_;
-  return num_patches * num_filters;
-}
-
-// initialize the component using hyperparameters
-void Convolutional1dComponent::Init(int32 input_dim, int32 output_dim,
-                                    int32 patch_dim, int32 patch_step, int32 patch_stride,
-                                    BaseFloat param_stddev, BaseFloat bias_stddev) {
-  patch_dim_ = patch_dim;
-  patch_step_ = patch_step;
-  patch_stride_ = patch_stride;
-  int32 num_splice = input_dim / patch_stride;
-  int32 filter_dim = num_splice * patch_dim;
-  int32 num_patches = 1 + (patch_stride - patch_dim) / patch_step;
-  int32 num_filters = output_dim / num_patches;
-  KALDI_ASSERT(input_dim % patch_stride == 0);
-  KALDI_ASSERT((patch_stride - patch_dim) % patch_step == 0);
-  KALDI_ASSERT(output_dim % num_patches == 0);
-
-  filter_params_.Resize(num_filters, filter_dim);
-  bias_params_.Resize(num_filters);
-  KALDI_ASSERT(param_stddev >= 0.0 && bias_stddev >= 0.0);
-  filter_params_.SetRandn();
-  filter_params_.Scale(param_stddev);
-  bias_params_.SetRandn();
-  bias_params_.Scale(bias_stddev);
-}
-
-// initialize the component using predefined matrix file
-void Convolutional1dComponent::Init(int32 patch_dim, int32 patch_step, int32 patch_stride,
-                                    std::string matrix_filename) {
-  patch_dim_ = patch_dim;
-  patch_step_ = patch_step;
-  patch_stride_ = patch_stride;
-  CuMatrix<BaseFloat> mat;
-  ReadKaldiObject(matrix_filename, &mat);
-  KALDI_ASSERT(mat.NumCols() >= 2);
-  int32 filter_dim = mat.NumCols() - 1, num_filters = mat.NumRows();
-  filter_params_.Resize(num_filters, filter_dim);
-  bias_params_.Resize(num_filters);
-  filter_params_.CopyFromMat(mat.Range(0, num_filters, 0, filter_dim));
-  bias_params_.CopyColFromMat(mat, filter_dim);
-}
-
-// resize the component, setting the parameters to zero, while
-// leaving any other configuration values the same
-void Convolutional1dComponent::Resize(int32 input_dim, int32 output_dim) {
-  KALDI_ASSERT(input_dim > 0 && output_dim > 0);
-  int32 num_splice = input_dim / patch_stride_;
-  int32 filter_dim = num_splice * patch_dim_;
-  int32 num_patches = 1 + (patch_stride_ - patch_dim_) / patch_step_;
-  int32 num_filters = output_dim / num_patches;
-  KALDI_ASSERT(input_dim % patch_stride_ == 0);
-  KALDI_ASSERT((patch_stride_ - patch_dim_) % patch_step_ == 0);
-  KALDI_ASSERT(output_dim % num_patches == 0);
-  filter_params_.Resize(num_filters, filter_dim);
-  bias_params_.Resize(num_filters);
-}
-
-// display information about component
-std::string Convolutional1dComponent::Info() const {
-  std::ostringstream stream;
-  int32 num_splice = InputDim() / patch_stride_;
-  int32 filter_dim = num_splice * patch_dim_;
-  int32 num_patches = 1 + (patch_stride_ - patch_dim_) / patch_step_;
-  int32 num_filters = OutputDim() / num_patches;
-
-  stream << UpdatableComponent::Info()
-         << ", num-splice=" << num_splice
-         << ", num-patches=" << num_patches
-         << ", num-filters=" << num_filters
-         << ", filter-dim=" << filter_dim;
-  PrintParameterStats(stream, "filter-params", filter_params_);
-  PrintParameterStats(stream, "bias-params", bias_params_, true);
-  return stream.str();
+int32 MaxpoolingComponent::OutputDim() const {
+  int32 num_pools_x = 1 + (input_x_dim_ - pool_x_size_) / pool_x_step_;
+  int32 num_pools_y = 1 + (input_y_dim_ - pool_y_size_) / pool_y_step_;
+  int32 num_pools_z = 1 + (input_z_dim_ - pool_z_size_) / pool_z_step_;
+  return num_pools_x * num_pools_y * num_pools_z;
+}
+
+// check the component parameters
+void MaxpoolingComponent::Check() const {
+  // sanity check of the max pooling parameters
+  KALDI_ASSERT(input_x_dim_ > 0);
+  KALDI_ASSERT(input_y_dim_ > 0);
+  KALDI_ASSERT(input_z_dim_ > 0);
+  KALDI_ASSERT(pool_x_size_ > 0);
+  KALDI_ASSERT(pool_y_size_ > 0);
+  KALDI_ASSERT(pool_z_size_ > 0);
+  KALDI_ASSERT(pool_x_step_ > 0);
+  KALDI_ASSERT(pool_y_step_ > 0);
+  KALDI_ASSERT(pool_z_step_ > 0);
+  KALDI_ASSERT(input_x_dim_ >= pool_x_size_);
+  KALDI_ASSERT(input_y_dim_ >= pool_y_size_);
+  KALDI_ASSERT(input_z_dim_ >= pool_z_size_);
+  KALDI_ASSERT(pool_x_size_ >= pool_x_step_);
+  KALDI_ASSERT(pool_y_size_ >= pool_y_step_);
+  KALDI_ASSERT(pool_z_size_ >= pool_z_step_);
+  KALDI_ASSERT((input_x_dim_ - pool_x_size_) % pool_x_step_ == 0);
+  KALDI_ASSERT((input_y_dim_ - pool_y_size_) % pool_y_step_ == 0);
+  KALDI_ASSERT((input_z_dim_ - pool_z_size_) % pool_z_step_ == 0);
 }
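+// Worked example added in editing (illustrative only, not part of the
+// original patch): with input-x-dim=10, pool-x-size=4 and pool-x-step=2,
+// Check() passes because (10 - 4) % 2 == 0, and OutputDim() counts
+// 1 + (10 - 4) / 2 = 4 pools along the x-axis; y and z work the same way.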
- << " Please use ConvolutionComponent."; +void MaxpoolingComponent::InitFromConfig(ConfigLine *cfl) { bool ok = true; - std::string matrix_filename; - int32 input_dim = -1, output_dim = -1; - int32 patch_dim = -1, patch_step = -1, patch_stride = -1; - InitLearningRatesFromConfig(cfl); - ok = ok && cfl->GetValue("patch-dim", &patch_dim); - ok = ok && cfl->GetValue("patch-step", &patch_step); - ok = ok && cfl->GetValue("patch-stride", &patch_stride); - if (cfl->GetValue("matrix", &matrix_filename)) { - // initialize from prefined parameter matrix - Init(patch_dim, patch_step, patch_stride, matrix_filename); - if (cfl->GetValue("input-dim", &input_dim)) - KALDI_ASSERT(input_dim == InputDim() && - "input-dim mismatch vs. matrix."); - if (cfl->GetValue("output-dim", &output_dim)) - KALDI_ASSERT(output_dim == OutputDim() && - "output-dim mismatch vs. matrix."); - } else { - ok = ok && cfl->GetValue("input-dim", &input_dim); - ok = ok && cfl->GetValue("output-dim", &output_dim); - // initialize from configuration - BaseFloat param_stddev = 1.0 / std::sqrt(input_dim), bias_stddev = 1.0; - cfl->GetValue("param-stddev", ¶m_stddev); - cfl->GetValue("bias-stddev", &bias_stddev); - Init(input_dim, output_dim, patch_dim, patch_step, patch_stride, - param_stddev, bias_stddev); - } + + ok = ok && cfl->GetValue("input-x-dim", &input_x_dim_); + ok = ok && cfl->GetValue("input-y-dim", &input_y_dim_); + ok = ok && cfl->GetValue("input-z-dim", &input_z_dim_); + ok = ok && cfl->GetValue("pool-x-size", &pool_x_size_); + ok = ok && cfl->GetValue("pool-y-size", &pool_y_size_); + ok = ok && cfl->GetValue("pool-z-size", &pool_z_size_); + ok = ok && cfl->GetValue("pool-x-step", &pool_x_step_); + ok = ok && cfl->GetValue("pool-y-step", &pool_y_step_); + ok = ok && cfl->GetValue("pool-z-step", &pool_z_step_); + if (cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " - << cfl->UnusedValues(); + << cfl->UnusedValues(); if (!ok) KALDI_ERR << "Bad initializer " << cfl->WholeLine(); + + Check(); } -// propagation function +// Method to convert from a matrix representing a minibatch of vectorized +// 3D tensors to patches for 3d max pooling, each patch corresponds to +// the nodes having the same local coordinatenodes from each pool +void MaxpoolingComponent::InputToInputPatches( + const CuMatrixBase& in, + CuMatrix *patches) const{ + int32 num_pools_x = 1 + (input_x_dim_ - pool_x_size_) / pool_x_step_; + int32 num_pools_y = 1 + (input_y_dim_ - pool_y_size_) / pool_y_step_; + int32 num_pools_z = 1 + (input_z_dim_ - pool_z_size_) / pool_z_step_; -/* Convolutional propagation is explained: - - Recall the AffineComponent, input X is defined #frames x $input-dim, - linear matrix A is defined $output-dim x $input-dim, and bias - vector B is defined by length $output-dim. The propagation is - Y = X * A' + B (1) - where "*" is row-by-row processing of X, executing vector-matrix - multiplication - Y(t) = X(t) * A' + B (2) - which converts each row of input of dim $input-dim to a row of output of - dim $output-dim by A' (' defines transpose). - - In Convolution1dComponent, A is redefined $num-filters x $filter-dim, - and bias vector B is redefined by length $num-filters. The propatation is - Y = X o A' + B (3) - where "o" is also row-by-row processing of X, but executing vector-matrix - convolution, which consists of a group of vector-vector convolutions. 
-void Convolutional1dComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
-                                         const CuMatrixBase<BaseFloat> &in,
-                                         CuMatrixBase<BaseFloat> *out) const {
-  // dims
-  int32 num_splice = InputDim() / patch_stride_;
-  int32 num_patches = 1 + (patch_stride_ - patch_dim_) / patch_step_;
-  int32 num_filters = filter_params_.NumRows();
-  int32 num_frames = in.NumRows();
-  int32 filter_dim = filter_params_.NumCols();
-
-  /** Buffer of reshaped inputs:
-   *  1row = vectorized rectangular feature patches
-   *  1col = dim over speech frames,
-   */
-  CuMatrix<BaseFloat> patches(num_frames, filter_dim * num_patches, kUndefined);
-  // column_map is indexed by the column-index of "patches",
-  // and the value is the corresponding column-index of "in".
-  std::vector<int32> column_map(filter_dim * num_patches);
-
-  // build-up a column selection map
-  for (int32 p = 0, index = 0; p < num_patches; p++) {
-    for (int32 s = 0; s < num_splice; s++) {
-      for (int32 d = 0; d < patch_dim_; d++, index++) {
-        column_map[index] = p * patch_step_ + s * patch_stride_ + d;
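+  // Editorial note (illustrative, not part of the original patch): column
+  // 'index' of *patches is filled from column 'column_map[index]' of 'in'.
+  // The columns are ordered with the local window coordinate (x, y, z)
+  // outermost and the pool index innermost, so that
+  // patches.ColRange(q * num_pools, num_pools) later selects, for local
+  // coordinate q, the matching member of every pool.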
+  std::vector<int32> column_map(patches->NumCols());
+  int32 column_map_size = column_map.size();
+  for (int32 x = 0, index = 0; x < pool_x_size_; x++) {
+    for (int32 y = 0; y < pool_y_size_; y++) {
+      for (int32 z = 0; z < pool_z_size_; z++) {
+        // given the local node coordinate, group them from each pool
+        // to form a patch
+        for (int32 x_pool = 0; x_pool < num_pools_x; x_pool++) {
+          for (int32 y_pool = 0; y_pool < num_pools_y; y_pool++) {
+            for (int32 z_pool = 0; z_pool < num_pools_z; z_pool++, index++) {
+              KALDI_ASSERT(index < column_map_size);
+              column_map[index] = (x_pool * pool_x_step_ + x) * input_y_dim_ * input_z_dim_ +
+                                  (y_pool * pool_y_step_ + y) * input_z_dim_ +
+                                  (z_pool * pool_z_step_ + z);
+            }
+          }
+        }
       }
     }
   }
   CuArray<int32> cu_cols(column_map);
-  patches.CopyCols(in, cu_cols);
-
-  //
-  // compute filter activations
-  //
-
-  std::vector<CuSubMatrix<BaseFloat>* > tgt_batch, patch_batch, filter_params_batch;
-
-  CuSubMatrix<BaseFloat>* filter_params_elem = new CuSubMatrix<BaseFloat>(
-      filter_params_, 0, filter_params_.NumRows(), 0,
-      filter_params_.NumCols());
-
-  // form batch in vector container
-  for (int32 p = 0; p < num_patches; p++) {
-    // form batch in vector container. for filter_params_batch, all elements
-    // point to the same copy filter_params_elem
-    tgt_batch.push_back(new CuSubMatrix<BaseFloat>(out->ColRange(p * num_filters,
-                                                                 num_filters)));
-    patch_batch.push_back(new CuSubMatrix<BaseFloat>(patches.ColRange(p * filter_dim,
-                                                                      filter_dim)));
-    filter_params_batch.push_back(filter_params_elem);
-
-    tgt_batch[p]->AddVecToRows(1.0, bias_params_, 1.0); // add bias
-  }
-
-  // apply all filters
-  AddMatMatBatched(1.0, tgt_batch, patch_batch, kNoTrans,
-                   filter_params_batch, kTrans, 1.0);
-
-  // release memory
-  delete filter_params_elem;
-  for (int32 p = 0; p < num_patches; p++) {
-    delete tgt_batch[p];
-    delete patch_batch[p];
-  }
-}
-
-// scale the parameters
-void Convolutional1dComponent::Scale(BaseFloat scale) {
-  filter_params_.Scale(scale);
-  bias_params_.Scale(scale);
-}
-
-// add another convolution component
-void Convolutional1dComponent::Add(BaseFloat alpha, const Component &other_in) {
-  const Convolutional1dComponent *other =
-      dynamic_cast<const Convolutional1dComponent*>(&other_in);
-  KALDI_ASSERT(other != NULL);
-  filter_params_.AddMat(alpha, other->filter_params_);
-  bias_params_.AddVec(alpha, other->bias_params_);
+  patches->CopyCols(in, cu_cols);
 }
 
 /*
-  This function does an operation similar to reversing a map,
-  except it handles maps that are not one-to-one by outputting
-  the reversed map as a vector of lists.
-  @param[in] forward_indexes is a vector of int32, each of whose
-             elements is between 0 and input_dim - 1.
-  @param[in] input_dim. See definitions of forward_indexes and
-             backward_indexes.
-  @param[out] backward_indexes is a vector of dimension input_dim
-              of lists, The list at (backward_indexes[i]) is a list
-              of all indexes j such that forward_indexes[j] = i.
+  This is the 3d max pooling propagate function.
+  It is assumed that each row of the input matrix
+  is a vectorized 3D-tensor of type zyx.
+  Similar to the propagate function of ConvolutionComponent,
+  the input matrix is first arranged into patches so that
+  pools (with / without overlapping) could be
+  processed in a parallelizable manner.
+  The output matrix is also a vectorized 3D-tensor of type zyx.
 */
-void Convolutional1dComponent::ReverseIndexes(const std::vector<int32> &forward_indexes,
-                                              int32 input_dim,
-                                              std::vector<std::vector<int32> > *backward_indexes) {
-  int32 i, size = forward_indexes.size();
-  int32 reserve_size = 2 + size / input_dim;
-  backward_indexes->resize(input_dim);
-  std::vector<std::vector<int32> >::iterator iter = backward_indexes->begin(),
-      end = backward_indexes->end();
-  for (; iter != end; ++iter)
-    iter->reserve(reserve_size);
-  for (int32 j = 0; j < forward_indexes.size(); j++) {
-    i = forward_indexes[j];
-    KALDI_ASSERT(i < input_dim);
-    (*backward_indexes)[i].push_back(j);
-  }
-}
-/*
-  This function transforms a vector of lists into a list of vectors,
-  padded with -1.
-  @param[in] The input vector of lists. Let in.size() be D, and let
-             the longest list length (i.e. the max of in[i].size()) be L.
-  @param[out] The output list of vectors. The length of the list will
-              be L, each vector-dimension will be D (i.e. out[i].size() == D),
-              and if in[i] == j, then for some k we will have that
-              out[k][j] = i. The output vectors are padded with -1
-              where necessary if not all the input lists have the same side.
-*/
-void Convolutional1dComponent::RearrangeIndexes(const std::vector<std::vector<int32> > &in,
-                                                std::vector<std::vector<int32> > *out) {
-  int32 D = in.size();
-  int32 L = 0;
-  for (int32 i = 0; i < D; i++)
-    if (in[i].size() > L)
-      L = in[i].size();
-  out->resize(L);
-  for (int32 i = 0; i < L; i++)
-    (*out)[i].resize(D, -1);
-  for (int32 i = 0; i < D; i++) {
-    for (int32 j = 0; j < in[i].size(); j++) {
-      (*out)[j][i] = in[i][j];
-    }
-  }
-}
-
-
-// back propagation function
-void Convolutional1dComponent::Backprop(const std::string &debug_info,
-                                        const ComponentPrecomputedIndexes *indexes,
-                                        const CuMatrixBase<BaseFloat> &in_value,
-                                        const CuMatrixBase<BaseFloat> &, // out_value,
-                                        const CuMatrixBase<BaseFloat> &out_deriv,
-                                        Component *to_update_in,
-                                        CuMatrixBase<BaseFloat> *in_deriv) const {
-  Convolutional1dComponent *to_update =
-      dynamic_cast<Convolutional1dComponent*>(to_update_in);
-  int32 num_splice = InputDim() / patch_stride_;
-  int32 num_patches = 1 + (patch_stride_ - patch_dim_) / patch_step_;
-  int32 num_filters = filter_params_.NumRows();
-  int32 num_frames = out_deriv.NumRows();
-  int32 filter_dim = filter_params_.NumCols();
-
-  /** Buffer for backpropagation:
-   *  derivatives in the domain of 'patches_',
-   *  1row = vectorized rectangular feature patches,
-   *  1col = dim over speech frames,
-   */
-  CuMatrix<BaseFloat> patches_deriv(num_frames, filter_dim * num_patches, kSetZero);
-
-  //
-  // backpropagate to vector of matrices
-  // (corresponding to position of a filter)
-  //
-  std::vector<CuSubMatrix<BaseFloat>* > patch_deriv_batch, out_deriv_batch,
-      filter_params_batch;
-
-  CuSubMatrix<BaseFloat>* filter_params_elem = new CuSubMatrix<BaseFloat>(
-      filter_params_, 0, filter_params_.NumRows(), 0,
-      filter_params_.NumCols());
-
-  // form batch in vector container
-  for (int32 p = 0; p < num_patches; p++) {
-    // form batch in vector container. for filter_params_batch, all elements
-    // point to the same copy filter_params_elem
-    patch_deriv_batch.push_back(new CuSubMatrix<BaseFloat>(patches_deriv.ColRange(
-        p * filter_dim, filter_dim)));
-    out_deriv_batch.push_back(new CuSubMatrix<BaseFloat>(out_deriv.ColRange(
-        p * num_filters, num_filters)));
-    filter_params_batch.push_back(filter_params_elem);
-  }
-  AddMatMatBatched(1.0, patch_deriv_batch, out_deriv_batch, kNoTrans,
-                   filter_params_batch, kNoTrans, 0.0);
-
-  // release memory
-  delete filter_params_elem;
-  for (int32 p = 0; p < num_patches; p++) {
-    delete patch_deriv_batch[p];
-    delete out_deriv_batch[p];
-  }
-
-  // sum the derivatives into in_deriv
-  std::vector<int32> column_map(filter_dim * num_patches);
-  for (int32 p = 0, index = 0; p < num_patches; p++) {
-    for (int32 s = 0; s < num_splice; s++) {
-      for (int32 d = 0; d < patch_dim_; d++, index++) {
-        column_map[index] = p * patch_step_ + s * patch_stride_ + d;
-      }
-    }
-  }
-
-  if (in_deriv) {
-    std::vector<std::vector<int32> > reversed_column_map;
-    ReverseIndexes(column_map, InputDim(), &reversed_column_map);
-    std::vector<std::vector<int32> > rearranged_column_map;
-    RearrangeIndexes(reversed_column_map, &rearranged_column_map);
-    for (int32 p = 0; p < rearranged_column_map.size(); p++) {
-      CuArray<int32> cu_cols(rearranged_column_map[p]);
-      in_deriv->AddCols(patches_deriv, cu_cols);
-    }
-  }
-
-  if (to_update != NULL) {
-    // Next update the model (must do this 2nd so the derivatives we propagate
-    // are accurate, in case this == to_update_in.)
-    to_update->Update(debug_info, in_value, out_deriv);
-  }
-}
-
-void Convolutional1dComponent::SetZero(bool treat_as_gradient) {
-  if (treat_as_gradient) {
-    learning_rate_ = 1.0;  // don't call SetLearningRate, that would apply the
-                           // learning rate factor.
-    is_gradient_ = true;
-  }
-  filter_params_.SetZero();
-  bias_params_.SetZero();
-}
-
-void Convolutional1dComponent::Read(std::istream &is, bool binary) {
-  ReadUpdatableCommon(is, binary);  // Read opening tag and learning rate.
-  ExpectToken(is, binary, "<PatchDim>");
-  ReadBasicType(is, binary, &patch_dim_);
-  ExpectToken(is, binary, "<PatchStep>");
-  ReadBasicType(is, binary, &patch_step_);
-  ExpectToken(is, binary, "<PatchStride>");
-  ReadBasicType(is, binary, &patch_stride_);
-  ExpectToken(is, binary, "<FilterParams>");
-  filter_params_.Read(is, binary);
-  ExpectToken(is, binary, "<BiasParams>");
-  bias_params_.Read(is, binary);
-  std::string tok;
-  ReadToken(is, binary, &tok);
-  if (tok == "<IsGradient>") {
-    ReadBasicType(is, binary, &is_gradient_);
-    ExpectToken(is, binary, "</Convolutional1dComponent>");
-  } else {
-    is_gradient_ = false;
-    KALDI_ASSERT(tok == "</Convolutional1dComponent>");
-  }
-}
-
-void Convolutional1dComponent::Write(std::ostream &os, bool binary) const {
-  WriteUpdatableCommon(os, binary);  // Write opening tag and learning rate
-  WriteToken(os, binary, "<PatchDim>");
-  WriteBasicType(os, binary, patch_dim_);
-  WriteToken(os, binary, "<PatchStep>");
-  WriteBasicType(os, binary, patch_step_);
-  WriteToken(os, binary, "<PatchStride>");
-  WriteBasicType(os, binary, patch_stride_);
-  WriteToken(os, binary, "<FilterParams>");
-  filter_params_.Write(os, binary);
-  WriteToken(os, binary, "<BiasParams>");
-  bias_params_.Write(os, binary);
-  WriteToken(os, binary, "<IsGradient>");
-  WriteBasicType(os, binary, is_gradient_);
-  WriteToken(os, binary, "</Convolutional1dComponent>");
-}
-
-BaseFloat Convolutional1dComponent::DotProduct(const UpdatableComponent &other_in) const {
-  const Convolutional1dComponent *other =
-      dynamic_cast<const Convolutional1dComponent*>(&other_in);
-  return TraceMatMat(filter_params_, other->filter_params_, kTrans)
-         + VecVec(bias_params_, other->bias_params_);
-}
-
-Component* Convolutional1dComponent::Copy() const {
-  Convolutional1dComponent *ans = new Convolutional1dComponent();
-  ans->learning_rate_ = learning_rate_;
-  ans->patch_dim_ = patch_dim_;
-  ans->patch_step_ = patch_step_;
-  ans->patch_stride_ = patch_stride_;
-  ans->filter_params_ = filter_params_;
-  ans->bias_params_ = bias_params_;
-  ans->is_gradient_ = is_gradient_;
-  return ans;
-}
-
-void Convolutional1dComponent::PerturbParams(BaseFloat stddev) {
-  CuMatrix<BaseFloat> temp_filter_params(filter_params_);
-  temp_filter_params.SetRandn();
-  filter_params_.AddMat(stddev, temp_filter_params);
+void MaxpoolingComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
+                                    const CuMatrixBase<BaseFloat> &in,
+                                    CuMatrixBase<BaseFloat> *out) const {
+  int32 num_frames = in.NumRows();
+  int32 num_pools = OutputDim();
+  int32 pool_size = pool_x_size_ * pool_y_size_ * pool_z_size_;
+  CuMatrix<BaseFloat> patches(num_frames, num_pools * pool_size, kUndefined);
+  InputToInputPatches(in, &patches);
 
-  CuVector<BaseFloat> temp_bias_params(bias_params_);
-  temp_bias_params.SetRandn();
-  bias_params_.AddVec(stddev, temp_bias_params);
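+  // Editorial comment (not in the original patch): each ColRange below holds,
+  // for one local window coordinate q, the matching member of every pool, so
+  // after the loop each output element is the maximum over all pool_size
+  // members of its pool.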
+  out->Set(-1e20);  // initialize to a large negative value
+  for (int32 q = 0; q < pool_size; q++)
+    out->Max(patches.ColRange(q * num_pools, num_pools));
 }
 
-void Convolutional1dComponent::SetParams(const VectorBase<BaseFloat> &bias,
-                                         const MatrixBase<BaseFloat> &filter) {
-  bias_params_ = bias;
-  filter_params_ = filter;
-  KALDI_ASSERT(bias_params_.Dim() == filter_params_.NumRows());
-}
+// Method to compute the input derivative matrix from the input derivatives
+// for patches, where each patch corresponds to
+// the nodes having the same local coordinate in each pool
+void MaxpoolingComponent::InderivPatchesToInderiv(
+    const CuMatrix<BaseFloat>& in_deriv_patches,
+    CuMatrixBase<BaseFloat> *in_deriv) const {
 
-int32 Convolutional1dComponent::NumParameters() const {
-  return (filter_params_.NumCols() + 1) * filter_params_.NumRows();
-}
+  int32 num_pools_x = 1 + (input_x_dim_ - pool_x_size_) / pool_x_step_;
+  int32 num_pools_y = 1 + (input_y_dim_ - pool_y_size_) / pool_y_step_;
+  int32 num_pools_z = 1 + (input_z_dim_ - pool_z_size_) / pool_z_step_;
 
-// update parameters
-void Convolutional1dComponent::Update(const std::string &debug_info,
-                                      const CuMatrixBase<BaseFloat> &in_value,
-                                      const CuMatrixBase<BaseFloat> &out_deriv) {
-  // useful dims
-  int32 num_patches = 1 + (patch_stride_ - patch_dim_) / patch_step_;
-  int32 num_filters = filter_params_.NumRows();
-  int32 filter_dim = filter_params_.NumCols();
-  int32 num_frames = in_value.NumRows();
-  int32 num_splice = InputDim() / patch_stride_;
-  CuMatrix<BaseFloat> filters_grad;
-  CuVector<BaseFloat> bias_grad;
-
-  /** Buffer of reshaped inputs:
-   *  1row = vectorized rectangular feature patches
-   *  1col = dim over speech frames,
-   */
-  CuMatrix<BaseFloat> patches(num_frames, filter_dim * num_patches, kUndefined);
-  std::vector<int32> column_map(filter_dim * num_patches);
-  for (int32 p = 0, index = 0; p < num_patches; p++) {
-    for (int32 s = 0; s < num_splice; s++) {
-      for (int32 d = 0; d < patch_dim_; d++, index++) {
-        column_map[index] = p * patch_step_ + s * patch_stride_ + d;
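+  // Editorial note (not in the original patch): when pools overlap, one input
+  // column feeds several patch columns; reverse_column_map therefore maps
+  // each input column to the list of patch columns it produced, and the
+  // AddCols() calls below sum the derivatives over all of them.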
+  std::vector<std::vector<int32> > reverse_column_map(in_deriv->NumCols());
+  int32 rev_col_map_size = reverse_column_map.size();
+  for (int32 x = 0, index = 0; x < pool_x_size_; x++) {
+    for (int32 y = 0; y < pool_y_size_; y++) {
+      for (int32 z = 0; z < pool_z_size_; z++) {
+
+        for (int32 x_pool = 0; x_pool < num_pools_x; x_pool++) {
+          for (int32 y_pool = 0; y_pool < num_pools_y; y_pool++) {
+            for (int32 z_pool = 0; z_pool < num_pools_z; z_pool++, index++) {
+              int32 vector_index = (x_pool * pool_x_step_ + x) * input_y_dim_ * input_z_dim_ +
+                                   (y_pool * pool_y_step_ + y) * input_z_dim_ +
+                                   (z_pool * pool_z_step_ + z);
+
+              KALDI_ASSERT(vector_index < rev_col_map_size);
+              reverse_column_map[vector_index].push_back(index);
+            }
+          }
+        }
+      }
+    }
+  }
-  CuArray<int32> cu_cols(column_map);
-  patches.CopyCols(in_value, cu_cols);
-  //
-  // calculate the gradient
-  //
-  filters_grad.Resize(num_filters, filter_dim, kSetZero);  // reset
-  bias_grad.Resize(num_filters, kSetZero);  // reset
-
-  //
-  // use all the patches
-  //
-
-  // create a single large matrix holding the smaller matrices
-  // from the vector container filters_grad_batch along the rows
-  CuMatrix<BaseFloat> filters_grad_blocks_batch(
-      num_patches * filters_grad.NumRows(), filters_grad.NumCols());
-
-  std::vector<CuSubMatrix<BaseFloat>* > filters_grad_batch, diff_patch_batch,
-      patch_batch;
-  for (int32 p = 0; p < num_patches; p++) {
-    // form batch in vector container
-    filters_grad_batch.push_back(new CuSubMatrix<BaseFloat>(
-        filters_grad_blocks_batch.RowRange(
-            p * filters_grad.NumRows(),
-            filters_grad.NumRows())));
-    diff_patch_batch.push_back(new CuSubMatrix<BaseFloat>(out_deriv.ColRange(
-        p * num_filters, num_filters)));
-    patch_batch.push_back(new CuSubMatrix<BaseFloat>(patches.ColRange(
-        p * filter_dim, filter_dim)));
-  }
-
-  AddMatMatBatched(1.0, filters_grad_batch, diff_patch_batch, kTrans, patch_batch,
-                   kNoTrans, 1.0);
-
-  // add the row blocks together to filters_grad
-  filters_grad.AddMatBlocks(1.0, filters_grad_blocks_batch);
-
-  // create a matrix holding the col blocks sum of out_deriv
-  CuMatrix<BaseFloat> out_deriv_col_blocks_sum(out_deriv.NumRows(), num_filters);
-
-  // add the col blocks together to out_deriv_col_blocks_sum
-  out_deriv_col_blocks_sum.AddMatBlocks(1.0, out_deriv);
-
-  bias_grad.AddRowSumMat(1.0, out_deriv_col_blocks_sum, 1.0);
-
-  // release memory
-  for (int32 p = 0; p < num_patches; p++) {
-    delete filters_grad_batch[p];
-    delete diff_patch_batch[p];
-    delete patch_batch[p];
-  }
-
-  //
-  // update
-  //
-  filter_params_.AddMat(learning_rate_, filters_grad);
-  bias_params_.AddVec(learning_rate_, bias_grad);
-}
-
-void Convolutional1dComponent::Vectorize(VectorBase<BaseFloat> *params) const {
-  KALDI_ASSERT(params->Dim() == this->NumParameters());
-  int32 num_filter_params = filter_params_.NumCols() * filter_params_.NumRows();
-  params->Range(0, num_filter_params).CopyRowsFromMat(filter_params_);
-  params->Range(num_filter_params, bias_params_.Dim()).CopyFromVec(bias_params_);
-}
-void Convolutional1dComponent::UnVectorize(const VectorBase<BaseFloat> &params) {
-  KALDI_ASSERT(params.Dim() == this->NumParameters());
-  int32 num_filter_params = filter_params_.NumCols() * filter_params_.NumRows();
-  filter_params_.CopyRowsFromVec(params.Range(0, num_filter_params));
-  bias_params_.CopyFromVec(params.Range(num_filter_params, bias_params_.Dim()));
-}
-
-void MaxpoolingComponent::Init(int32 input_dim, int32 output_dim,
-                               int32 pool_size, int32 pool_stride) {
-  input_dim_ = input_dim;
-  output_dim_ = output_dim;
-  pool_size_ = pool_size;
-  pool_stride_ = pool_stride;
-
-  // sanity check
-  // number of patches
-  KALDI_ASSERT(input_dim_ % pool_stride_ == 0);
-  int32 num_patches = input_dim_ / pool_stride_;
-  // number of pools
-  KALDI_ASSERT(num_patches % pool_size_ == 0);
-  int32 num_pools = num_patches / pool_size_;
-  // check output dim
-  KALDI_ASSERT(output_dim_ == num_pools * pool_stride_);
-}
-
-void MaxpoolingComponent::InitFromConfig(ConfigLine *cfl) {
-  int32 input_dim = 0;
-  int32 output_dim = 0;
-  int32 pool_size = -1, pool_stride = -1;
-  bool ok = true;
-
-  ok = ok && cfl->GetValue("input-dim", &input_dim);
-  ok = ok && cfl->GetValue("output-dim", &output_dim);
-  ok = ok && cfl->GetValue("pool-size", &pool_size);
-  ok = ok && cfl->GetValue("pool-stride", &pool_stride);
-
-  KALDI_LOG << output_dim << " " << input_dim << " " << ok;
-  KALDI_LOG << "Pool: " << pool_size << " "
-            << pool_stride << " " << ok;
-  if (cfl->HasUnusedValues())
-    KALDI_ERR << "Could not process these elements in initializer: "
-              << cfl->UnusedValues();
-  if (!ok || output_dim <= 0)
-    KALDI_ERR << "Invalid initializer for layer of type "
-              << Type() << ": \"" << cfl->WholeLine() << "\"";
-  Init(input_dim, output_dim, pool_size, pool_stride);
-}
-
-void MaxpoolingComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
-                                    const CuMatrixBase<BaseFloat> &in,
-                                    CuMatrixBase<BaseFloat> *out) const {
-  int32 num_patches = input_dim_ / pool_stride_;
-  int32 num_pools = num_patches / pool_size_;
-
-  // do the max-pooling
-  for (int32 q = 0; q < num_pools; q++) {
-    // get output buffer of the pool
-    CuSubMatrix<BaseFloat> pool(out->ColRange(q * pool_stride_, pool_stride_));
-    pool.Set(-1e20); // reset a large negative value
-    for (int32 r = 0; r < pool_size_; r++) {
-      // col-by-col block comparison pool
-      int32 p = r + q * pool_size_;
-      pool.Max(in.ColRange(p * pool_stride_, pool_stride_));
-    }
+  std::vector<std::vector<int32> > rearranged_column_map;
+  RearrangeIndexes(reverse_column_map, &rearranged_column_map);
+  for (int32 p = 0; p < rearranged_column_map.size(); p++) {
+    CuArray<int32> cu_cols(rearranged_column_map[p]);
+    in_deriv->AddCols(in_deriv_patches, cu_cols);
   }
 }
 
+/*
+  3d max pooling backpropagate function
+  This function backpropagates the error from
+  out_deriv to in_deriv.
+  In order to select the node in each pool to
+  backpropagate the error, it has to compare
+  the output pool value stored in the out_value
+  matrix with each of its input pool member nodes
+  stored in the in_value matrix.
+*/
 void MaxpoolingComponent::Backprop(const std::string &debug_info,
                                    const ComponentPrecomputedIndexes *indexes,
                                    const CuMatrixBase<BaseFloat> &in_value,
@@ -4198,66 +3729,87 @@ void MaxpoolingComponent::Backprop(const std::string &debug_info,
                                    const CuMatrixBase<BaseFloat> &out_deriv,
                                    Component *, // to_update,
                                    CuMatrixBase<BaseFloat> *in_deriv) const {
-  int32 num_patches = input_dim_ / pool_stride_;
-  int32 num_pools = num_patches / pool_size_;
-  std::vector<int32> patch_summands(num_patches, 0);
-
-  for(int32 q = 0; q < num_pools; q++) {
-    for(int32 r = 0; r < pool_size_; r++) {
-      int32 p = r + q * pool_size_;
-      CuSubMatrix<BaseFloat> in_p(in_value.ColRange(p * pool_stride_, pool_stride_));
-      CuSubMatrix<BaseFloat> out_q(out_value.ColRange(q * pool_stride_, pool_stride_));
-      CuSubMatrix<BaseFloat> tgt(in_deriv->ColRange(p * pool_stride_, pool_stride_));
-      CuMatrix<BaseFloat> src(out_deriv.ColRange(q * pool_stride_, pool_stride_));
-      // zero-out mask
-      CuMatrix<BaseFloat> mask;
-      in_p.EqualElementMask(out_q, &mask);
-      src.MulElements(mask);
-      tgt.AddMat(1.0, src);
-      // summed deriv info
-      patch_summands[p] += 1;
-    }
-  }
+  if (!in_deriv)
+    return;
 
-  // scale in_deriv of overlaped pools
-  for(int32 p = 0; p < num_patches; p++) {
-    CuSubMatrix<BaseFloat> tgt(in_deriv->ColRange(p * pool_stride_, pool_stride_));
-    KALDI_ASSERT(patch_summands[p] > 0);
-    tgt.Scale(1.0 / patch_summands[p]);
+  int32 num_frames = in_value.NumRows();
+  int32 num_pools = OutputDim();
+  int32 pool_size = pool_x_size_ * pool_y_size_ * pool_z_size_;
+  CuMatrix<BaseFloat> patches(num_frames, num_pools * pool_size, kUndefined);
+  InputToInputPatches(in_value, &patches);
+
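+  // Editorial comment (not in the original patch): EqualElementMask() marks,
+  // for each pool member, the positions whose input value equals the pool's
+  // output (i.e. the argmax; ties are all marked), and the masked out_deriv
+  // is routed back to exactly those positions.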
""); + WriteBasicType(os, binary, input_y_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, input_z_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, pool_x_size_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, pool_y_size_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, pool_z_size_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, pool_x_step_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, pool_y_step_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, pool_z_step_); WriteToken(os, binary, ""); } +// display information about component std::string MaxpoolingComponent::Info() const { std::ostringstream stream; - stream << Type() << ", input-dim = " << input_dim_ - << ", output-dim = " << output_dim_ - << ", pool-size = " << pool_size_ - << ", pool-stride = " << pool_stride_; + stream << Type() + << ", input-x-dim = " << input_x_dim_ + << ", input-y-dim = " << input_y_dim_ + << ", input-z-dim = " << input_z_dim_ + << ", pool-x-size = " << pool_x_size_ + << ", pool-y-size = " << pool_y_size_ + << ", pool-z-size = " << pool_z_size_ + << ", pool-x-step = " << pool_x_step_ + << ", pool-y-step = " << pool_y_step_ + << ", pool-z-step = " << pool_z_step_; return stream.str(); } diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index a78f72c0afb..bb15afbcc39 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -6,6 +6,7 @@ // 2014-2015 Vijayaditya Peddinti // 2014-2015 Guoguo Chen // 2015 Daniel Galvez +// 2015 Tom Ko // See ../../COPYING for clarification regarding multiple authors // @@ -1445,161 +1446,65 @@ class ConvolutionComponent: public UpdatableComponent { }; /** - * Convolutional1dComponent implements convolution over frequency axis. - * We assume the input featrues are spliced, i.e. each frame is in - * fact a set of stacked frames, where we can form patches which span - * over several frequency bands and whole time axis. A patch is the - * instance of a filter on a group of frequency bands and whole time - * axis. Shifts of the filter generate patches. + * MaxPoolingComponent : + * Maxpooling component was firstly used in ConvNet for selecting an + * representative activation in an area. It inspired Maxout nonlinearity. + * Each output element of this component is the maximum of a block of + * input elements where the block has a 3D dimension (pool_x_size_, + * pool_y_size_, pool_z_size_). + * Blocks could overlap if the shift value on any axis is smaller + * than its corresponding pool size (e.g. pool_x_step_ < pool_x_size_). + * If the shift values are euqal to their pool size, there is no + * overlap; while if they all equal 1, the blocks overlap to + * the greatest possible extent. * - * The convolution is done over whole axis with same filter - * coefficients, i.e. we don't use separate filters for different - * 'regions' of frequency axis. Due to convolution, same weights are - * used repeateadly, the final gradient is a sum of all - * position-specific gradients (the sum was found better than - * averaging). + * This component is designed to be used after a ConvolutionComponent + * so that the input matrix is propagated from a 2d-convolutional layer. + * This component implements 3d-maxpooling which performs + * max pooling along the three axes. + * Input : A matrix where each row is a vectorized 3D-tensor. + * The 3D tensor has dimensions + * x: (e.g. time) + * y: (e.g. frequency) + * z: (e.g. 
diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h
index a78f72c0afb..bb15afbcc39 100644
--- a/src/nnet3/nnet-simple-component.h
+++ b/src/nnet3/nnet-simple-component.h
@@ -6,6 +6,7 @@
 //                2014-2015  Vijayaditya Peddinti
 //                2014-2015  Guoguo Chen
 //                2015  Daniel Galvez
+//                2015  Tom Ko
 
 // See ../../COPYING for clarification regarding multiple authors
 //
@@ -1445,161 +1446,65 @@ class ConvolutionComponent: public UpdatableComponent {
 };
 
 /**
- * Convolutional1dComponent implements convolution over frequency axis.
- * We assume the input featrues are spliced, i.e. each frame is in
- * fact a set of stacked frames, where we can form patches which span
- * over several frequency bands and whole time axis. A patch is the
- * instance of a filter on a group of frequency bands and whole time
- * axis. Shifts of the filter generate patches.
+ * MaxpoolingComponent :
+ * The Maxpooling component was first used in ConvNets for selecting a
+ * representative activation in an area. It inspired the Maxout nonlinearity.
+ * Each output element of this component is the maximum of a block of
+ * input elements, where the block has a 3D dimension (pool_x_size_,
+ * pool_y_size_, pool_z_size_).
+ * Blocks could overlap if the shift value on any axis is smaller
+ * than its corresponding pool size (e.g. pool_x_step_ < pool_x_size_).
+ * If the shift values are equal to their pool size, there is no
+ * overlap; if they all equal 1, the blocks overlap to
+ * the greatest possible extent.
 *
- * The convolution is done over whole axis with same filter
- * coefficients, i.e. we don't use separate filters for different
- * 'regions' of frequency axis. Due to convolution, same weights are
- * used repeateadly, the final gradient is a sum of all
- * position-specific gradients (the sum was found better than
- * averaging).
+ * This component is designed to be used after a ConvolutionComponent
+ * so that the input matrix is propagated from a 2d-convolutional layer.
+ * This component implements 3d-maxpooling, which performs
+ * max pooling along the three axes.
+ * Input : A matrix where each row is a vectorized 3D-tensor.
+ *        The 3D tensor has dimensions
+ *        x: (e.g. time)
+ *        y: (e.g. frequency)
+ *        z: (e.g. channels like number of filters in the ConvolutionComponent)
 *
- * In order to have a fast implementations, the filters are
- * represented in vectorized form, where each rectangular filter
- * corresponds to a row in a matrix, where all the filters are
- * stored. The features are then re-shaped to a set of matrices, where
- * one matrix corresponds to single patch-position, where all the
- * filters get applied.
+ *        The component assumes input vectorizations of type zyx,
+ *        which is the default output vectorization type of a ConvolutionComponent.
+ *        e.g. for input vectorization of type zyx the input is vectorized by
+ *        spanning axes z, y and x of the tensor in that order.
+ *        Given 3d tensor A with sizes (2, 2, 2) along the three dimensions,
+ *        the zyx vectorized input looks like
+ *  A(0,0,0) A(0,0,1) A(0,1,0) A(0,1,1) A(1,0,0) A(1,0,1) A(1,1,0) A(1,1,1)
+ *
+ * Output : The output is also a 3D tensor vectorized in the zyx format.
+ *
+ * For information on the hyperparameters and parameters of this component see
+ * the variable declarations.
 *
- * The type of convolution is controled by hyperparameters:
- *   patch_dim_ ... frequency axis size of the patch
- *   patch_step_ ... size of shift in the convolution
- *   patch_stride_ ... shift for 2nd dim of a patch
- *                     (i.e. frame length before splicing)
- * For instance, for a convolutional component after raw input,
- * if the input is 36-dim fbank feature with delta of order 2
- * and spliced using +/- 5 frames of contexts, the convolutional
- * component takes the input as a 36 x 33 image. The patch_stride_
- * should be configured 36. If patch_step_ and patch_dim_ are
- * configured 1 and 7, the Convolutional1dComponent creates a
- * 2D filter of 7 x 33, such that the convolution is actually done
- * only along the frequency axis. Specifically, the convolutional
- * output along the frequency axis is (36 - 7) / 1 + 1 = 30, and
- * the convolutional output along the temporal axis is 33 - 33 + 1 = 1,
- * resulting in an output image of 30 x 1, which is called a feature map
- * in ConvNet. Then if the output-dim is set 3840, the constructor
- * would know there should be 3840 / 30 = 128 distinct filters,
- * which will create 128 feature maps of 30 x 1 for one frame of
- * input. The feature maps are vectorized as a 3840-dim row vector
- * in the output matrix of this component. For details on progatation
- * of Convolutional1dComponent, check the function definition.
 *
 */
-class Convolutional1dComponent: public UpdatableComponent {
+
+class MaxpoolingComponent: public Component {
  public:
-  Convolutional1dComponent();
+
+  MaxpoolingComponent(): input_x_dim_(0), input_y_dim_(0), input_z_dim_(0),
+                         pool_x_size_(0), pool_y_size_(0), pool_z_size_(0),
+                         pool_x_step_(0), pool_y_step_(0), pool_z_step_(0) { }
   // constructor using another component
-  Convolutional1dComponent(const Convolutional1dComponent &component);
-  // constructor using parameters
-  Convolutional1dComponent(const CuMatrixBase<BaseFloat> &filter_params,
-                           const CuVectorBase<BaseFloat> &bias_params,
-                           BaseFloat learning_rate);
+  MaxpoolingComponent(const MaxpoolingComponent &component):
+      input_x_dim_(component.input_x_dim_),
+      input_y_dim_(component.input_y_dim_),
+      input_z_dim_(component.input_z_dim_),
+      pool_x_size_(component.pool_x_size_),
+      pool_y_size_(component.pool_y_size_),
+      pool_z_size_(component.pool_z_size_),
+      pool_x_step_(component.pool_x_step_),
+      pool_y_step_(component.pool_y_step_),
+      pool_z_step_(component.pool_z_step_) { }
 
   virtual int32 InputDim() const;
   virtual int32 OutputDim() const;
-
-  virtual std::string Info() const;
-  virtual void InitFromConfig(ConfigLine *cfl);
-  virtual std::string Type() const { return "Convolutional1dComponent"; }
-  virtual int32 Properties() const {
-    return kSimpleComponent|kUpdatableComponent|kBackpropNeedsInput|
-        kBackpropAdds|kPropagateAdds;
-  }
-
-  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
-                         const CuMatrixBase<BaseFloat> &in,
-                         CuMatrixBase<BaseFloat> *out) const;
-  virtual void Backprop(const std::string &debug_info,
-                        const ComponentPrecomputedIndexes *indexes,
-                        const CuMatrixBase<BaseFloat> &in_value,
-                        const CuMatrixBase<BaseFloat> &, // out_value,
-                        const CuMatrixBase<BaseFloat> &out_deriv,
-                        Component *to_update_in,
-                        CuMatrixBase<BaseFloat> *in_deriv) const;
-
-  virtual void Read(std::istream &is, bool binary);
-  virtual void Write(std::ostream &os, bool binary) const;
-
-  virtual Component* Copy() const;
-
-  // Some functions from base-class UpdatableComponent.
-  virtual void Scale(BaseFloat scale);
-  virtual void Add(BaseFloat alpha, const Component &other);
-  virtual void SetZero(bool treat_as_gradient);
-  virtual void PerturbParams(BaseFloat stddev);
-  virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
-  virtual int32 NumParameters() const;
-  virtual void Vectorize(VectorBase<BaseFloat> *params) const;
-  virtual void UnVectorize(const VectorBase<BaseFloat> &params);
-
-  // Some functions that are specific to this class.
-  void SetParams(const VectorBase<BaseFloat> &bias,
-                 const MatrixBase<BaseFloat> &filter);
-  const CuVector<BaseFloat> &BiasParams() { return bias_params_; }
-  const CuMatrix<BaseFloat> &LinearParams() { return filter_params_; }
-  void Init(int32 input_dim, int32 output_dim,
-            int32 patch_dim, int32 patch_step, int32 patch_stride,
-            BaseFloat param_stddev, BaseFloat bias_stddev);
-  void Init(int32 patch_dim, int32 patch_step, int32 patch_stride,
-            std::string matrix_filename);
-
-  // resize the component, setting the parameters to zero, while
-  // leaving any other configuration values the same
-  void Resize(int32 input_dim, int32 output_dim);
-
-  void Update(const std::string &debug_info,
-              const CuMatrixBase<BaseFloat> &in_value,
-              const CuMatrixBase<BaseFloat> &out_deriv);
-
- private:
-  int32 patch_dim_;
-  int32 patch_step_;
-  int32 patch_stride_;
-
-  static void ReverseIndexes(const std::vector<int32> &forward_indexes,
-                             int32 input_dim,
-                             std::vector<std::vector<int32> > *backward_indexes);
-  static void RearrangeIndexes(const std::vector<std::vector<int32> > &in,
-                               std::vector<std::vector<int32> > *out);
-
-  const Convolutional1dComponent &operator = (const Convolutional1dComponent &other); // Disallow.
-  CuMatrix<BaseFloat> filter_params_;
-  CuVector<BaseFloat> bias_params_;
-  bool is_gradient_;
-};
-
-/**
- * MaxPoolingComponent :
- * Maxpooling component was firstly used in ConvNet for selecting an representative
- * activation in an area. It inspired Maxout nonlinearity.
- *
- * The input/output matrices are split to submatrices with width 'pool_stride_'.
- * For instance, a minibatch of 512 frames is propagated by a convolutional
- * layer, resulting in a 512 x 3840 input matrix for MaxpoolingComponent,
- * which is composed of 128 feature maps for each frame (128 x 30). If you want
- * a 3-to-1 maxpooling on each feature map, set 'pool_stride_' and 'pool_size_'
- * as 128 and 3 respectively. Maxpooling component would create an output
- * matrix of 512 x 1280. The 30 input neurons are grouped by a group size of 3, and
- * the maximum in a group is selected, creating a smaller feature map of 10.
- *
- * Our pooling does not supports overlaps, which simplifies the
- * implementation (and was not helpful for Ossama).
- */
-class MaxpoolingComponent: public Component {
- public:
-  explicit MaxpoolingComponent(int32 input_dim, int32 output_dim,
-                               int32 pool_size, int32 pool_stride) {
-    Init(input_dim, output_dim, pool_size, pool_stride);
-  }
-  MaxpoolingComponent(): input_dim_(0), output_dim_(0),
-                         pool_size_(0), pool_stride_(0) { }
-  virtual int32 InputDim() const { return input_dim_; }
-  virtual int32 OutputDim() const { return output_dim_; }
+  virtual void Check() const;
 
   virtual std::string Info() const;
   virtual void InitFromConfig(ConfigLine *cfl);
@@ -1625,23 +1530,35 @@ class MaxpoolingComponent: public Component {
   /// Write component to stream
   virtual void Write(std::ostream &os, bool binary) const;
 
-  // We don't implement InitFromConfig() at this level: child-class should do
-  // it.
-  virtual Component* Copy() const {
-    return new MaxpoolingComponent(input_dim_, output_dim_,
-                                   pool_size_, pool_stride_); }
+  virtual Component* Copy() const { return new MaxpoolingComponent(*this); }
 
-  // Some functions that are specific to this class
-  void Init(int32 input_dim, int32 output_dim,
-            int32 pool_size, int32 pool_stride);
+  void InputToInputPatches(const CuMatrixBase<BaseFloat>& in,
+                           CuMatrix<BaseFloat> *patches) const;
+  void InderivPatchesToInderiv(const CuMatrix<BaseFloat>& in_deriv_patches,
+                               CuMatrixBase<BaseFloat> *in_deriv) const;
 
  protected:
-  int32 input_dim_;
-  int32 output_dim_;
-  int32 pool_size_;
-  int32 pool_stride_;
+  int32 input_x_dim_;   // size of the input along x-axis
+                        // (e.g. number of time steps)
+  int32 input_y_dim_;   // size of input along y-axis
+                        // (e.g. number of mel-frequency bins)
+  int32 input_z_dim_;   // size of input along z-axis
+                        // (e.g. number of filters in the ConvolutionComponent)
+
+  int32 pool_x_size_;   // size of the pooling window along x-axis
+  int32 pool_y_size_;   // size of the pooling window along y-axis
+  int32 pool_z_size_;   // size of the pooling window along z-axis
+
+  int32 pool_x_step_;   // the number of steps taken along x-axis of input
+                        // before computing the next pool
+  int32 pool_y_step_;   // the number of steps taken along y-axis of input
+                        // before computing the next pool
+  int32 pool_z_step_;   // the number of steps taken along z-axis of input
+                        // before computing the next pool
 };
 
+
 /** CompositeComponent is components representing a sequence of
     [simple] components.  The config line would be something like the following
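[Editor's aside, illustrative only and not part of the patch; the function
name is ours. It restates the zyx vectorization assumed by the class above.]

  // Column index of element (x, y, z) of an x_dim by y_dim by z_dim tensor
  // inside a zyx-vectorized row (z varies fastest, x slowest).
  inline int ZyxIndex(int x, int y, int z, int y_dim, int z_dim) {
    return x * y_dim * z_dim + y * z_dim + z;
  }
  // For a (2, 2, 2) tensor this enumerates A(0,0,0), A(0,0,1), A(0,1,0),
  // A(0,1,1), A(1,0,0), A(1,0,1), A(1,1,0), A(1,1,1) in order, matching the
  // example in the class documentation.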
diff --git a/src/nnet3/nnet-test-utils.cc b/src/nnet3/nnet-test-utils.cc
index 97720a0f2c0..cce697fa355 100644
--- a/src/nnet3/nnet-test-utils.cc
+++ b/src/nnet3/nnet-test-utils.cc
@@ -614,38 +614,6 @@ void GenerateConfigSequenceCnn(
     std::vector<std::string> *configs) {
   std::ostringstream os;
 
-  int32 pool_stride = 5 + Rand() % 10, pool_size = 2 + Rand() % 3,
-      num_pools = 1 + Rand() % 10;
-  int32 num_patches = num_pools * pool_size;
-  int32 patch_step = 1 + Rand() % 4, patch_dim = 4 + Rand () % 5,
-      patch_stride = (num_patches - 1) * patch_step + patch_dim;
-  int32 num_splice = 5 + Rand() % 10, num_filters = pool_stride;
-
-  int32 input_dim = patch_stride * num_splice,
-      hidden_dim = num_patches * num_filters,
-      output_dim = num_pools * pool_stride;
-
-  os << "component name=conv type=Convolutional1dComponent input-dim="
-     << input_dim << " output-dim=" << hidden_dim
-     << " patch-dim=" << patch_dim << " patch-step=" << patch_step
-     << " patch-stride=" << patch_stride << std::endl;
-  os << "component name=maxpooling type=MaxpoolingComponent input-dim="
-     << hidden_dim << " output-dim=" << output_dim
-     << " pool-size=" << pool_size << " pool-stride=" << pool_stride
-     << std::endl;
-
-  os << "input-node name=input dim=" << input_dim << std::endl;
-  os << "component-node name=conv_node component=conv input=input\n";
-  os << "component-node name=maxpooling_node component=maxpooling input=conv_node\n";
-  os << "output-node name=output input=maxpooling_node\n";
-  configs->push_back(os.str());
-}
-
-void GenerateConfigSequenceCnn2d(
-    const NnetGenerationOptions &opts,
-    std::vector<std::string> *configs) {
-  std::ostringstream os;
-
   int32 input_x_dim = 10 + Rand() % 20,
         input_y_dim = 10 + Rand() % 20,
@@ -682,8 +650,40 @@ void GenerateConfigSequenceCnn2d(
      << " input-vectorization-order=" << vectorization
      << std::endl;
 
+  int32 conv_output_x_dim = (1 + (input_x_dim - filt_x_dim) / filt_x_step);
+  int32 conv_output_y_dim = (1 + (input_y_dim - filt_y_dim) / filt_y_step);
+  int32 conv_output_z_dim = num_filters;
+  int32 pool_x_size = 1 + Rand() % conv_output_x_dim;
+  int32 pool_y_size = 1 + Rand() % conv_output_y_dim;
+  int32 pool_z_size = 1 + Rand() % conv_output_z_dim;
+  int32 pool_x_step = 1;
+  int32 pool_y_step = 1;
+  int32 pool_z_step = 1;
+  do {
+    pool_x_step = (1 + Rand() % pool_x_size);
+  } while((conv_output_x_dim - pool_x_size) % pool_x_step);
+  do {
+    pool_y_step = (1 + Rand() % pool_y_size);
+  } while((conv_output_y_dim - pool_y_size) % pool_y_step);
+  do {
+    pool_z_step = (1 + Rand() % pool_z_size);
+  } while((conv_output_z_dim - pool_z_size) % pool_z_step);
+
+  os << "component name=maxpooling type=MaxpoolingComponent "
+     << " input-x-dim=" << conv_output_x_dim
+     << " input-y-dim=" << conv_output_y_dim
+     << " input-z-dim=" << conv_output_z_dim
+     << " pool-x-size=" << pool_x_size
+     << " pool-y-size=" << pool_y_size
+     << " pool-z-size=" << pool_z_size
+     << " pool-x-step=" << pool_x_step
+     << " pool-y-step=" << pool_y_step
+     << " pool-z-step=" << pool_z_step
+     << std::endl;
+
   os << "input-node name=input dim=" << (input_x_dim * input_y_dim * input_z_dim) << std::endl;
   os << "component-node name=conv_node component=conv input=input\n";
+  os << "component-node name=maxpooling_node component=maxpooling input=conv_node\n";
-  os << "output-node name=output input=conv_node\n";
+  os << "output-node name=output input=maxpooling_node\n";
   configs->push_back(os.str());
 }
@@ -718,7 +718,7 @@ void GenerateConfigSequence(
     const NnetGenerationOptions &opts,
     std::vector<std::string> *configs) {
 start:
-  int32 network_type = RandInt(0, 9);
+  int32 network_type = RandInt(0, 8);
   switch(network_type) {
     case 0:
       GenerateConfigSequenceSimplest(opts, configs);
@@ -763,11 +763,6 @@ void GenerateConfigSequence(
       GenerateConfigSequenceCnn(opts, configs);
       break;
     case 8:
-      if (!opts.allow_nonlinearity)
-        goto start;
-      GenerateConfigSequenceCnn2d(opts, configs);
-      break;
-    case 9:
       GenerateConfigSequenceDistribute(opts, configs);
       break;
     default:
@@ -834,7 +829,7 @@ void ComputeExampleComputationRequestSimple(
 
 static void GenerateRandomComponentConfig(std::string *component_type,
                                           std::string *config) {
-  int32 n = RandInt(0, 27);
+  int32 n = RandInt(0, 26);
   BaseFloat learning_rate = 0.001 * RandInt(1, 3);
 
   std::ostringstream os;
@@ -945,25 +940,6 @@ static void GenerateRandomComponentConfig(std::string *component_type,
       break;
     }
     case 17: {
-      *component_type = "Convolutional1dComponent";
-      int32 patch_stride = 10 + Rand() % 50, patch_step = 1 + Rand() % 4,
-          patch_dim = 4 + Rand () % 5;
-
-      // decrease patch_stride so that
-      // (patch_stride - patch_dim) % patch_step == 0
-      patch_stride = patch_stride - ((patch_stride - patch_dim) % patch_step);
-
-      int32 num_patches = 1 + (patch_stride - patch_dim) / patch_step;
-      int32 num_splice = 5 + Rand() % 10, num_filters = 5 + Rand() % 10;
-      int32 input_dim = patch_stride * num_splice;
-      int32 output_dim = num_patches * num_filters;
-      os << "input-dim=" << input_dim << " output-dim=" << output_dim
-         << " patch-dim=" << patch_dim << " patch-step=" << patch_step
-         << " patch-stride=" << patch_stride
-         << " learning-rate=" << learning_rate;
-      break;
-    }
-    case 18: {
       int32 input_vectorization = Rand() % 2;
       std::string vectorization;
      if (input_vectorization == 0) {
@@ -1001,19 +977,7 @@ static void GenerateRandomComponentConfig(std::string *component_type,
       // TODO : add test for file based initialization. But confirm how to write
       // a file which is not going to be overwritten by other components
     }
-    case 19: {
-      *component_type = "MaxpoolingComponent";
-      int32 pool_stride = 5 + Rand() % 10,
-          pool_size = 2 + Rand() % 3,
-          num_pools = 1 + Rand() % 10;
-      int32 output_dim = num_pools * pool_stride;
-      int32 num_patches = num_pools * pool_size;
-      int32 input_dim = pool_stride * num_patches;
-      os << "input-dim=" << input_dim << " output-dim=" << output_dim
-         << " pool-size=" << pool_size << " pool-stride=" << pool_stride;
-      break;
-    }
-    case 20: {
+    case 18: {
       *component_type = "PermuteComponent";
       int32 input_dim = 10 + Rand() % 100;
       std::vector<int32> column_map(input_dim);
@@ -1027,7 +991,7 @@ static void GenerateRandomComponentConfig(std::string *component_type,
       os << "column-map=" << buffer.str();
       break;
     }
-    case 21: {
+    case 19: {
       *component_type = "PerElementOffsetComponent";
      std::string param_config = RandInt(0, 1)?
                                  " param-mean=0.0 param-stddev=0.0":
                                  " param-mean=1.0 param-stddev=1.0";
      os << "dim=" << RandInt(1, 100)
         << " learning-rate=" << learning_rate << param_config;
       break;
     }
-    case 22: {
+    case 20: {
       *component_type = "SumReduceComponent";
       int32 output_dim = RandInt(1, 50), group_size = RandInt(1, 15),
           input_dim = output_dim * group_size;
       os << "input-dim=" << input_dim << " output-dim=" << output_dim;
       break;
     }
-    case 23: {
+    case 21: {
       *component_type = "CompositeComponent";
       int32 cur_dim = RandInt(20, 30), num_components = RandInt(1, 3),
           max_rows_process = RandInt(1, 30);
       os << "num-components=" << num_components
          << " max-rows-process=" << max_rows_process;
      // (Editorial note: the loop generating the sub-component configs was
      // garbled in extraction; restored here from the surrounding context.)
      for (int32 i = 1; i <= num_components; i++) {
        os << " component" << i << "='type=RectifiedLinearComponent dim="
           << cur_dim << "'";
      }
       break;
     }
-    case 24: {
+    case 22: {
       *component_type = "SumGroupComponent";
       int32 num_groups = RandInt(1, 50),
         input_dim = num_groups * RandInt(1, 15);
       os << "input-dim=" << input_dim << " output-dim=" << num_groups;
       break;
     }
-    case 25: {
+    case 23: {
       *component_type = "RepeatedAffineComponent";
       int32 num_repeats = RandInt(1, 50),
           input_dim = num_repeats * RandInt(1, 15),
           output_dim = num_repeats * RandInt(1, 15);
       os << "input-dim=" << input_dim << " output-dim=" << output_dim
          << " num-repeats=" << num_repeats;
       break;
     }
-    case 26: {
+    case 24: {
       *component_type = "BlockAffineComponent";
       int32 num_blocks = RandInt(1, 50),
           input_dim = num_blocks * RandInt(1, 15),
           output_dim = num_blocks * RandInt(1, 15);
       os << "input-dim=" << input_dim << " output-dim=" << output_dim
          << " num-blocks=" << num_blocks;
       break;
     }
-    case 27: {
+    case 25: {
       *component_type = "NaturalGradientRepeatedAffineComponent";
       int32 num_repeats = RandInt(1, 50),
           input_dim = num_repeats * RandInt(1, 15),
           output_dim = num_repeats * RandInt(1, 15);
       os << "input-dim=" << input_dim << " output-dim=" << output_dim
@@ -1100,6 +1064,35 @@ static void GenerateRandomComponentConfig(std::string *component_type,
          << " num-repeats=" << num_repeats;
       break;
     }
+    case 26: {
+      *component_type = "MaxpoolingComponent";
+      int32 input_x_dim = 5 + Rand() % 10,
+          input_y_dim = 5 + Rand() % 10,
+          input_z_dim = 5 + Rand() % 10;
+      int32 pool_x_size = 1 + Rand() % input_x_dim,
+          pool_y_size = 1 + Rand() % input_y_dim,
+          pool_z_size = 1 + Rand() % input_z_dim;
+      int32 pool_x_step = (1 + Rand() % pool_x_size),
+          pool_y_step = (1 + Rand() % pool_y_size),
+          pool_z_step = (1 + Rand() % pool_z_size);
+      // adjusting input dim to ensure divisibility
+      int32 remainder = (input_x_dim - pool_x_size) % pool_x_step;
+      input_x_dim = input_x_dim - remainder;
+      remainder = (input_y_dim - pool_y_size) % pool_y_step;
+      input_y_dim = input_y_dim - remainder;
+      remainder = (input_z_dim - pool_z_size) % pool_z_step;
+      input_z_dim = input_z_dim - remainder;
+      os << " input-x-dim=" << input_x_dim
+         << " input-y-dim=" << input_y_dim
+         << " input-z-dim=" << input_z_dim
+         << " pool-x-size=" << pool_x_size
+         << " pool-y-size=" << pool_y_size
+         << " pool-z-size=" << pool_z_size
+         << " pool-x-step=" << pool_x_step
+         << " pool-y-step=" << pool_y_step
+         << " pool-z-step=" << pool_z_step;
+      break;
+    }
     default:
       KALDI_ERR << "Error generating random component";
   }
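[Editor's appendix, illustrative only: a dependency-free C++ sketch of the
indexing scheme the patch implements. All names here are ours; the real
component operates on CuMatrix rows and evaluates the same formulas on the
GPU via the patch matrix.]

  #include <cassert>
  #include <cstdio>
  #include <vector>

  // Max-pool one zyx-vectorized tensor, mirroring MaxpoolingComponent: for
  // every local window coordinate (x, y, z) and every pool (xp, yp, zp),
  // read the input element the patch column map would select, and keep the
  // running maximum per pool.
  std::vector<float> MaxPool3d(const std::vector<float> &in,
                               int in_x, int in_y, int in_z,
                               int sz_x, int sz_y, int sz_z,
                               int st_x, int st_y, int st_z) {
    assert((in_x - sz_x) % st_x == 0 &&
           (in_y - sz_y) % st_y == 0 &&
           (in_z - sz_z) % st_z == 0);  // same divisibility rule as Check()
    int np_x = 1 + (in_x - sz_x) / st_x,
        np_y = 1 + (in_y - sz_y) / st_y,
        np_z = 1 + (in_z - sz_z) / st_z;
    std::vector<float> out(np_x * np_y * np_z, -1e20f);
    for (int x = 0; x < sz_x; x++)
      for (int y = 0; y < sz_y; y++)
        for (int z = 0; z < sz_z; z++)
          for (int xp = 0; xp < np_x; xp++)
            for (int yp = 0; yp < np_y; yp++)
              for (int zp = 0; zp < np_z; zp++) {
                // same column-map formula as InputToInputPatches()
                int src = (xp * st_x + x) * in_y * in_z +
                          (yp * st_y + y) * in_z +
                          (zp * st_z + z);
                int dst = xp * np_y * np_z + yp * np_z + zp;
                if (in[src] > out[dst]) out[dst] = in[src];
              }
    return out;
  }

  int main() {
    // A 2x2x2 tensor pooled by one 2x2x2 window: the output is the global max.
    std::vector<float> a = {1, 5, 3, 2, 8, 4, 7, 6};
    std::vector<float> m = MaxPool3d(a, 2, 2, 2, 2, 2, 2, 1, 1, 1);
    std::printf("%g\n", m[0]);  // prints 8
    return 0;
  }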