diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc
index d73f86d7542..f7d26b62311 100644
--- a/src/nnet3/nnet-component-itf.cc
+++ b/src/nnet3/nnet-component-itf.cc
@@ -90,8 +90,6 @@ Component* Component::NewComponentOfType(const std::string &component_type) {
     ans = new ClipGradientComponent();
   } else if (component_type == "ElementwiseProductComponent") {
     ans = new ElementwiseProductComponent();
-  } else if (component_type == "Convolutional1dComponent") {
-    ans = new Convolutional1dComponent();
   } else if (component_type == "ConvolutionComponent") {
     ans = new ConvolutionComponent();
   } else if (component_type == "MaxpoolingComponent") {
diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc
index c66d0235a93..3d7c56824e1 100644
--- a/src/nnet3/nnet-simple-component.cc
+++ b/src/nnet3/nnet-simple-component.cc
@@ -3552,645 +3552,176 @@ void ConvolutionComponent::UnVectorize(const VectorBase<BaseFloat> &params) {
   bias_params_.CopyFromVec(params.Range(num_filter_params, bias_params_.Dim()));
 }
 
-Convolutional1dComponent::Convolutional1dComponent():
-    UpdatableComponent(),
-    patch_dim_(0), patch_step_(0), patch_stride_(0), is_gradient_(false) {}
-
-Convolutional1dComponent::Convolutional1dComponent(const Convolutional1dComponent &component):
-    UpdatableComponent(component),
-    filter_params_(component.filter_params_),
-    bias_params_(component.bias_params_),
-    is_gradient_(component.is_gradient_) {}
-
-Convolutional1dComponent::Convolutional1dComponent(const CuMatrixBase<BaseFloat> &filter_params,
-                                                   const CuVectorBase<BaseFloat> &bias_params,
-                                                   BaseFloat learning_rate):
-    filter_params_(filter_params),
-    bias_params_(bias_params) {
-  SetLearningRate(learning_rate);
-  KALDI_ASSERT(filter_params.NumRows() == bias_params.Dim() &&
-               bias_params.Dim() != 0);
-  is_gradient_ = false;
-}
-
 // acquire input dim
-int32 Convolutional1dComponent::InputDim() const {
-  int32 filter_dim = filter_params_.NumCols();
-  int32 num_splice = filter_dim / patch_dim_;
-  return patch_stride_ * num_splice;
+int32 MaxpoolingComponent::InputDim() const {
+  return input_x_dim_ * input_y_dim_ * input_z_dim_;
 }
 
 // acquire output dim
-int32 Convolutional1dComponent::OutputDim() const {
-  int32 num_filters = filter_params_.NumRows();
-  int32 num_patches = 1 + (patch_stride_ - patch_dim_) / patch_step_;
-  return num_patches * num_filters;
-}
-
-// initialize the component using hyperparameters
-void Convolutional1dComponent::Init(int32 input_dim, int32 output_dim,
-                                    int32 patch_dim, int32 patch_step, int32 patch_stride,
-                                    BaseFloat param_stddev, BaseFloat bias_stddev) {
-  patch_dim_ = patch_dim;
-  patch_step_ = patch_step;
-  patch_stride_ = patch_stride;
-  int32 num_splice = input_dim / patch_stride;
-  int32 filter_dim = num_splice * patch_dim;
-  int32 num_patches = 1 + (patch_stride - patch_dim) / patch_step;
-  int32 num_filters = output_dim / num_patches;
-  KALDI_ASSERT(input_dim % patch_stride == 0);
-  KALDI_ASSERT((patch_stride - patch_dim) % patch_step == 0);
-  KALDI_ASSERT(output_dim % num_patches == 0);
-
-  filter_params_.Resize(num_filters, filter_dim);
-  bias_params_.Resize(num_filters);
-  KALDI_ASSERT(param_stddev >= 0.0 && bias_stddev >= 0.0);
-  filter_params_.SetRandn();
-  filter_params_.Scale(param_stddev);
-  bias_params_.SetRandn();
-  bias_params_.Scale(bias_stddev);
-}
-
-// initialize the component using predefined matrix file
-void Convolutional1dComponent::Init(int32 patch_dim, int32 patch_step, int32 patch_stride,
-                                    std::string matrix_filename) {
-  patch_dim_ = patch_dim;
-  patch_step_ = patch_step;
-  patch_stride_ = patch_stride;
-  CuMatrix<BaseFloat> mat;
-  ReadKaldiObject(matrix_filename, &mat);
-  KALDI_ASSERT(mat.NumCols() >= 2);
-  int32 filter_dim = mat.NumCols() - 1, num_filters = mat.NumRows();
-  filter_params_.Resize(num_filters, filter_dim);
-  bias_params_.Resize(num_filters);
-  filter_params_.CopyFromMat(mat.Range(0, num_filters, 0, filter_dim));
-  bias_params_.CopyColFromMat(mat, filter_dim);
-}
-
-// resize the component, setting the parameters to zero, while
-// leaving any other configuration values the same
-void Convolutional1dComponent::Resize(int32 input_dim, int32 output_dim) {
-  KALDI_ASSERT(input_dim > 0 && output_dim > 0);
-  int32 num_splice = input_dim / patch_stride_;
-  int32 filter_dim = num_splice * patch_dim_;
-  int32 num_patches = 1 + (patch_stride_ - patch_dim_) / patch_step_;
-  int32 num_filters = output_dim / num_patches;
-  KALDI_ASSERT(input_dim % patch_stride_ == 0);
-  KALDI_ASSERT((patch_stride_ - patch_dim_) % patch_step_ == 0);
-  KALDI_ASSERT(output_dim % num_patches == 0);
-  filter_params_.Resize(num_filters, filter_dim);
-  bias_params_.Resize(num_filters);
-}
-
-// display information about component
-std::string Convolutional1dComponent::Info() const {
-  std::ostringstream stream;
-  int32 num_splice = InputDim() / patch_stride_;
-  int32 filter_dim = num_splice * patch_dim_;
-  int32 num_patches = 1 + (patch_stride_ - patch_dim_) / patch_step_;
-  int32 num_filters = OutputDim() / num_patches;
-
-  stream << UpdatableComponent::Info()
-         << ", num-splice=" << num_splice
-         << ", num-patches=" << num_patches
-         << ", num-filters=" << num_filters
-         << ", filter-dim=" << filter_dim;
-  PrintParameterStats(stream, "filter-params", filter_params_);
-  PrintParameterStats(stream, "bias-params", bias_params_, true);
-  return stream.str();
+int32 MaxpoolingComponent::OutputDim() const {
+  int32 num_pools_x = 1 + (input_x_dim_ - pool_x_size_) / pool_x_step_;
+  int32 num_pools_y = 1 + (input_y_dim_ - pool_y_size_) / pool_y_step_;
+  int32 num_pools_z = 1 + (input_z_dim_ - pool_z_size_) / pool_z_step_;
+  return num_pools_x * num_pools_y * num_pools_z;
+}
+
+// check the component parameters
+void MaxpoolingComponent::Check() const {
+  // sanity check of the max pooling parameters
+  KALDI_ASSERT(input_x_dim_ > 0);
+  KALDI_ASSERT(input_y_dim_ > 0);
+  KALDI_ASSERT(input_z_dim_ > 0);
+  KALDI_ASSERT(pool_x_size_ > 0);
+  KALDI_ASSERT(pool_y_size_ > 0);
+  KALDI_ASSERT(pool_z_size_ > 0);
+  KALDI_ASSERT(pool_x_step_ > 0);
+  KALDI_ASSERT(pool_y_step_ > 0);
+  KALDI_ASSERT(pool_z_step_ > 0);
+  KALDI_ASSERT(input_x_dim_ >= pool_x_size_);
+  KALDI_ASSERT(input_y_dim_ >= pool_y_size_);
+  KALDI_ASSERT(input_z_dim_ >= pool_z_size_);
+  KALDI_ASSERT(pool_x_size_ >= pool_x_step_);
+  KALDI_ASSERT(pool_y_size_ >= pool_y_step_);
+  KALDI_ASSERT(pool_z_size_ >= pool_z_step_);
+  KALDI_ASSERT((input_x_dim_ - pool_x_size_) % pool_x_step_ == 0);
+  KALDI_ASSERT((input_y_dim_ - pool_y_size_) % pool_y_step_ == 0);
+  KALDI_ASSERT((input_z_dim_ - pool_z_size_) % pool_z_step_ == 0);
 }
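+// Worked example added in editing (illustrative only, not part of the
+// original patch): with input-x-dim=10, pool-x-size=4 and pool-x-step=2,
+// Check() passes because (10 - 4) % 2 == 0, and OutputDim() counts
+// 1 + (10 - 4) / 2 = 4 pools along the x-axis; y and z work the same way.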
- << " Please use ConvolutionComponent."; +void MaxpoolingComponent::InitFromConfig(ConfigLine *cfl) { bool ok = true; - std::string matrix_filename; - int32 input_dim = -1, output_dim = -1; - int32 patch_dim = -1, patch_step = -1, patch_stride = -1; - InitLearningRatesFromConfig(cfl); - ok = ok && cfl->GetValue("patch-dim", &patch_dim); - ok = ok && cfl->GetValue("patch-step", &patch_step); - ok = ok && cfl->GetValue("patch-stride", &patch_stride); - if (cfl->GetValue("matrix", &matrix_filename)) { - // initialize from prefined parameter matrix - Init(patch_dim, patch_step, patch_stride, matrix_filename); - if (cfl->GetValue("input-dim", &input_dim)) - KALDI_ASSERT(input_dim == InputDim() && - "input-dim mismatch vs. matrix."); - if (cfl->GetValue("output-dim", &output_dim)) - KALDI_ASSERT(output_dim == OutputDim() && - "output-dim mismatch vs. matrix."); - } else { - ok = ok && cfl->GetValue("input-dim", &input_dim); - ok = ok && cfl->GetValue("output-dim", &output_dim); - // initialize from configuration - BaseFloat param_stddev = 1.0 / std::sqrt(input_dim), bias_stddev = 1.0; - cfl->GetValue("param-stddev", ¶m_stddev); - cfl->GetValue("bias-stddev", &bias_stddev); - Init(input_dim, output_dim, patch_dim, patch_step, patch_stride, - param_stddev, bias_stddev); - } + + ok = ok && cfl->GetValue("input-x-dim", &input_x_dim_); + ok = ok && cfl->GetValue("input-y-dim", &input_y_dim_); + ok = ok && cfl->GetValue("input-z-dim", &input_z_dim_); + ok = ok && cfl->GetValue("pool-x-size", &pool_x_size_); + ok = ok && cfl->GetValue("pool-y-size", &pool_y_size_); + ok = ok && cfl->GetValue("pool-z-size", &pool_z_size_); + ok = ok && cfl->GetValue("pool-x-step", &pool_x_step_); + ok = ok && cfl->GetValue("pool-y-step", &pool_y_step_); + ok = ok && cfl->GetValue("pool-z-step", &pool_z_step_); + if (cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " - << cfl->UnusedValues(); + << cfl->UnusedValues(); if (!ok) KALDI_ERR << "Bad initializer " << cfl->WholeLine(); + + Check(); } -// propagation function +// Method to convert from a matrix representing a minibatch of vectorized +// 3D tensors to patches for 3d max pooling, each patch corresponds to +// the nodes having the same local coordinatenodes from each pool +void MaxpoolingComponent::InputToInputPatches( + const CuMatrixBase& in, + CuMatrix *patches) const{ + int32 num_pools_x = 1 + (input_x_dim_ - pool_x_size_) / pool_x_step_; + int32 num_pools_y = 1 + (input_y_dim_ - pool_y_size_) / pool_y_step_; + int32 num_pools_z = 1 + (input_z_dim_ - pool_z_size_) / pool_z_step_; -/* Convolutional propagation is explained: - - Recall the AffineComponent, input X is defined #frames x $input-dim, - linear matrix A is defined $output-dim x $input-dim, and bias - vector B is defined by length $output-dim. The propagation is - Y = X * A' + B (1) - where "*" is row-by-row processing of X, executing vector-matrix - multiplication - Y(t) = X(t) * A' + B (2) - which converts each row of input of dim $input-dim to a row of output of - dim $output-dim by A' (' defines transpose). - - In Convolution1dComponent, A is redefined $num-filters x $filter-dim, - and bias vector B is redefined by length $num-filters. The propatation is - Y = X o A' + B (3) - where "o" is also row-by-row processing of X, but executing vector-matrix - convolution, which consists of a group of vector-vector convolutions. 
-void Convolutional1dComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
-                                         const CuMatrixBase<BaseFloat> &in,
-                                         CuMatrixBase<BaseFloat> *out) const {
-  // dims
-  int32 num_splice = InputDim() / patch_stride_;
-  int32 num_patches = 1 + (patch_stride_ - patch_dim_) / patch_step_;
-  int32 num_filters = filter_params_.NumRows();
-  int32 num_frames = in.NumRows();
-  int32 filter_dim = filter_params_.NumCols();
-
-  /** Buffer of reshaped inputs:
-   *  1row = vectorized rectangular feature patches
-   *  1col = dim over speech frames,
-   */
-  CuMatrix<BaseFloat> patches(num_frames, filter_dim * num_patches, kUndefined);
-  // column_map is indexed by the column-index of "patches",
-  // and the value is the corresponding column-index of "in".
-  std::vector<int32> column_map(filter_dim * num_patches);
-
-  // build-up a column selection map
-  for (int32 p = 0, index = 0; p < num_patches; p++) {
-    for (int32 s = 0; s < num_splice; s++) {
-      for (int32 d = 0; d < patch_dim_; d++, index++) {
-        column_map[index] = p * patch_step_ + s * patch_stride_ + d;
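+  // Editorial note (illustrative, not part of the original patch): column
+  // 'index' of *patches is filled from column 'column_map[index]' of 'in'.
+  // The columns are ordered with the local window coordinate (x, y, z)
+  // outermost and the pool index innermost, so that
+  // patches.ColRange(q * num_pools, num_pools) later selects, for local
+  // coordinate q, the matching member of every pool.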
+  std::vector<int32> column_map(patches->NumCols());
+  int32 column_map_size = column_map.size();
+  for (int32 x = 0, index = 0; x < pool_x_size_; x++) {
+    for (int32 y = 0; y < pool_y_size_; y++) {
+      for (int32 z = 0; z < pool_z_size_; z++) {
+        // given the local node coordinate, group them from each pool
+        // to form a patch
+        for (int32 x_pool = 0; x_pool < num_pools_x; x_pool++) {
+          for (int32 y_pool = 0; y_pool < num_pools_y; y_pool++) {
+            for (int32 z_pool = 0; z_pool < num_pools_z; z_pool++, index++) {
+              KALDI_ASSERT(index < column_map_size);
+              column_map[index] = (x_pool * pool_x_step_ + x) * input_y_dim_ * input_z_dim_ +
+                                  (y_pool * pool_y_step_ + y) * input_z_dim_ +
+                                  (z_pool * pool_z_step_ + z);
+            }
+          }
+        }
       }
     }
   }
   CuArray<int32> cu_cols(column_map);
-  patches.CopyCols(in, cu_cols);
-
-  //
-  // compute filter activations
-  //
-
-  std::vector<CuSubMatrix<BaseFloat>* > tgt_batch, patch_batch, filter_params_batch;
-
-  CuSubMatrix<BaseFloat>* filter_params_elem = new CuSubMatrix<BaseFloat>(
-      filter_params_, 0, filter_params_.NumRows(), 0,
-      filter_params_.NumCols());
-
-  // form batch in vector container
-  for (int32 p = 0; p < num_patches; p++) {
-    // form batch in vector container. for filter_params_batch, all elements
-    // point to the same copy filter_params_elem
-    tgt_batch.push_back(new CuSubMatrix<BaseFloat>(out->ColRange(p * num_filters,
-                                                                 num_filters)));
-    patch_batch.push_back(new CuSubMatrix<BaseFloat>(patches.ColRange(p * filter_dim,
-                                                                      filter_dim)));
-    filter_params_batch.push_back(filter_params_elem);
-
-    tgt_batch[p]->AddVecToRows(1.0, bias_params_, 1.0); // add bias
-  }
-
-  // apply all filters
-  AddMatMatBatched(1.0, tgt_batch, patch_batch, kNoTrans,
-                   filter_params_batch, kTrans, 1.0);
-
-  // release memory
-  delete filter_params_elem;
-  for (int32 p = 0; p < num_patches; p++) {
-    delete tgt_batch[p];
-    delete patch_batch[p];
-  }
-}
-
-// scale the parameters
-void Convolutional1dComponent::Scale(BaseFloat scale) {
-  filter_params_.Scale(scale);
-  bias_params_.Scale(scale);
-}
-
-// add another convolution component
-void Convolutional1dComponent::Add(BaseFloat alpha, const Component &other_in) {
-  const Convolutional1dComponent *other =
-      dynamic_cast<const Convolutional1dComponent*>(&other_in);
-  KALDI_ASSERT(other != NULL);
-  filter_params_.AddMat(alpha, other->filter_params_);
-  bias_params_.AddVec(alpha, other->bias_params_);
+  patches->CopyCols(in, cu_cols);
 }
 
 /*
-  This function does an operation similar to reversing a map,
-  except it handles maps that are not one-to-one by outputting
-  the reversed map as a vector of lists.
-  @param[in] forward_indexes is a vector of int32, each of whose
-             elements is between 0 and input_dim - 1.
-  @param[in] input_dim. See definitions of forward_indexes and
-             backward_indexes.
-  @param[out] backward_indexes is a vector of dimension input_dim
-              of lists, The list at (backward_indexes[i]) is a list
-              of all indexes j such that forward_indexes[j] = i.
+  This is the 3d max pooling propagate function.
+  It is assumed that each row of the input matrix
+  is a vectorized 3D-tensor of type zyx.
+  Similar to the propagate function of ConvolutionComponent,
+  the input matrix is first arranged into patches so that
+  pools (with / without overlapping) could be
+  processed in a parallelizable manner.
+  The output matrix is also a vectorized 3D-tensor of type zyx.
 */
-void Convolutional1dComponent::ReverseIndexes(const std::vector<int32> &forward_indexes,
-                                              int32 input_dim,
-                                              std::vector<std::vector<int32> > *backward_indexes) {
-  int32 i, size = forward_indexes.size();
-  int32 reserve_size = 2 + size / input_dim;
-  backward_indexes->resize(input_dim);
-  std::vector<std::vector<int32> >::iterator iter = backward_indexes->begin(),
-      end = backward_indexes->end();
-  for (; iter != end; ++iter)
-    iter->reserve(reserve_size);
-  for (int32 j = 0; j < forward_indexes.size(); j++) {
-    i = forward_indexes[j];
-    KALDI_ASSERT(i < input_dim);
-    (*backward_indexes)[i].push_back(j);
-  }
-}
-/*
-  This function transforms a vector of lists into a list of vectors,
-  padded with -1.
-  @param[in] The input vector of lists. Let in.size() be D, and let
-             the longest list length (i.e. the max of in[i].size()) be L.
-  @param[out] The output list of vectors. The length of the list will
-              be L, each vector-dimension will be D (i.e. out[i].size() == D),
-              and if in[i] == j, then for some k we will have that
-              out[k][j] = i. The output vectors are padded with -1
-              where necessary if not all the input lists have the same side.
-*/
-void Convolutional1dComponent::RearrangeIndexes(const std::vector<std::vector<int32> > &in,
-                                                std::vector<std::vector<int32> > *out) {
-  int32 D = in.size();
-  int32 L = 0;
-  for (int32 i = 0; i < D; i++)
-    if (in[i].size() > L)
-      L = in[i].size();
-  out->resize(L);
-  for (int32 i = 0; i < L; i++)
-    (*out)[i].resize(D, -1);
-  for (int32 i = 0; i < D; i++) {
-    for (int32 j = 0; j < in[i].size(); j++) {
-      (*out)[j][i] = in[i][j];
-    }
-  }
-}
-
-
-// back propagation function
-void Convolutional1dComponent::Backprop(const std::string &debug_info,
-                                        const ComponentPrecomputedIndexes *indexes,
-                                        const CuMatrixBase<BaseFloat> &in_value,
-                                        const CuMatrixBase<BaseFloat> &, // out_value,
-                                        const CuMatrixBase<BaseFloat> &out_deriv,
-                                        Component *to_update_in,
-                                        CuMatrixBase<BaseFloat> *in_deriv) const {
-  Convolutional1dComponent *to_update =
-      dynamic_cast<Convolutional1dComponent*>(to_update_in);
-  int32 num_splice = InputDim() / patch_stride_;
-  int32 num_patches = 1 + (patch_stride_ - patch_dim_) / patch_step_;
-  int32 num_filters = filter_params_.NumRows();
-  int32 num_frames = out_deriv.NumRows();
-  int32 filter_dim = filter_params_.NumCols();
-
-  /** Buffer for backpropagation:
-   *  derivatives in the domain of 'patches_',
-   *  1row = vectorized rectangular feature patches,
-   *  1col = dim over speech frames,
-   */
-  CuMatrix<BaseFloat> patches_deriv(num_frames, filter_dim * num_patches, kSetZero);
-
-  //
-  // backpropagate to vector of matrices
-  // (corresponding to position of a filter)
-  //
-  std::vector<CuSubMatrix<BaseFloat>* > patch_deriv_batch, out_deriv_batch,
-      filter_params_batch;
-
-  CuSubMatrix<BaseFloat>* filter_params_elem = new CuSubMatrix<BaseFloat>(
-      filter_params_, 0, filter_params_.NumRows(), 0,
-      filter_params_.NumCols());
-
-  // form batch in vector container
-  for (int32 p = 0; p < num_patches; p++) {
-    // form batch in vector container. for filter_params_batch, all elements
-    // point to the same copy filter_params_elem
-    patch_deriv_batch.push_back(new CuSubMatrix<BaseFloat>(patches_deriv.ColRange(
-        p * filter_dim, filter_dim)));
-    out_deriv_batch.push_back(new CuSubMatrix<BaseFloat>(out_deriv.ColRange(
-        p * num_filters, num_filters)));
-    filter_params_batch.push_back(filter_params_elem);
-  }
-  AddMatMatBatched(1.0, patch_deriv_batch, out_deriv_batch, kNoTrans,
-                   filter_params_batch, kNoTrans, 0.0);
-
-  // release memory
-  delete filter_params_elem;
-  for (int32 p = 0; p < num_patches; p++) {
-    delete patch_deriv_batch[p];
-    delete out_deriv_batch[p];
-  }
-
-  // sum the derivatives into in_deriv
-  std::vector<int32> column_map(filter_dim * num_patches);
-  for (int32 p = 0, index = 0; p < num_patches; p++) {
-    for (int32 s = 0; s < num_splice; s++) {
-      for (int32 d = 0; d < patch_dim_; d++, index++) {
-        column_map[index] = p * patch_step_ + s * patch_stride_ + d;
-      }
-    }
-  }
-
-  if (in_deriv) {
-    std::vector<std::vector<int32> > reversed_column_map;
-    ReverseIndexes(column_map, InputDim(), &reversed_column_map);
-    std::vector<std::vector<int32> > rearranged_column_map;
-    RearrangeIndexes(reversed_column_map, &rearranged_column_map);
-    for (int32 p = 0; p < rearranged_column_map.size(); p++) {
-      CuArray<int32> cu_cols(rearranged_column_map[p]);
-      in_deriv->AddCols(patches_deriv, cu_cols);
-    }
-  }
-
-  if (to_update != NULL) {
-    // Next update the model (must do this 2nd so the derivatives we propagate
-    // are accurate, in case this == to_update_in.)
-    to_update->Update(debug_info, in_value, out_deriv);
-  }
-}
-
-void Convolutional1dComponent::SetZero(bool treat_as_gradient) {
-  if (treat_as_gradient) {
-    learning_rate_ = 1.0;  // don't call SetLearningRate, that would apply the
-                           // learning rate factor.
-    is_gradient_ = true;
-  }
-  filter_params_.SetZero();
-  bias_params_.SetZero();
-}
-
-void Convolutional1dComponent::Read(std::istream &is, bool binary) {
-  ReadUpdatableCommon(is, binary);  // Read opening tag and learning rate.
-  ExpectToken(is, binary, "<PatchDim>");
-  ReadBasicType(is, binary, &patch_dim_);
-  ExpectToken(is, binary, "<PatchStep>");
-  ReadBasicType(is, binary, &patch_step_);
-  ExpectToken(is, binary, "<PatchStride>");
-  ReadBasicType(is, binary, &patch_stride_);
-  ExpectToken(is, binary, "<FilterParams>");
-  filter_params_.Read(is, binary);
-  ExpectToken(is, binary, "<BiasParams>");
-  bias_params_.Read(is, binary);
-  std::string tok;
-  ReadToken(is, binary, &tok);
-  if (tok == "<IsGradient>") {
-    ReadBasicType(is, binary, &is_gradient_);
-    ExpectToken(is, binary, "</Convolutional1dComponent>");
-  } else {
-    is_gradient_ = false;
-    KALDI_ASSERT(tok == "</Convolutional1dComponent>");
-  }
-}
-
-void Convolutional1dComponent::Write(std::ostream &os, bool binary) const {
-  WriteUpdatableCommon(os, binary);  // Write opening tag and learning rate
-  WriteToken(os, binary, "<PatchDim>");
-  WriteBasicType(os, binary, patch_dim_);
-  WriteToken(os, binary, "<PatchStep>");
-  WriteBasicType(os, binary, patch_step_);
-  WriteToken(os, binary, "<PatchStride>");
-  WriteBasicType(os, binary, patch_stride_);
-  WriteToken(os, binary, "<FilterParams>");
-  filter_params_.Write(os, binary);
-  WriteToken(os, binary, "<BiasParams>");
-  bias_params_.Write(os, binary);
-  WriteToken(os, binary, "<IsGradient>");
-  WriteBasicType(os, binary, is_gradient_);
-  WriteToken(os, binary, "</Convolutional1dComponent>");
-}
-
-BaseFloat Convolutional1dComponent::DotProduct(const UpdatableComponent &other_in) const {
-  const Convolutional1dComponent *other =
-      dynamic_cast<const Convolutional1dComponent*>(&other_in);
-  return TraceMatMat(filter_params_, other->filter_params_, kTrans)
-         + VecVec(bias_params_, other->bias_params_);
-}
-
-Component* Convolutional1dComponent::Copy() const {
-  Convolutional1dComponent *ans = new Convolutional1dComponent();
-  ans->learning_rate_ = learning_rate_;
-  ans->patch_dim_ = patch_dim_;
-  ans->patch_step_ = patch_step_;
-  ans->patch_stride_ = patch_stride_;
-  ans->filter_params_ = filter_params_;
-  ans->bias_params_ = bias_params_;
-  ans->is_gradient_ = is_gradient_;
-  return ans;
-}
-
-void Convolutional1dComponent::PerturbParams(BaseFloat stddev) {
-  CuMatrix<BaseFloat> temp_filter_params(filter_params_);
-  temp_filter_params.SetRandn();
-  filter_params_.AddMat(stddev, temp_filter_params);
+void MaxpoolingComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
+                                    const CuMatrixBase<BaseFloat> &in,
+                                    CuMatrixBase<BaseFloat> *out) const {
+  int32 num_frames = in.NumRows();
+  int32 num_pools = OutputDim();
+  int32 pool_size = pool_x_size_ * pool_y_size_ * pool_z_size_;
+  CuMatrix<BaseFloat> patches(num_frames, num_pools * pool_size, kUndefined);
+  InputToInputPatches(in, &patches);
 
-  CuVector<BaseFloat> temp_bias_params(bias_params_);
-  temp_bias_params.SetRandn();
-  bias_params_.AddVec(stddev, temp_bias_params);
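+  // Editorial comment (not in the original patch): each ColRange below holds,
+  // for one local window coordinate q, the matching member of every pool, so
+  // after the loop each output element is the maximum over all pool_size
+  // members of its pool.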
+  out->Set(-1e20);  // initialize to a large negative value
+  for (int32 q = 0; q < pool_size; q++)
+    out->Max(patches.ColRange(q * num_pools, num_pools));
 }
 
-void Convolutional1dComponent::SetParams(const VectorBase<BaseFloat> &bias,
-                                         const MatrixBase<BaseFloat> &filter) {
-  bias_params_ = bias;
-  filter_params_ = filter;
-  KALDI_ASSERT(bias_params_.Dim() == filter_params_.NumRows());
-}
+// Method to compute the input derivative matrix from the input derivatives
+// for patches, where each patch corresponds to
+// the nodes having the same local coordinate in each pool
+void MaxpoolingComponent::InderivPatchesToInderiv(
+    const CuMatrix<BaseFloat>& in_deriv_patches,
+    CuMatrixBase<BaseFloat> *in_deriv) const {
 
-int32 Convolutional1dComponent::NumParameters() const {
-  return (filter_params_.NumCols() + 1) * filter_params_.NumRows();
-}
+  int32 num_pools_x = 1 + (input_x_dim_ - pool_x_size_) / pool_x_step_;
+  int32 num_pools_y = 1 + (input_y_dim_ - pool_y_size_) / pool_y_step_;
+  int32 num_pools_z = 1 + (input_z_dim_ - pool_z_size_) / pool_z_step_;
 
-// update parameters
-void Convolutional1dComponent::Update(const std::string &debug_info,
-                                      const CuMatrixBase<BaseFloat> &in_value,
-                                      const CuMatrixBase<BaseFloat> &out_deriv) {
-  // useful dims
-  int32 num_patches = 1 + (patch_stride_ - patch_dim_) / patch_step_;
-  int32 num_filters = filter_params_.NumRows();
-  int32 filter_dim = filter_params_.NumCols();
-  int32 num_frames = in_value.NumRows();
-  int32 num_splice = InputDim() / patch_stride_;
-  CuMatrix<BaseFloat> filters_grad;
-  CuVector<BaseFloat> bias_grad;
-
-  /** Buffer of reshaped inputs:
-   *  1row = vectorized rectangular feature patches
-   *  1col = dim over speech frames,
-   */
-  CuMatrix<BaseFloat> patches(num_frames, filter_dim * num_patches, kUndefined);
-  std::vector<int32> column_map(filter_dim * num_patches);
-  for (int32 p = 0, index = 0; p < num_patches; p++) {
-    for (int32 s = 0; s < num_splice; s++) {
-      for (int32 d = 0; d < patch_dim_; d++, index++) {
-        column_map[index] = p * patch_step_ + s * patch_stride_ + d;
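+  // Editorial note (not in the original patch): when pools overlap, one input
+  // column feeds several patch columns; reverse_column_map therefore maps
+  // each input column to the list of patch columns it produced, and the
+  // AddCols() calls below sum the derivatives over all of them.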
+  std::vector<std::vector<int32> > reverse_column_map(in_deriv->NumCols());
+  int32 rev_col_map_size = reverse_column_map.size();
+  for (int32 x = 0, index = 0; x < pool_x_size_; x++) {
+    for (int32 y = 0; y < pool_y_size_; y++) {
+      for (int32 z = 0; z < pool_z_size_; z++) {
+
+        for (int32 x_pool = 0; x_pool < num_pools_x; x_pool++) {
+          for (int32 y_pool = 0; y_pool < num_pools_y; y_pool++) {
+            for (int32 z_pool = 0; z_pool < num_pools_z; z_pool++, index++) {
+              int32 vector_index = (x_pool * pool_x_step_ + x) * input_y_dim_ * input_z_dim_ +
+                                   (y_pool * pool_y_step_ + y) * input_z_dim_ +
+                                   (z_pool * pool_z_step_ + z);
+
+              KALDI_ASSERT(vector_index < rev_col_map_size);
+              reverse_column_map[vector_index].push_back(index);
+            }
+          }
+        }
+      }
+    }
+  }
-  CuArray<int32> cu_cols(column_map);
-  patches.CopyCols(in_value, cu_cols);
-  //
-  // calculate the gradient
-  //
-  filters_grad.Resize(num_filters, filter_dim, kSetZero);  // reset
-  bias_grad.Resize(num_filters, kSetZero);  // reset
-
-  //
-  // use all the patches
-  //
-
-  // create a single large matrix holding the smaller matrices
-  // from the vector container filters_grad_batch along the rows
-  CuMatrix<BaseFloat> filters_grad_blocks_batch(
-      num_patches * filters_grad.NumRows(), filters_grad.NumCols());
-
-  std::vector<CuSubMatrix<BaseFloat>* > filters_grad_batch, diff_patch_batch,
-      patch_batch;
-  for (int32 p = 0; p < num_patches; p++) {
-    // form batch in vector container
-    filters_grad_batch.push_back(new CuSubMatrix<BaseFloat>(
-        filters_grad_blocks_batch.RowRange(
-            p * filters_grad.NumRows(),
-            filters_grad.NumRows())));
-    diff_patch_batch.push_back(new CuSubMatrix<BaseFloat>(out_deriv.ColRange(
-        p * num_filters, num_filters)));
-    patch_batch.push_back(new CuSubMatrix<BaseFloat>(patches.ColRange(
-        p * filter_dim, filter_dim)));
-  }
-
-  AddMatMatBatched(1.0, filters_grad_batch, diff_patch_batch, kTrans, patch_batch,
-                   kNoTrans, 1.0);
-
-  // add the row blocks together to filters_grad
-  filters_grad.AddMatBlocks(1.0, filters_grad_blocks_batch);
-
-  // create a matrix holding the col blocks sum of out_deriv
-  CuMatrix<BaseFloat> out_deriv_col_blocks_sum(out_deriv.NumRows(), num_filters);
-
-  // add the col blocks together to out_deriv_col_blocks_sum
-  out_deriv_col_blocks_sum.AddMatBlocks(1.0, out_deriv);
-
-  bias_grad.AddRowSumMat(1.0, out_deriv_col_blocks_sum, 1.0);
-
-  // release memory
-  for (int32 p = 0; p < num_patches; p++) {
-    delete filters_grad_batch[p];
-    delete diff_patch_batch[p];
-    delete patch_batch[p];
-  }
-
-  //
-  // update
-  //
-  filter_params_.AddMat(learning_rate_, filters_grad);
-  bias_params_.AddVec(learning_rate_, bias_grad);
-}
-
-void Convolutional1dComponent::Vectorize(VectorBase<BaseFloat> *params) const {
-  KALDI_ASSERT(params->Dim() == this->NumParameters());
-  int32 num_filter_params = filter_params_.NumCols() * filter_params_.NumRows();
-  params->Range(0, num_filter_params).CopyRowsFromMat(filter_params_);
-  params->Range(num_filter_params, bias_params_.Dim()).CopyFromVec(bias_params_);
-}
-void Convolutional1dComponent::UnVectorize(const VectorBase<BaseFloat> &params) {
-  KALDI_ASSERT(params.Dim() == this->NumParameters());
-  int32 num_filter_params = filter_params_.NumCols() * filter_params_.NumRows();
-  filter_params_.CopyRowsFromVec(params.Range(0, num_filter_params));
-  bias_params_.CopyFromVec(params.Range(num_filter_params, bias_params_.Dim()));
-}
-
-void MaxpoolingComponent::Init(int32 input_dim, int32 output_dim,
-                               int32 pool_size, int32 pool_stride) {
-  input_dim_ = input_dim;
-  output_dim_ = output_dim;
-  pool_size_ = pool_size;
-  pool_stride_ = pool_stride;
-
-  // sanity check
-  // number of patches
-  KALDI_ASSERT(input_dim_ % pool_stride_ == 0);
-  int32 num_patches = input_dim_ / pool_stride_;
-  // number of pools
-  KALDI_ASSERT(num_patches % pool_size_ == 0);
-  int32 num_pools = num_patches / pool_size_;
-  // check output dim
-  KALDI_ASSERT(output_dim_ == num_pools * pool_stride_);
-}
-
-void MaxpoolingComponent::InitFromConfig(ConfigLine *cfl) {
-  int32 input_dim = 0;
-  int32 output_dim = 0;
-  int32 pool_size = -1, pool_stride = -1;
-  bool ok = true;
-
-  ok = ok && cfl->GetValue("input-dim", &input_dim);
-  ok = ok && cfl->GetValue("output-dim", &output_dim);
-  ok = ok && cfl->GetValue("pool-size", &pool_size);
-  ok = ok && cfl->GetValue("pool-stride", &pool_stride);
-
-  KALDI_LOG << output_dim << " " << input_dim << " " << ok;
-  KALDI_LOG << "Pool: " << pool_size << " "
-            << pool_stride << " " << ok;
-  if (cfl->HasUnusedValues())
-    KALDI_ERR << "Could not process these elements in initializer: "
-              << cfl->UnusedValues();
-  if (!ok || output_dim <= 0)
-    KALDI_ERR << "Invalid initializer for layer of type "
-              << Type() << ": \"" << cfl->WholeLine() << "\"";
-  Init(input_dim, output_dim, pool_size, pool_stride);
-}
-
-void MaxpoolingComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
-                                    const CuMatrixBase<BaseFloat> &in,
-                                    CuMatrixBase<BaseFloat> *out) const {
-  int32 num_patches = input_dim_ / pool_stride_;
-  int32 num_pools = num_patches / pool_size_;
-
-  // do the max-pooling
-  for (int32 q = 0; q < num_pools; q++) {
-    // get output buffer of the pool
-    CuSubMatrix<BaseFloat> pool(out->ColRange(q * pool_stride_, pool_stride_));
-    pool.Set(-1e20); // reset a large negative value
-    for (int32 r = 0; r < pool_size_; r++) {
-      // col-by-col block comparison pool
-      int32 p = r + q * pool_size_;
-      pool.Max(in.ColRange(p * pool_stride_, pool_stride_));
-    }
+  std::vector<std::vector<int32> > rearranged_column_map;
+  RearrangeIndexes(reverse_column_map, &rearranged_column_map);
+  for (int32 p = 0; p < rearranged_column_map.size(); p++) {
+    CuArray<int32> cu_cols(rearranged_column_map[p]);
+    in_deriv->AddCols(in_deriv_patches, cu_cols);
   }
 }
 
+/*
+  3d max pooling backpropagate function
+  This function backpropagates the error from
+  out_deriv to in_deriv.
+  In order to select the node in each pool to
+  backpropagate the error, it has to compare
+  the output pool value stored in the out_value
+  matrix with each of its input pool member nodes
+  stored in the in_value matrix.
+*/
 void MaxpoolingComponent::Backprop(const std::string &debug_info,
                                    const ComponentPrecomputedIndexes *indexes,
                                    const CuMatrixBase<BaseFloat> &in_value,
@@ -4198,66 +3729,87 @@ void MaxpoolingComponent::Backprop(const std::string &debug_info,
                                    const CuMatrixBase<BaseFloat> &out_deriv,
                                    Component *, // to_update,
                                    CuMatrixBase<BaseFloat> *in_deriv) const {
-  int32 num_patches = input_dim_ / pool_stride_;
-  int32 num_pools = num_patches / pool_size_;
-  std::vector<int32> patch_summands(num_patches, 0);
-
-  for(int32 q = 0; q < num_pools; q++) {
-    for(int32 r = 0; r < pool_size_; r++) {
-      int32 p = r + q * pool_size_;
-      CuSubMatrix<BaseFloat> in_p(in_value.ColRange(p * pool_stride_, pool_stride_));
-      CuSubMatrix<BaseFloat> out_q(out_value.ColRange(q * pool_stride_, pool_stride_));
-      CuSubMatrix<BaseFloat> tgt(in_deriv->ColRange(p * pool_stride_, pool_stride_));
-      CuMatrix<BaseFloat> src(out_deriv.ColRange(q * pool_stride_, pool_stride_));
-      // zero-out mask
-      CuMatrix<BaseFloat> mask;
-      in_p.EqualElementMask(out_q, &mask);
-      src.MulElements(mask);
-      tgt.AddMat(1.0, src);
-      // summed deriv info
-      patch_summands[p] += 1;
-    }
-  }
+  if (!in_deriv)
+    return;
 
-  // scale in_deriv of overlaped pools
-  for(int32 p = 0; p < num_patches; p++) {
-    CuSubMatrix<BaseFloat> tgt(in_deriv->ColRange(p * pool_stride_, pool_stride_));
-    KALDI_ASSERT(patch_summands[p] > 0);
-    tgt.Scale(1.0 / patch_summands[p]);
+  int32 num_frames = in_value.NumRows();
+  int32 num_pools = OutputDim();
+  int32 pool_size = pool_x_size_ * pool_y_size_ * pool_z_size_;
+  CuMatrix<BaseFloat> patches(num_frames, num_pools * pool_size, kUndefined);
+  InputToInputPatches(in_value, &patches);
+
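+  // Editorial comment (not in the original patch): EqualElementMask() marks,
+  // for each pool member, the positions whose input value equals the pool's
+  // output (i.e. the argmax; ties are all marked), and the masked out_deriv
+  // is routed back to exactly those positions.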
""); + WriteBasicType(os, binary, input_y_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, input_z_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, pool_x_size_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, pool_y_size_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, pool_z_size_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, pool_x_step_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, pool_y_step_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, pool_z_step_); WriteToken(os, binary, ""); } +// display information about component std::string MaxpoolingComponent::Info() const { std::ostringstream stream; - stream << Type() << ", input-dim = " << input_dim_ - << ", output-dim = " << output_dim_ - << ", pool-size = " << pool_size_ - << ", pool-stride = " << pool_stride_; + stream << Type() + << ", input-x-dim = " << input_x_dim_ + << ", input-y-dim = " << input_y_dim_ + << ", input-z-dim = " << input_z_dim_ + << ", pool-x-size = " << pool_x_size_ + << ", pool-y-size = " << pool_y_size_ + << ", pool-z-size = " << pool_z_size_ + << ", pool-x-step = " << pool_x_step_ + << ", pool-y-step = " << pool_y_step_ + << ", pool-z-step = " << pool_z_step_; return stream.str(); } diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index a78f72c0afb..bb15afbcc39 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -6,6 +6,7 @@ // 2014-2015 Vijayaditya Peddinti // 2014-2015 Guoguo Chen // 2015 Daniel Galvez +// 2015 Tom Ko // See ../../COPYING for clarification regarding multiple authors // @@ -1445,161 +1446,65 @@ class ConvolutionComponent: public UpdatableComponent { }; /** - * Convolutional1dComponent implements convolution over frequency axis. - * We assume the input featrues are spliced, i.e. each frame is in - * fact a set of stacked frames, where we can form patches which span - * over several frequency bands and whole time axis. A patch is the - * instance of a filter on a group of frequency bands and whole time - * axis. Shifts of the filter generate patches. + * MaxPoolingComponent : + * Maxpooling component was firstly used in ConvNet for selecting an + * representative activation in an area. It inspired Maxout nonlinearity. + * Each output element of this component is the maximum of a block of + * input elements where the block has a 3D dimension (pool_x_size_, + * pool_y_size_, pool_z_size_). + * Blocks could overlap if the shift value on any axis is smaller + * than its corresponding pool size (e.g. pool_x_step_ < pool_x_size_). + * If the shift values are euqal to their pool size, there is no + * overlap; while if they all equal 1, the blocks overlap to + * the greatest possible extent. * - * The convolution is done over whole axis with same filter - * coefficients, i.e. we don't use separate filters for different - * 'regions' of frequency axis. Due to convolution, same weights are - * used repeateadly, the final gradient is a sum of all - * position-specific gradients (the sum was found better than - * averaging). + * This component is designed to be used after a ConvolutionComponent + * so that the input matrix is propagated from a 2d-convolutional layer. + * This component implements 3d-maxpooling which performs + * max pooling along the three axes. + * Input : A matrix where each row is a vectorized 3D-tensor. + * The 3D tensor has dimensions + * x: (e.g. time) + * y: (e.g. frequency) + * z: (e.g. 
diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h
index a78f72c0afb..bb15afbcc39 100644
--- a/src/nnet3/nnet-simple-component.h
+++ b/src/nnet3/nnet-simple-component.h
@@ -6,6 +6,7 @@
 //                2014-2015  Vijayaditya Peddinti
 //                2014-2015  Guoguo Chen
 //                2015  Daniel Galvez
+//                2015  Tom Ko
 
 // See ../../COPYING for clarification regarding multiple authors
 //
@@ -1445,161 +1446,65 @@ class ConvolutionComponent: public UpdatableComponent {
 };
 
 /**
- * Convolutional1dComponent implements convolution over frequency axis.
- * We assume the input featrues are spliced, i.e. each frame is in
- * fact a set of stacked frames, where we can form patches which span
- * over several frequency bands and whole time axis. A patch is the
- * instance of a filter on a group of frequency bands and whole time
- * axis. Shifts of the filter generate patches.
+ * MaxpoolingComponent :
+ * The Maxpooling component was first used in ConvNets for selecting a
+ * representative activation in an area. It inspired the Maxout nonlinearity.
+ * Each output element of this component is the maximum of a block of
+ * input elements, where the block has a 3D dimension (pool_x_size_,
+ * pool_y_size_, pool_z_size_).
+ * Blocks could overlap if the shift value on any axis is smaller
+ * than its corresponding pool size (e.g. pool_x_step_ < pool_x_size_).
+ * If the shift values are equal to their pool size, there is no
+ * overlap; if they all equal 1, the blocks overlap to
+ * the greatest possible extent.
 *
- * The convolution is done over whole axis with same filter
- * coefficients, i.e. we don't use separate filters for different
- * 'regions' of frequency axis. Due to convolution, same weights are
- * used repeateadly, the final gradient is a sum of all
- * position-specific gradients (the sum was found better than
- * averaging).
+ * This component is designed to be used after a ConvolutionComponent
+ * so that the input matrix is propagated from a 2d-convolutional layer.
+ * This component implements 3d-maxpooling, which performs
+ * max pooling along the three axes.
+ * Input : A matrix where each row is a vectorized 3D-tensor.
+ *        The 3D tensor has dimensions
+ *        x: (e.g. time)
+ *        y: (e.g. frequency)
+ *        z: (e.g. channels like number of filters in the ConvolutionComponent)
 *
- * In order to have a fast implementations, the filters are
- * represented in vectorized form, where each rectangular filter
- * corresponds to a row in a matrix, where all the filters are
- * stored. The features are then re-shaped to a set of matrices, where
- * one matrix corresponds to single patch-position, where all the
- * filters get applied.
+ *        The component assumes input vectorizations of type zyx,
+ *        which is the default output vectorization type of a ConvolutionComponent.
+ *        e.g. for input vectorization of type zyx the input is vectorized by
+ *        spanning axes z, y and x of the tensor in that order.
+ *        Given 3d tensor A with sizes (2, 2, 2) along the three dimensions,
+ *        the zyx vectorized input looks like
+ *  A(0,0,0) A(0,0,1) A(0,1,0) A(0,1,1) A(1,0,0) A(1,0,1) A(1,1,0) A(1,1,1)
+ *
+ * Output : The output is also a 3D tensor vectorized in the zyx format.
+ *
+ * For information on the hyperparameters and parameters of this component see
+ * the variable declarations.
 *
- * The type of convolution is controled by hyperparameters:
- *   patch_dim_ ... frequency axis size of the patch
- *   patch_step_ ... size of shift in the convolution
- *   patch_stride_ ... shift for 2nd dim of a patch
- *                     (i.e. frame length before splicing)
- * For instance, for a convolutional component after raw input,
- * if the input is 36-dim fbank feature with delta of order 2
- * and spliced using +/- 5 frames of contexts, the convolutional
- * component takes the input as a 36 x 33 image. The patch_stride_
- * should be configured 36. If patch_step_ and patch_dim_ are
- * configured 1 and 7, the Convolutional1dComponent creates a
- * 2D filter of 7 x 33, such that the convolution is actually done
- * only along the frequency axis. Specifically, the convolutional
- * output along the frequency axis is (36 - 7) / 1 + 1 = 30, and
- * the convolutional output along the temporal axis is 33 - 33 + 1 = 1,
- * resulting in an output image of 30 x 1, which is called a feature map
- * in ConvNet. Then if the output-dim is set 3840, the constructor
- * would know there should be 3840 / 30 = 128 distinct filters,
- * which will create 128 feature maps of 30 x 1 for one frame of
- * input. The feature maps are vectorized as a 3840-dim row vector
- * in the output matrix of this component. For details on progatation
- * of Convolutional1dComponent, check the function definition.
 *
 */
-class Convolutional1dComponent: public UpdatableComponent {
+
+class MaxpoolingComponent: public Component {
  public:
-  Convolutional1dComponent();
+
+  MaxpoolingComponent(): input_x_dim_(0), input_y_dim_(0), input_z_dim_(0),
+                         pool_x_size_(0), pool_y_size_(0), pool_z_size_(0),
+                         pool_x_step_(0), pool_y_step_(0), pool_z_step_(0) { }
   // constructor using another component
-  Convolutional1dComponent(const Convolutional1dComponent &component);
-  // constructor using parameters
-  Convolutional1dComponent(const CuMatrixBase<BaseFloat> &filter_params,
-                           const CuVectorBase<BaseFloat> &bias_params,
-                           BaseFloat learning_rate);
+  MaxpoolingComponent(const MaxpoolingComponent &component):
+      input_x_dim_(component.input_x_dim_),
+      input_y_dim_(component.input_y_dim_),
+      input_z_dim_(component.input_z_dim_),
+      pool_x_size_(component.pool_x_size_),
+      pool_y_size_(component.pool_y_size_),
+      pool_z_size_(component.pool_z_size_),
+      pool_x_step_(component.pool_x_step_),
+      pool_y_step_(component.pool_y_step_),
+      pool_z_step_(component.pool_z_step_) { }
 
   virtual int32 InputDim() const;
   virtual int32 OutputDim() const;
-
-  virtual std::string Info() const;
-  virtual void InitFromConfig(ConfigLine *cfl);
-  virtual std::string Type() const { return "Convolutional1dComponent"; }
-  virtual int32 Properties() const {
-    return kSimpleComponent|kUpdatableComponent|kBackpropNeedsInput|
-        kBackpropAdds|kPropagateAdds;
-  }
-
-  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
-                         const CuMatrixBase<BaseFloat> &in,
-                         CuMatrixBase<BaseFloat> *out) const;
-  virtual void Backprop(const std::string &debug_info,
-                        const ComponentPrecomputedIndexes *indexes,
-                        const CuMatrixBase<BaseFloat> &in_value,
-                        const CuMatrixBase<BaseFloat> &, // out_value,
-                        const CuMatrixBase<BaseFloat> &out_deriv,
-                        Component *to_update_in,
-                        CuMatrixBase<BaseFloat> *in_deriv) const;
-
-  virtual void Read(std::istream &is, bool binary);
-  virtual void Write(std::ostream &os, bool binary) const;
-
-  virtual Component* Copy() const;
-
-  // Some functions from base-class UpdatableComponent.
-  virtual void Scale(BaseFloat scale);
-  virtual void Add(BaseFloat alpha, const Component &other);
-  virtual void SetZero(bool treat_as_gradient);
-  virtual void PerturbParams(BaseFloat stddev);
-  virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
-  virtual int32 NumParameters() const;
-  virtual void Vectorize(VectorBase<BaseFloat> *params) const;
-  virtual void UnVectorize(const VectorBase<BaseFloat> &params);
-
-  // Some functions that are specific to this class.
-  void SetParams(const VectorBase<BaseFloat> &bias,
-                 const MatrixBase<BaseFloat> &filter);
-  const CuVector<BaseFloat> &BiasParams() { return bias_params_; }
-  const CuMatrix<BaseFloat> &LinearParams() { return filter_params_; }
-  void Init(int32 input_dim, int32 output_dim,
-            int32 patch_dim, int32 patch_step, int32 patch_stride,
-            BaseFloat param_stddev, BaseFloat bias_stddev);
-  void Init(int32 patch_dim, int32 patch_step, int32 patch_stride,
-            std::string matrix_filename);
-
-  // resize the component, setting the parameters to zero, while
-  // leaving any other configuration values the same
-  void Resize(int32 input_dim, int32 output_dim);
-
-  void Update(const std::string &debug_info,
-              const CuMatrixBase<BaseFloat> &in_value,
-              const CuMatrixBase<BaseFloat> &out_deriv);
-
- private:
-  int32 patch_dim_;
-  int32 patch_step_;
-  int32 patch_stride_;
-
-  static void ReverseIndexes(const std::vector<int32> &forward_indexes,
-                             int32 input_dim,
-                             std::vector<std::vector<int32> > *backward_indexes);
-  static void RearrangeIndexes(const std::vector<std::vector<int32> > &in,
-                               std::vector<std::vector<int32> > *out);
-
-  const Convolutional1dComponent &operator = (const Convolutional1dComponent &other); // Disallow.
-  CuMatrix<BaseFloat> filter_params_;
-  CuVector<BaseFloat> bias_params_;
-  bool is_gradient_;
-};
-
-/**
- * MaxPoolingComponent :
- * Maxpooling component was firstly used in ConvNet for selecting an representative
- * activation in an area. It inspired Maxout nonlinearity.
- *
- * The input/output matrices are split to submatrices with width 'pool_stride_'.
- * For instance, a minibatch of 512 frames is propagated by a convolutional
- * layer, resulting in a 512 x 3840 input matrix for MaxpoolingComponent,
- * which is composed of 128 feature maps for each frame (128 x 30). If you want
- * a 3-to-1 maxpooling on each feature map, set 'pool_stride_' and 'pool_size_'
- * as 128 and 3 respectively. Maxpooling component would create an output
- * matrix of 512 x 1280. The 30 input neurons are grouped by a group size of 3, and
- * the maximum in a group is selected, creating a smaller feature map of 10.
- *
- * Our pooling does not supports overlaps, which simplifies the
- * implementation (and was not helpful for Ossama).
- */
-class MaxpoolingComponent: public Component {
- public:
-  explicit MaxpoolingComponent(int32 input_dim, int32 output_dim,
-                               int32 pool_size, int32 pool_stride) {
-    Init(input_dim, output_dim, pool_size, pool_stride);
-  }
-  MaxpoolingComponent(): input_dim_(0), output_dim_(0),
-                         pool_size_(0), pool_stride_(0) { }
-  virtual int32 InputDim() const { return input_dim_; }
-  virtual int32 OutputDim() const { return output_dim_; }
+  virtual void Check() const;
 
   virtual std::string Info() const;
   virtual void InitFromConfig(ConfigLine *cfl);
@@ -1625,23 +1530,35 @@ class MaxpoolingComponent: public Component {
   /// Write component to stream
   virtual void Write(std::ostream &os, bool binary) const;
 
-  // We don't implement InitFromConfig() at this level: child-class should do
-  // it.
-  virtual Component* Copy() const {
-    return new MaxpoolingComponent(input_dim_, output_dim_,
-                                   pool_size_, pool_stride_); }
+  virtual Component* Copy() const { return new MaxpoolingComponent(*this); }
 
-  // Some functions that are specific to this class
-  void Init(int32 input_dim, int32 output_dim,
-            int32 pool_size, int32 pool_stride);
+  void InputToInputPatches(const CuMatrixBase<BaseFloat>& in,
+                           CuMatrix<BaseFloat> *patches) const;
+  void InderivPatchesToInderiv(const CuMatrix<BaseFloat>& in_deriv_patches,
+                               CuMatrixBase<BaseFloat> *in_deriv) const;
 
  protected:
-  int32 input_dim_;
-  int32 output_dim_;
-  int32 pool_size_;
-  int32 pool_stride_;
+  int32 input_x_dim_;   // size of the input along x-axis
+                        // (e.g. number of time steps)
+  int32 input_y_dim_;   // size of input along y-axis
+                        // (e.g. number of mel-frequency bins)
+  int32 input_z_dim_;   // size of input along z-axis
+                        // (e.g. number of filters in the ConvolutionComponent)
+
+  int32 pool_x_size_;   // size of the pooling window along x-axis
+  int32 pool_y_size_;   // size of the pooling window along y-axis
+  int32 pool_z_size_;   // size of the pooling window along z-axis
+
+  int32 pool_x_step_;   // the number of steps taken along x-axis of input
+                        // before computing the next pool
+  int32 pool_y_step_;   // the number of steps taken along y-axis of input
+                        // before computing the next pool
+  int32 pool_z_step_;   // the number of steps taken along z-axis of input
+                        // before computing the next pool
 };
 
+
 /** CompositeComponent is components representing a sequence of
     [simple] components.  The config line would be something like the following
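[Editor's aside, illustrative only and not part of the patch; the function
name is ours. It restates the zyx vectorization assumed by the class above.]

  // Column index of element (x, y, z) of an x_dim by y_dim by z_dim tensor
  // inside a zyx-vectorized row (z varies fastest, x slowest).
  inline int ZyxIndex(int x, int y, int z, int y_dim, int z_dim) {
    return x * y_dim * z_dim + y * z_dim + z;
  }
  // For a (2, 2, 2) tensor this enumerates A(0,0,0), A(0,0,1), A(0,1,0),
  // A(0,1,1), A(1,0,0), A(1,0,1), A(1,1,0), A(1,1,1) in order, matching the
  // example in the class documentation.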
diff --git a/src/nnet3/nnet-test-utils.cc b/src/nnet3/nnet-test-utils.cc
index 97720a0f2c0..cce697fa355 100644
--- a/src/nnet3/nnet-test-utils.cc
+++ b/src/nnet3/nnet-test-utils.cc
@@ -614,38 +614,6 @@ void GenerateConfigSequenceCnn(
     std::vector<std::string> *configs) {
   std::ostringstream os;
 
-  int32 pool_stride = 5 + Rand() % 10, pool_size = 2 + Rand() % 3,
-      num_pools = 1 + Rand() % 10;
-  int32 num_patches = num_pools * pool_size;
-  int32 patch_step = 1 + Rand() % 4, patch_dim = 4 + Rand () % 5,
-      patch_stride = (num_patches - 1) * patch_step + patch_dim;
-  int32 num_splice = 5 + Rand() % 10, num_filters = pool_stride;
-
-  int32 input_dim = patch_stride * num_splice,
-      hidden_dim = num_patches * num_filters,
-      output_dim = num_pools * pool_stride;
-
-  os << "component name=conv type=Convolutional1dComponent input-dim="
-     << input_dim << " output-dim=" << hidden_dim
-     << " patch-dim=" << patch_dim << " patch-step=" << patch_step
-     << " patch-stride=" << patch_stride << std::endl;
-  os << "component name=maxpooling type=MaxpoolingComponent input-dim="
-     << hidden_dim << " output-dim=" << output_dim
-     << " pool-size=" << pool_size << " pool-stride=" << pool_stride
-     << std::endl;
-
-  os << "input-node name=input dim=" << input_dim << std::endl;
-  os << "component-node name=conv_node component=conv input=input\n";
-  os << "component-node name=maxpooling_node component=maxpooling input=conv_node\n";
-  os << "output-node name=output input=maxpooling_node\n";
-  configs->push_back(os.str());
-}
-
-void GenerateConfigSequenceCnn2d(
-    const NnetGenerationOptions &opts,
-    std::vector<std::string> *configs) {
-  std::ostringstream os;
-
   int32 input_x_dim = 10 + Rand() % 20,
         input_y_dim = 10 + Rand() % 20,
@@ -682,8 +650,40 @@ void GenerateConfigSequenceCnn2d(
      << " input-vectorization-order=" << vectorization
      << std::endl;
 
+  int32 conv_output_x_dim = (1 + (input_x_dim - filt_x_dim) / filt_x_step);
+  int32 conv_output_y_dim = (1 + (input_y_dim - filt_y_dim) / filt_y_step);
+  int32 conv_output_z_dim = num_filters;
+  int32 pool_x_size = 1 + Rand() % conv_output_x_dim;
+  int32 pool_y_size = 1 + Rand() % conv_output_y_dim;
+  int32 pool_z_size = 1 + Rand() % conv_output_z_dim;
+  int32 pool_x_step = 1;
+  int32 pool_y_step = 1;
+  int32 pool_z_step = 1;
+  do {
+    pool_x_step = (1 + Rand() % pool_x_size);
+  } while((conv_output_x_dim - pool_x_size) % pool_x_step);
+  do {
+    pool_y_step = (1 + Rand() % pool_y_size);
+  } while((conv_output_y_dim - pool_y_size) % pool_y_step);
+  do {
+    pool_z_step = (1 + Rand() % pool_z_size);
+  } while((conv_output_z_dim - pool_z_size) % pool_z_step);
+
+  os << "component name=maxpooling type=MaxpoolingComponent "
+     << " input-x-dim=" << conv_output_x_dim
+     << " input-y-dim=" << conv_output_y_dim
+     << " input-z-dim=" << conv_output_z_dim
+     << " pool-x-size=" << pool_x_size
+     << " pool-y-size=" << pool_y_size
+     << " pool-z-size=" << pool_z_size
+     << " pool-x-step=" << pool_x_step
+     << " pool-y-step=" << pool_y_step
+     << " pool-z-step=" << pool_z_step
+     << std::endl;
+
   os << "input-node name=input dim=" << (input_x_dim * input_y_dim * input_z_dim) << std::endl;
   os << "component-node name=conv_node component=conv input=input\n";
+  os << "component-node name=maxpooling_node component=maxpooling input=conv_node\n";
-  os << "output-node name=output input=conv_node\n";
+  os << "output-node name=output input=maxpooling_node\n";
   configs->push_back(os.str());
 }
@@ -718,7 +718,7 @@ void GenerateConfigSequence(
     const NnetGenerationOptions &opts,
     std::vector<std::string> *configs) {
 start:
-  int32 network_type = RandInt(0, 9);
+  int32 network_type = RandInt(0, 8);
   switch(network_type) {
     case 0:
       GenerateConfigSequenceSimplest(opts, configs);
@@ -763,11 +763,6 @@ void GenerateConfigSequence(
       GenerateConfigSequenceCnn(opts, configs);
       break;
     case 8:
-      if (!opts.allow_nonlinearity)
-        goto start;
-      GenerateConfigSequenceCnn2d(opts, configs);
-      break;
-    case 9:
       GenerateConfigSequenceDistribute(opts, configs);
       break;
     default:
@@ -834,7 +829,7 @@ void ComputeExampleComputationRequestSimple(
 
 static void GenerateRandomComponentConfig(std::string *component_type,
                                           std::string *config) {
-  int32 n = RandInt(0, 27);
+  int32 n = RandInt(0, 26);
   BaseFloat learning_rate = 0.001 * RandInt(1, 3);
 
   std::ostringstream os;
@@ -945,25 +940,6 @@ static void GenerateRandomComponentConfig(std::string *component_type,
       break;
     }
     case 17: {
-      *component_type = "Convolutional1dComponent";
-      int32 patch_stride = 10 + Rand() % 50, patch_step = 1 + Rand() % 4,
-          patch_dim = 4 + Rand () % 5;
-
-      // decrease patch_stride so that
-      // (patch_stride - patch_dim) % patch_step == 0
-      patch_stride = patch_stride - ((patch_stride - patch_dim) % patch_step);
-
-      int32 num_patches = 1 + (patch_stride - patch_dim) / patch_step;
-      int32 num_splice = 5 + Rand() % 10, num_filters = 5 + Rand() % 10;
-      int32 input_dim = patch_stride * num_splice;
-      int32 output_dim = num_patches * num_filters;
-      os << "input-dim=" << input_dim << " output-dim=" << output_dim
-         << " patch-dim=" << patch_dim << " patch-step=" << patch_step
-         << " patch-stride=" << patch_stride
-         << " learning-rate=" << learning_rate;
-      break;
-    }
-    case 18: {
       int32 input_vectorization = Rand() % 2;
       std::string vectorization;
      if (input_vectorization == 0) {
@@ -1001,19 +977,7 @@ static void GenerateRandomComponentConfig(std::string *component_type,
       // TODO : add test for file based initialization. But confirm how to write
       // a file which is not going to be overwritten by other components
     }
-    case 19: {
-      *component_type = "MaxpoolingComponent";
-      int32 pool_stride = 5 + Rand() % 10,
-          pool_size = 2 + Rand() % 3,
-          num_pools = 1 + Rand() % 10;
-      int32 output_dim = num_pools * pool_stride;
-      int32 num_patches = num_pools * pool_size;
-      int32 input_dim = pool_stride * num_patches;
-      os << "input-dim=" << input_dim << " output-dim=" << output_dim
-         << " pool-size=" << pool_size << " pool-stride=" << pool_stride;
-      break;
-    }
-    case 20: {
+    case 18: {
       *component_type = "PermuteComponent";
       int32 input_dim = 10 + Rand() % 100;
       std::vector<int32> column_map(input_dim);
@@ -1027,7 +991,7 @@ static void GenerateRandomComponentConfig(std::string *component_type,
       os << "column-map=" << buffer.str();
       break;
     }
-    case 21: {
+    case 19: {
       *component_type = "PerElementOffsetComponent";
      std::string param_config = RandInt(0, 1)?
                                  " param-mean=0.0 param-stddev=0.0":
                                  " param-mean=1.0 param-stddev=1.0";
      os << "dim=" << RandInt(1, 100)
         << " learning-rate=" << learning_rate << param_config;
       break;
     }
-    case 22: {
+    case 20: {
       *component_type = "SumReduceComponent";
       int32 output_dim = RandInt(1, 50), group_size = RandInt(1, 15),
           input_dim = output_dim * group_size;
       os << "input-dim=" << input_dim << " output-dim=" << output_dim;
       break;
     }
-    case 23: {
+    case 21: {
       *component_type = "CompositeComponent";
       int32 cur_dim = RandInt(20, 30), num_components = RandInt(1, 3),
           max_rows_process = RandInt(1, 30);
       os << "num-components=" << num_components
          << " max-rows-process=" << max_rows_process;
      // (Editorial note: the loop generating the sub-component configs was
      // garbled in extraction; restored here from the surrounding context.)
      for (int32 i = 1; i <= num_components; i++) {
        os << " component" << i << "='type=RectifiedLinearComponent dim="
           << cur_dim << "'";
      }
       break;
     }
-    case 24: {
+    case 22: {
       *component_type = "SumGroupComponent";
       int32 num_groups = RandInt(1, 50),
         input_dim = num_groups * RandInt(1, 15);
       os << "input-dim=" << input_dim << " output-dim=" << num_groups;
       break;
     }
-    case 25: {
+    case 23: {
       *component_type = "RepeatedAffineComponent";
       int32 num_repeats = RandInt(1, 50),
           input_dim = num_repeats * RandInt(1, 15),
           output_dim = num_repeats * RandInt(1, 15);
       os << "input-dim=" << input_dim << " output-dim=" << output_dim
          << " num-repeats=" << num_repeats;
       break;
     }
-    case 26: {
+    case 24: {
       *component_type = "BlockAffineComponent";
       int32 num_blocks = RandInt(1, 50),
           input_dim = num_blocks * RandInt(1, 15),
           output_dim = num_blocks * RandInt(1, 15);
       os << "input-dim=" << input_dim << " output-dim=" << output_dim
          << " num-blocks=" << num_blocks;
       break;
     }
-    case 27: {
+    case 25: {
       *component_type = "NaturalGradientRepeatedAffineComponent";
       int32 num_repeats = RandInt(1, 50),
           input_dim = num_repeats * RandInt(1, 15),
           output_dim = num_repeats * RandInt(1, 15);
       os << "input-dim=" << input_dim << " output-dim=" << output_dim
@@ -1100,6 +1064,35 @@ static void GenerateRandomComponentConfig(std::string *component_type,
          << " num-repeats=" << num_repeats;
       break;
     }
+    case 26: {
+      *component_type = "MaxpoolingComponent";
+      int32 input_x_dim = 5 + Rand() % 10,
+          input_y_dim = 5 + Rand() % 10,
+          input_z_dim = 5 + Rand() % 10;
+      int32 pool_x_size = 1 + Rand() % input_x_dim,
+          pool_y_size = 1 + Rand() % input_y_dim,
+          pool_z_size = 1 + Rand() % input_z_dim;
+      int32 pool_x_step = (1 + Rand() % pool_x_size),
+          pool_y_step = (1 + Rand() % pool_y_size),
+          pool_z_step = (1 + Rand() % pool_z_size);
+      // adjusting input dim to ensure divisibility
+      int32 remainder = (input_x_dim - pool_x_size) % pool_x_step;
+      input_x_dim = input_x_dim - remainder;
+      remainder = (input_y_dim - pool_y_size) % pool_y_step;
+      input_y_dim = input_y_dim - remainder;
+      remainder = (input_z_dim - pool_z_size) % pool_z_step;
+      input_z_dim = input_z_dim - remainder;
+      os << " input-x-dim=" << input_x_dim
+         << " input-y-dim=" << input_y_dim
+         << " input-z-dim=" << input_z_dim
+         << " pool-x-size=" << pool_x_size
+         << " pool-y-size=" << pool_y_size
+         << " pool-z-size=" << pool_z_size
+         << " pool-x-step=" << pool_x_step
+         << " pool-y-step=" << pool_y_step
+         << " pool-z-step=" << pool_z_step;
+      break;
+    }
     default:
       KALDI_ERR << "Error generating random component";
   }
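[Editor's appendix, illustrative only: a dependency-free C++ sketch of the
indexing scheme the patch implements. All names here are ours; the real
component operates on CuMatrix rows and evaluates the same formulas on the
GPU via the patch matrix.]

  #include <cassert>
  #include <cstdio>
  #include <vector>

  // Max-pool one zyx-vectorized tensor, mirroring MaxpoolingComponent: for
  // every local window coordinate (x, y, z) and every pool (xp, yp, zp),
  // read the input element the patch column map would select, and keep the
  // running maximum per pool.
  std::vector<float> MaxPool3d(const std::vector<float> &in,
                               int in_x, int in_y, int in_z,
                               int sz_x, int sz_y, int sz_z,
                               int st_x, int st_y, int st_z) {
    assert((in_x - sz_x) % st_x == 0 &&
           (in_y - sz_y) % st_y == 0 &&
           (in_z - sz_z) % st_z == 0);  // same divisibility rule as Check()
    int np_x = 1 + (in_x - sz_x) / st_x,
        np_y = 1 + (in_y - sz_y) / st_y,
        np_z = 1 + (in_z - sz_z) / st_z;
    std::vector<float> out(np_x * np_y * np_z, -1e20f);
    for (int x = 0; x < sz_x; x++)
      for (int y = 0; y < sz_y; y++)
        for (int z = 0; z < sz_z; z++)
          for (int xp = 0; xp < np_x; xp++)
            for (int yp = 0; yp < np_y; yp++)
              for (int zp = 0; zp < np_z; zp++) {
                // same column-map formula as InputToInputPatches()
                int src = (xp * st_x + x) * in_y * in_z +
                          (yp * st_y + y) * in_z +
                          (zp * st_z + z);
                int dst = xp * np_y * np_z + yp * np_z + zp;
                if (in[src] > out[dst]) out[dst] = in[src];
              }
    return out;
  }

  int main() {
    // A 2x2x2 tensor pooled by one 2x2x2 window: the output is the global max.
    std::vector<float> a = {1, 5, 3, 2, 8, 4, 7, 6};
    std::vector<float> m = MaxPool3d(a, 2, 2, 2, 2, 2, 2, 1, 1, 1);
    std::printf("%g\n", m[0]);  // prints 8
    return 0;
  }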