diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index 41288b411c0..3b4fd527ad6 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -2098,20 +2098,38 @@ static void _diff_xent(const int32_cuda* vec_tgt, Real* mat_net_out, Real* vec_l
+// Computes the element-wise xvector objective terms and their derivatives
+// w.r.t. the scores.  Each thread handles one element: i is derived from the
+// x thread/block index and checked against scores_dim.cols, j from the y
+// index and checked against scores_dim.rows.  Elements with i + 1 == j and
+// even i get log(1 + exp(-L)); the other elements with i < j get
+// K * log(1 + exp(L)), where K = 1 / (rows - 2); all elements with i >= j
+// are set to zero.  The three matrices are indexed with their own strides.
 template<typename Real>
 __global__
 static void _compute_xvector_objf(const Real* scores, MatrixDim scores_dim,
-                                  Real* obfj_terms, MatrixDim objf_dim,
-                                  Real* obfj_derivs, MatrixDim derivs_dim) {
+                                  Real* objf_terms, MatrixDim objf_dim,
+                                  Real* objf_derivs, MatrixDim derivs_dim) {
   int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x;
   int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y;
   int32_cuda scores_index = i + j * scores_dim.stride;
+  int32_cuda objf_index = i + j * objf_dim.stride;
+  int32_cuda derivs_index = i + j * derivs_dim.stride;
   Real K = 1.0 / (scores_dim.rows - 2.0);
-  Real L = scores[scores_index];
   if (i < scores_dim.cols && j < scores_dim.rows) {
+    // Load the score only after the bounds check, so that threads that fall
+    // outside the matrix never read past the end of 'scores'.
+    Real L = scores[scores_index];
     if (i + 1 == j && i % 2 == 0) {
-      obfj_terms[scores_index] = log(1.0 + exp(-L));
-      obfj_derivs[scores_index] = 1.0 / (1.0 + exp(L));
+      // Below -15 the term uses the approximation log(1 + exp(x)) ~= x,
+      // which holds for large x (here x = -L).
+      objf_terms[objf_index] = L < -15 ? -L : log(1.0 + exp(-L));
+      objf_derivs[derivs_index] = 1.0 / (1.0 + exp(L));
     } else if (i < j) {
-      obfj_terms[scores_index] = K * log(1.0 + exp(L));
-      obfj_derivs[scores_index] = -K / (1.0 + exp(-L));
+      // Same large-x approximation, applied for L above 15.
+      objf_terms[objf_index] = K * (L > 15 ? L : log(1.0 + exp(L)));
+      objf_derivs[derivs_index] = -K / (1.0 + exp(-L));
+    } else {
+      // Elements with i >= j do not contribute; write explicit zeros.
+      objf_terms[objf_index] = 0.0;
+      objf_derivs[derivs_index] = 0.0;
     }
   }
 }
diff --git a/src/cudamatrix/cu-math.cc b/src/cudamatrix/cu-math.cc
index 005bb3146c0..e655dde8a0f 100644
--- a/src/cudamatrix/cu-math.cc
+++ b/src/cudamatrix/cu-math.cc
@@ -227,16 +227,21 @@ void ComputeXvectorObjfFromScores(const CuMatrixBase<BaseFloat> &scores,
   } else
 #endif
   {
+    // Compute the xvector objective function and its derivatives in the CPU.
int32 num_rows = scores.NumRows(); BaseFloat K = 1.0 / (num_rows - 2.0); for (int32 i = 0; i < num_rows; i++) { - for (int32 j = i + 1; j < num_rows; j++) { + for (int32 j = 0; j < num_rows; j++) { + BaseFloat L = scores(i, j); if (i + 1 == j && i % 2 == 0) { - (*objf_terms)(i, j) = log(1.0 + exp(-scores(i, j))); - (*objf_derivs)(i, j) = 1.0 / (1.0 + exp(scores(i, j))); + (*objf_terms)(i, j) = L < -15 ? -L : log(1.0 + exp(-L)); + (*objf_derivs)(i, j) = 1.0 / (1.0 + exp(L)); + } else if (i < j) { + (*objf_terms)(i, j) = K * (L > 15 ? L : log(1.0 + exp(L))); + (*objf_derivs)(i, j) = -K / (1.0 + exp(-L)); } else { - (*objf_terms)(i, j) = K * log(1.0 + exp(scores(i, j))); - (*objf_derivs)(i, j) = -K / (1.0 + exp(-scores(i, j))); + (*objf_terms)(i, j) = 0; + (*objf_derivs)(i, j) = 0; } } } diff --git a/src/ivector/xvector-test.cc b/src/ivector/xvector-test.cc index ae3b6d7e57b..a63f0532a7e 100644 --- a/src/ivector/xvector-test.cc +++ b/src/ivector/xvector-test.cc @@ -40,7 +40,7 @@ void TestComputeXvectorObjfAndDeriv( BaseFloat *tot_weight); bool TestXvectorExtractorDerivative(BaseFloat perturb_delta) { - int32 xvector_dim = RandInt(4, 50), + int32 xvector_dim = RandInt(4, 100), num_rows = 2 * RandInt(2, 10); // The number of rows must be even // and greater than 2. CuSpMatrix S(xvector_dim); @@ -126,7 +126,7 @@ bool TestXvectorExtractorDerivative(BaseFloat perturb_delta) { } bool TestXvectorComputeObjf() { - int32 xvector_dim = RandInt(4, 40), + int32 xvector_dim = RandInt(4, 100), num_rows = 2 * RandInt(2, 10); // The number of rows must be even // and greater than 2. CuSpMatrix S(xvector_dim); diff --git a/src/ivector/xvector.cc b/src/ivector/xvector.cc index c06942d1cb6..57a9c69d604 100644 --- a/src/ivector/xvector.cc +++ b/src/ivector/xvector.cc @@ -46,10 +46,10 @@ void ComputeXvectorObjfAndDeriv( P(N, xvector_dim), Q(N, N), R(N, N), - scores(N, N), // The raw scores. - objf_terms(N, N), - objf_deriv_terms(N, N); // Derivative of the - // objf w.r.t. the scores. 
+ scores(N, N), // The raw scores. + objf_terms(N, N, kUndefined), + objf_deriv_terms(N, N, // Derivative of the + kUndefined); // objf w.r.t. the scores. CuVector r(N); P.AddMatMat(1.0, xvector_pairs, kNoTrans, S_tmp, kNoTrans, 0.0); diff --git a/src/ivector/xvector.h b/src/ivector/xvector.h index 53d0864575a..9b4b79de54a 100644 --- a/src/ivector/xvector.h +++ b/src/ivector/xvector.h @@ -32,7 +32,7 @@ namespace kaldi { /* Computes the training objective function and the derivatives for the xvector. Let N = xvector_pairs.NumRows() be the number of - xvectors. There are N(N-1)/2 pairs in total and N from the same + xvectors. There are N(N-1)/2 pairs in total and N/2 from the same class. Let v(n) be the n'th row of the matrix xvector_pairs. The total objective function written to 'tot_objf' is \sum_{n=0}^{N/2} p_same(v(n*2), v(n*2+1)) @@ -61,9 +61,9 @@ namespace kaldi { the objective function with respect to the parameter b is written here. @param [out] tot_objf The total objective function described above @param [out] tot_weight The total normalizing factor for the objective - function, equal to dvector_pairs.NumRows(). + function, equal to xvector_pairs.NumRows(). 
*/ - void ComputeXvectorObjfAndDeriv(const CuMatrixBase &dvector_pairs, + void ComputeXvectorObjfAndDeriv(const CuMatrixBase &xvector_pairs, const CuSpMatrix &S, BaseFloat b, CuMatrixBase *deriv_xvector, diff --git a/src/nnet3bin/Makefile b/src/nnet3bin/Makefile index 0a57c17fad0..f8a02207b1c 100644 --- a/src/nnet3bin/Makefile +++ b/src/nnet3bin/Makefile @@ -12,7 +12,7 @@ BINFILES = nnet3-init nnet3-info nnet3-get-egs nnet3-copy-egs nnet3-subset-egs \ nnet3-am-adjust-priors nnet3-am-copy nnet3-compute-prob \ nnet3-average nnet3-am-info nnet3-combine nnet3-latgen-faster \ nnet3-copy nnet3-show-progress nnet3-align-compiled \ - nnet3-get-egs-dense-targets nnet3-compute + nnet3-get-egs-dense-targets nnet3-compute nnet3-xvector-get-egs OBJFILES = diff --git a/src/nnet3bin/nnet3-xvector-get-egs.cc b/src/nnet3bin/nnet3-xvector-get-egs.cc new file mode 100644 index 00000000000..24e50560b54 --- /dev/null +++ b/src/nnet3bin/nnet3-xvector-get-egs.cc @@ -0,0 +1,240 @@ +// nnet3bin/nnet3-xvector-get-egs.cc + +// Copyright 2012-2016 Johns Hopkins University (author: Daniel Povey) +// 2016 David Snyder + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include "util/common-utils.h" +#include "nnet3/nnet-example.h" + +namespace kaldi { +namespace nnet3 { + +// A struct for holding information about the position and +// duration of each pair of chunks. +struct ChunkPairInfo { + std::string pair_name; + int32 output_archive_id; + int32 start_frame1; + int32 start_frame2; + int32 num_frames1; + int32 num_frames2; +}; + +// Process the range input file and store it as a map from utterance +// name to vector of ChunkPairInfo structs. +static void ProcessRangeFile(const std::string &range_rxfilename, + unordered_map > *utt_to_pairs) { + Input range_input(range_rxfilename); + if (!range_rxfilename.empty()) { + std::string line; + while (std::getline(range_input.Stream(), line)) { + ChunkPairInfo *pair = new ChunkPairInfo(); + std::vector fields; + SplitStringToVector(line, " \t\n\r", true, &fields); + if (fields.size() != 6) + KALDI_ERR << "Expected 6 fields in line of range file, got " + << fields.size() << " instead."; + + std::string utt = fields[0], + start_frame1_str = fields[2], + num_frames1_str = fields[3], + start_frame2_str = fields[4], + num_frames2_str = fields[5]; + + if (!ConvertStringToInteger(fields[1], &(pair->output_archive_id)) + || !ConvertStringToInteger(start_frame1_str, &(pair->start_frame1)) + || !ConvertStringToInteger(start_frame2_str, &(pair->start_frame2)) + || !ConvertStringToInteger(num_frames1_str, &(pair->num_frames1)) + || !ConvertStringToInteger(num_frames2_str, &(pair->num_frames2))) + KALDI_ERR << "Expected integer for output archive in range file."; + pair->pair_name = utt + "-" + start_frame1_str + "-" + num_frames1_str + + "-" + start_frame2_str + "-" + num_frames2_str; + unordered_map >::iterator + got = utt_to_pairs->find(utt); + if (got == utt_to_pairs->end()) { + std::vector pairs; + pairs.push_back(pair); + utt_to_pairs->insert(std::make_pair > (utt, pairs)); + } else { + got->second.push_back(pair); + } + } + } +} + +static void WriteExamples(const MatrixBase 
&feats, + const std::vector &pairs, + const std::string &utt, + bool compress, + int32 *num_egs_written, + std::vector *example_writers) { + for (std::vector::const_iterator it = pairs.begin(); + it != pairs.end(); ++it) { + ChunkPairInfo *pair = *it; + NnetExample eg; + int32 num_rows = feats.NumRows(), + feat_dim = feats.NumCols(); + if (num_rows < std::max(pair->num_frames1, pair->num_frames2)) { + KALDI_WARN << "Unable to create examples for utterance " << utt + << ". Requested chunk size of " + << std::max(pair->num_frames1, pair->num_frames2) + << " but utterance has only " << num_rows << " frames."; + } else { + // The requested chunk positions are approximate. It's possible + // that they slightly exceed the number of frames in the utterance. + // If that occurs, we can shift the chunks location back slightly. + int32 shift1 = std::min(0, num_rows - pair->start_frame1 + - pair->num_frames1), + shift2 = std::min(0, num_rows - pair->start_frame2 + - pair->num_frames2); + SubMatrix chunk1(feats, pair->start_frame1 + shift1, + pair->num_frames1, 0, feat_dim), + chunk2(feats, pair->start_frame2 + shift2, + pair->num_frames2, 0, feat_dim); + NnetIo nnet_io1 = NnetIo("input1", 0, chunk1), + nnet_io2 = NnetIo("input2", 0, chunk2); + for (std::vector::iterator indx_it = nnet_io1.indexes.begin(); + indx_it != nnet_io1.indexes.end(); ++indx_it) + indx_it->n = 0; + for (std::vector::iterator indx_it = nnet_io2.indexes.begin(); + indx_it != nnet_io2.indexes.end(); ++indx_it) + indx_it->n = 1; + + NnetExample eg; + eg.io.push_back(nnet_io1); + eg.io.push_back(nnet_io2); + if (compress) + eg.Compress(); + + if (pair->output_archive_id >= example_writers->size()) + KALDI_ERR << "Requested output index exceeds number of specified " + << "output files."; + (*example_writers)[pair->output_archive_id]->Write( + pair->pair_name, eg); + (*num_egs_written) += 1; + } + } +} + +// Delete the dynamically allocated memory. 
+static void Cleanup(unordered_map > *utt_to_pairs, + std::vector *writers) { + for (unordered_map >::iterator + map_it = utt_to_pairs->begin(); + map_it != utt_to_pairs->end(); ++map_it) + for (std::vector::iterator + vec_it = map_it->second.begin(); vec_it != map_it->second.end(); + ++vec_it) + delete *vec_it; + for (std::vector::iterator + it = writers->begin(); it != writers->end(); ++it) + delete *it; +} + +} // namespace nnet3 +} // namespace kaldi + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + + const char *usage = + "Get examples for training an nnet3 neural network for the xvector\n" + "system. Each output example contains a pair of feature chunks from\n" + "the same utterance. The location and length of the feature chunks\n" + "are specified in the 'ranges' file. Each line is interpreted as\n" + "follows:\n" + " " + " \n" + "For example:\n" + " utt1 3 0 65 112 110\n" + " utt1 0 160 50 214 180\n" + " utt2 ...\n" + "\n" + "Usage: nnet3-xvector-get-egs [options] " + " ... 
\n" + "\n" + "For example:\n" + "nnet3-xvector-get-egs ranges.1 \"$feats\" ark:egs_temp.1.ark" + " ark:egs_temp.2.ark ark:egs_temp.3.ark\n"; + + bool compress = true; + + ParseOptions po(usage); + po.Register("compress", &compress, "If true, write egs in " + "compressed format."); + + po.Read(argc, argv); + + if (po.NumArgs() < 3) { + po.PrintUsage(); + exit(1); + } + + std::string + range_rspecifier = po.GetArg(1), + feature_rspecifier = po.GetArg(2); + std::vector example_writers; + + for (int32 i = 3; i <= po.NumArgs(); i++) + example_writers.push_back(new NnetExampleWriter(po.GetArg(i))); + + unordered_map > utt_to_pairs; + ProcessRangeFile(range_rspecifier, &utt_to_pairs); + SequentialBaseFloatMatrixReader feat_reader(feature_rspecifier); + + int32 num_done = 0, + num_err = 0, + num_egs_written = 0; + + for (; !feat_reader.Done(); feat_reader.Next()) { + std::string key = feat_reader.Key(); + const Matrix &feats = feat_reader.Value(); + unordered_map >::iterator + got = utt_to_pairs.find(key); + if (got == utt_to_pairs.end()) { + KALDI_WARN << "Could not create examples from utterance " + << key << " because it has no entry in the ranges " + << "input file."; + num_err++; + } else { + std::vector pairs = got->second; + WriteExamples(feats, pairs, key, compress, &num_egs_written, + &example_writers); + num_done++; + } + } + Cleanup(&utt_to_pairs, &example_writers); + + KALDI_LOG << "Finished generating examples, " + << "successfully processed " << num_done + << " feature files, wrote " << num_egs_written << " examples; " + << num_err << " files had errors."; + return (num_egs_written == 0 || num_err > num_done ? 1 : 0); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +}