From b11d6a82e46af756396cf6754cb31006c1955fa0 Mon Sep 17 00:00:00 2001 From: Karel Vesely Date: Wed, 18 Jul 2018 13:53:41 +0200 Subject: [PATCH] new tool fsts-concat, similar to fsts-union... - concatenates all vector-fsts with the same 'key' from the sorted 'rspecifier' in the order as they appear in I/O, - handy for concatenating HCLG graphs: 'phn-loop . some_word . phn-loop', --- src/fstbin/Makefile | 2 +- src/fstbin/fsts-concat.cc | 112 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 113 insertions(+), 1 deletion(-) create mode 100644 src/fstbin/fsts-concat.cc diff --git a/src/fstbin/Makefile b/src/fstbin/Makefile index 39e4ae39bcc..644eb639381 100644 --- a/src/fstbin/Makefile +++ b/src/fstbin/Makefile @@ -15,7 +15,7 @@ BINFILES = fstdeterminizestar \ fstmakecontextsyms fstaddsubsequentialloop fstaddselfloops \ fstrmepslocal fstcomposecontext fsttablecompose fstrand \ fstdeterminizelog fstphicompose fstcopy \ - fstpushspecial fsts-to-transcripts fsts-project fsts-union + fstpushspecial fsts-to-transcripts fsts-project fsts-union fsts-concat OBJFILES = diff --git a/src/fstbin/fsts-concat.cc b/src/fstbin/fsts-concat.cc new file mode 100644 index 00000000000..2a217eda7dc --- /dev/null +++ b/src/fstbin/fsts-concat.cc @@ -0,0 +1,112 @@ +// fstbin/fsts-concat.cc + +// Copyright 2016 Johns Hopkins University (Authors: Jan "Yenda" Trmal) +// 2018 Soapbox Labs (Author: Karel Vesely) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. 
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include <memory>
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "fstext/fstext-utils.h"
+#include "fstext/kaldi-fst-io.h"
+
+
+// fsts-concat: concatenates, key by key, the FSTs found in several tables.
+//
+// Command line: all arguments but the last are rspecifiers, the last one is
+// the wspecifier. The first rspecifier is read sequentially and drives the
+// loop; the remaining rspecifiers are opened for random access and queried
+// with the keys coming from the first one. A key that is missing in any of
+// the random-access tables is skipped with a warning.
+//
+// Exit status: 0 when at least one FST was written, 1 when nothing was
+// written (or when fewer than 3 arguments were given), -1 when an exception
+// derived from std::exception was caught.
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace fst;
+    typedef kaldi::int32 int32;
+    typedef RandomAccessTableReader<VectorFstHolder> RandomAccessFstReader;
+
+    const char *usage =
+        "Reads kaldi archives with FSTs. Concatenates the fsts from all the rspecifiers.\n"
+        "The fsts to concatenate must have same key. The sequencing is given by the position of arguments.\n"
+        "\n"
+        "Usage: fsts-concat [options] <fsts-rspecifier1> <fsts-rspecifier2> ... <fsts-wspecifier>\n"
+        " e.g.: fsts-concat scp:fsts1.scp scp:fsts2.scp ... ark:fsts_out.ark\n"
+        "\n"
+        "see also: fstconcat (from the OpenFst toolkit)\n";
+
+    ParseOptions po(usage);
+
+    po.Read(argc, argv);
+
+    // At least two inputs and one output are required.
+    if (po.NumArgs() < 3) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string fsts_rspecifier = po.GetArg(1),
+        fsts_wspecifier = po.GetArg(po.NumArgs());
+
+    SequentialTableReader<VectorFstHolder> fst_reader(fsts_rspecifier);
+    // The readers are owned by the vector, so they are released (and hence
+    // closed) also when an exception propagates out of the loop below.
+    std::vector<std::unique_ptr<RandomAccessFstReader> > fst_readers;
+    TableWriter<VectorFstHolder> fst_writer(fsts_wspecifier);
+
+    // Arguments 2 .. NumArgs()-1 are the random-access inputs,
+    // fst_readers[i] corresponds to the command-line argument (i+2).
+    for (int32 i = 2; i < po.NumArgs(); i++) {
+      fst_readers.push_back(std::unique_ptr<RandomAccessFstReader>(
+          new RandomAccessFstReader(po.GetArg(i))));
+    }
+    const int32 num_fst_readers = fst_readers.size();
+
+    int32 n_done = 0,
+        n_skipped = 0;
+
+    for (; !fst_reader.Done(); fst_reader.Next()) {
+      std::string key = fst_reader.Key();
+
+      // Check that the key exists in all 'fst_readers'. All the readers are
+      // queried (no early exit), so every missing table gets its own warning.
+      bool skip_key = false;
+      for (int32 i = 0; i < num_fst_readers; i++) {
+        if (!fst_readers[i]->HasKey(key)) {
+          KALDI_WARN << "Skipping '" << key << "'"
+                     << " due to missing the fst in " << (i+2) << "th <rspecifier> : "
+                     << "'" << po.GetArg(i+2) << "'";
+          skip_key = true;
+        }
+      }
+      if (skip_key) {
+        n_skipped++;
+        continue;
+      }
+
+      // Concatenate! Start with a copy of the fst from the last input.
+      VectorFst<StdArc> fst_out = fst_readers.back()->Value(key);
+      // Loop from (last-1) to first, as 'prepending' the fsts is faster,
+      // see: http://www.openfst.org/twiki/bin/view/FST/ConcatDoc
+      for (int32 i = num_fst_readers-2; i >= 0; i--) {
+        fst::Concat(fst_readers[i]->Value(key), &fst_out);
+      }
+      // Finally, prepend the fst from the 'Sequential' reader.
+      fst::Concat(fst_reader.Value(), &fst_out);
+
+      // Write the output.
+      fst_writer.Write(key, fst_out);
+      n_done++;
+    }
+
+    // Cleanup: release the random-access readers before the final report.
+    fst_readers.clear();
+
+    // NumArgs()-1 is the number of input tables (the last argument is the output).
+    KALDI_LOG << "Produced " << n_done << " FSTs by concatenating " << po.NumArgs()-1
+              << " streams " << "(" << n_skipped << " keys skipped).";
+    return (n_done != 0 ? 0 : 1);
+  } catch(const std::exception &e) {
+    std::cerr << e.what();
+    return -1;
+  }
+}