From fab3faaa515b11a07460a9a3144cf2d37f9872a0 Mon Sep 17 00:00:00 2001 From: Hainan Xu Date: Mon, 22 May 2017 13:54:05 -0400 Subject: [PATCH 01/30] scripts to train RNNLM with tensorflow --- egs/ami/s5/local/tensorflow/ptb_word_lm.py | 389 +++++++++++++++++++++ egs/ami/s5/local/tensorflow/reader.py | 128 +++++++ egs/ami/s5/local/tensorflow/run.sh | 15 + 3 files changed, 532 insertions(+) create mode 100644 egs/ami/s5/local/tensorflow/ptb_word_lm.py create mode 100644 egs/ami/s5/local/tensorflow/reader.py create mode 100755 egs/ami/s5/local/tensorflow/run.sh diff --git a/egs/ami/s5/local/tensorflow/ptb_word_lm.py b/egs/ami/s5/local/tensorflow/ptb_word_lm.py new file mode 100644 index 00000000000..e1e9673fea4 --- /dev/null +++ b/egs/ami/s5/local/tensorflow/ptb_word_lm.py @@ -0,0 +1,389 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Example / benchmark for building a PTB LSTM model. + +Trains the model described in: +(Zaremba, et. al.) Recurrent Neural Network Regularization +http://arxiv.org/abs/1409.2329 + +There are 3 supported model configurations: +=========================================== +| config | epochs | train | valid | test +=========================================== +| small | 13 | 37.99 | 121.39 | 115.91 +| medium | 39 | 48.45 | 86.16 | 82.07 +| large | 55 | 37.87 | 82.62 | 78.29 +The exact results may vary depending on the random initialization. + +The hyperparameters used in the model: +- init_scale - the initial scale of the weights +- learning_rate - the initial value of the learning rate +- max_grad_norm - the maximum permissible norm of the gradient +- num_layers - the number of LSTM layers +- num_steps - the number of unrolled steps of LSTM +- hidden_size - the number of LSTM units +- max_epoch - the number of epochs trained with the initial learning rate +- max_max_epoch - the total number of epochs for training +- keep_prob - the probability of keeping weights in the dropout layer +- lr_decay - the decay of the learning rate for each epoch after "max_epoch" +- batch_size - the batch size + +The data required for this example is in the data/ dir of the +PTB dataset from Tomas Mikolov's webpage: + +$ wget http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz +$ tar xvf simple-examples.tgz + +To run: + +$ python ptb_word_lm.py --data_path=simple-examples/data/ + +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import inspect +import time + +import sys + +sys.path.insert(0,"/home/hxu/.local/lib/python2.7/site-packages/") + +import numpy as np +import tensorflow as tf + +import reader + +flags = tf.flags +logging = tf.logging + +flags.DEFINE_string( + "model", "small", + "A type of model. Possible options are: small, medium, large.") +flags.DEFINE_string("data_path", None, + "Where the training/test data is stored.") +flags.DEFINE_string("save_path", None, + "Model output directory.") +flags.DEFINE_bool("use_fp16", False, + "Train using 16-bit floats instead of 32bit floats") + +FLAGS = flags.FLAGS + + +def data_type(): + return tf.float16 if FLAGS.use_fp16 else tf.float32 + + +class PTBInput(object): + """The input data.""" + + def __init__(self, config, data, name=None): + self.batch_size = batch_size = config.batch_size + self.num_steps = num_steps = config.num_steps + self.epoch_size = ((len(data) // batch_size) - 1) // num_steps + self.input_data, self.targets = reader.ptb_producer( + data, batch_size, num_steps, name=name) + + +class PTBModel(object): + """The PTB model.""" + + def __init__(self, is_training, config, input_): + self._input = input_ + + batch_size = input_.batch_size + num_steps = input_.num_steps + size = config.hidden_size + vocab_size = config.vocab_size + + # Slightly better results can be obtained with forget gate biases + # initialized to 1 but the hyperparameters of the model would need to be + # different than reported in the paper. + def lstm_cell(): + # With the latest TensorFlow source code (as of Mar 27, 2017), + # the BasicLSTMCell will need a reuse parameter which is unfortunately not + # defined in TensorFlow 1.0. To maintain backwards compatibility, we add + # an argument check here: + if 'reuse' in inspect.getargspec( + tf.contrib.rnn.BasicLSTMCell.__init__).args: + return tf.contrib.rnn.BasicLSTMCell( + size, forget_bias=0.0, state_is_tuple=True, + reuse=tf.get_variable_scope().reuse) + else: + return tf.contrib.rnn.BasicLSTMCell( + size, forget_bias=0.0, state_is_tuple=True) + attn_cell = lstm_cell + if is_training and config.keep_prob < 1: + def attn_cell(): + return tf.contrib.rnn.DropoutWrapper( + lstm_cell(), output_keep_prob=config.keep_prob) + cell = tf.contrib.rnn.MultiRNNCell( + [attn_cell() for _ in range(config.num_layers)], state_is_tuple=True) + + self._initial_state = cell.zero_state(batch_size, data_type()) + + with tf.device("/cpu:0"): + embedding = tf.get_variable( + "embedding", [vocab_size, size], dtype=data_type()) + inputs = tf.nn.embedding_lookup(embedding, input_.input_data) + + if is_training and config.keep_prob < 1: + inputs = tf.nn.dropout(inputs, config.keep_prob) + + # Simplified version of models/tutorials/rnn/rnn.py's rnn(). + # This builds an unrolled LSTM for tutorial purposes only. + # In general, use the rnn() or state_saving_rnn() from rnn.py. + # + # The alternative version of the code below is: + # + # inputs = tf.unstack(inputs, num=num_steps, axis=1) + # outputs, state = tf.contrib.rnn.static_rnn( + # cell, inputs, initial_state=self._initial_state) + outputs = [] + state = self._initial_state + with tf.variable_scope("RNN"): + for time_step in range(num_steps): + if time_step > 0: tf.get_variable_scope().reuse_variables() + (cell_output, state) = cell(inputs[:, time_step, :], state) + outputs.append(cell_output) + + output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, size]) + softmax_w = tf.get_variable( + "softmax_w", [size, vocab_size], dtype=data_type()) + softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=data_type()) + logits = tf.matmul(output, softmax_w) + softmax_b + loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example( + [logits], + [tf.reshape(input_.targets, [-1])], + [tf.ones([batch_size * num_steps], dtype=data_type())]) + self._cost = cost = tf.reduce_sum(loss) / batch_size + self._final_state = state + + if not is_training: + return + + self._lr = tf.Variable(0.0, trainable=False) + tvars = tf.trainable_variables() + grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), + config.max_grad_norm) + optimizer = tf.train.GradientDescentOptimizer(self._lr) + self._train_op = optimizer.apply_gradients( + zip(grads, tvars), + global_step=tf.contrib.framework.get_or_create_global_step()) + + self._new_lr = tf.placeholder( + tf.float32, shape=[], name="new_learning_rate") + self._lr_update = tf.assign(self._lr, self._new_lr) + + def assign_lr(self, session, lr_value): + session.run(self._lr_update, feed_dict={self._new_lr: lr_value}) + + @property + def input(self): + return self._input + + @property + def initial_state(self): + return self._initial_state + + @property + def cost(self): + return self._cost + + @property + def final_state(self): + return self._final_state + + @property + def lr(self): + return self._lr + + @property + def train_op(self): + return self._train_op + + +class SmallConfig(object): + """Small config.""" + init_scale = 0.1 + learning_rate = 1.0 + max_grad_norm = 5 + num_layers = 2 + num_steps = 20 + hidden_size = 200 + max_epoch = 4 + max_max_epoch = 13 + keep_prob = 1.0 + lr_decay = 0.5 + batch_size = 20 + vocab_size = 10000 + + +class MediumConfig(object): + """Medium config.""" + init_scale = 0.05 + learning_rate = 1.0 + max_grad_norm = 5 + num_layers = 2 + num_steps = 35 + hidden_size = 650 + max_epoch = 6 + max_max_epoch = 39 + keep_prob = 0.5 + lr_decay = 0.8 + batch_size = 20 + vocab_size = 10000 + + +class LargeConfig(object): + """Large config.""" + init_scale = 0.04 + learning_rate = 1.0 + max_grad_norm = 10 + num_layers = 2 + num_steps = 35 + hidden_size = 1500 + max_epoch = 14 + max_max_epoch = 55 + keep_prob = 0.35 + lr_decay = 1 / 1.15 + batch_size = 20 + vocab_size = 10000 + + +class TestConfig(object): + """Tiny config, for testing.""" + init_scale = 0.1 + learning_rate = 1.0 + max_grad_norm = 1 + num_layers = 1 + num_steps = 2 + hidden_size = 2 + max_epoch = 1 + max_max_epoch = 1 + keep_prob = 1.0 + lr_decay = 0.5 + batch_size = 20 + vocab_size = 10000 + + +def run_epoch(session, model, eval_op=None, verbose=False): + """Runs the model on the given data.""" + start_time = time.time() + costs = 0.0 + iters = 0 + state = session.run(model.initial_state) + + fetches = { + "cost": model.cost, + "final_state": model.final_state, + } + if eval_op is not None: + fetches["eval_op"] = eval_op + + for step in range(model.input.epoch_size): + feed_dict = {} + for i, (c, h) in enumerate(model.initial_state): + feed_dict[c] = state[i].c + feed_dict[h] = state[i].h + + vals = session.run(fetches, feed_dict) + cost = vals["cost"] + state = vals["final_state"] + + costs += cost + iters += model.input.num_steps + + if verbose and step % (model.input.epoch_size // 10) == 10: + print("%.3f perplexity: %.3f speed: %.0f wps" % + (step * 1.0 / model.input.epoch_size, np.exp(costs / iters), + iters * model.input.batch_size / (time.time() - start_time))) + + return np.exp(costs / iters) + + +def get_config(): + if FLAGS.model == "small": + return SmallConfig() + elif FLAGS.model == "medium": + return MediumConfig() + elif FLAGS.model == "large": + return LargeConfig() + elif FLAGS.model == "test": + return TestConfig() + else: + raise ValueError("Invalid model: %s", FLAGS.model) + + +def main(_): + if not FLAGS.data_path: + raise ValueError("Must set --data_path to PTB data directory") + + raw_data = reader.ptb_raw_data(FLAGS.data_path) + train_data, valid_data, test_data, _ = raw_data + + config = get_config() + eval_config = get_config() + eval_config.batch_size = 1 + eval_config.num_steps = 1 + + with tf.Graph().as_default(): + initializer = tf.random_uniform_initializer(-config.init_scale, + config.init_scale) + + with tf.name_scope("Train"): + train_input = PTBInput(config=config, data=train_data, name="TrainInput") + with tf.variable_scope("Model", reuse=None, initializer=initializer): + m = PTBModel(is_training=True, config=config, input_=train_input) + tf.summary.scalar("Training Loss", m.cost) + tf.summary.scalar("Learning Rate", m.lr) + + with tf.name_scope("Valid"): + valid_input = PTBInput(config=config, data=valid_data, name="ValidInput") + with tf.variable_scope("Model", reuse=True, initializer=initializer): + mvalid = PTBModel(is_training=False, config=config, input_=valid_input) + tf.summary.scalar("Validation Loss", mvalid.cost) + + with tf.name_scope("Test"): + test_input = PTBInput(config=eval_config, data=test_data, name="TestInput") + with tf.variable_scope("Model", reuse=True, initializer=initializer): + mtest = PTBModel(is_training=False, config=eval_config, + input_=test_input) + + sv = tf.train.Supervisor(logdir=FLAGS.save_path) + with sv.managed_session() as session: + for i in range(config.max_max_epoch): + lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0) + m.assign_lr(session, config.learning_rate * lr_decay) + + print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr))) + train_perplexity = run_epoch(session, m, eval_op=m.train_op, + verbose=True) + print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity)) + valid_perplexity = run_epoch(session, mvalid) + print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity)) + + test_perplexity = run_epoch(session, mtest) + print("Test Perplexity: %.3f" % test_perplexity) + + if FLAGS.save_path: + print("Saving model to %s." % FLAGS.save_path) + sv.saver.save(session, FLAGS.save_path, global_step=sv.global_step) + + +if __name__ == "__main__": + tf.app.run() diff --git a/egs/ami/s5/local/tensorflow/reader.py b/egs/ami/s5/local/tensorflow/reader.py new file mode 100644 index 00000000000..f60bb0d636b --- /dev/null +++ b/egs/ami/s5/local/tensorflow/reader.py @@ -0,0 +1,128 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + + +"""Utilities for parsing PTB text files.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import os + +import tensorflow as tf + + + +def _read_words(filename): + with tf.gfile.GFile(filename, "r") as f: + return f.read().decode("utf-8").replace("\n", "").split() + + +def _build_vocab(filename): + data = _read_words(filename) + + counter = collections.Counter(data) + count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0])) + + words, _ = list(zip(*count_pairs)) + word_to_id = dict(zip(words, range(len(words)))) + +# +# print(word_to_id) +# +# print("") + + return word_to_id + + +def _file_to_word_ids(filename, word_to_id): + data = _read_words(filename) + return [word_to_id[word] for word in data if word in word_to_id] + + +def ptb_raw_data(data_path=None): + """Load PTB raw data from data directory "data_path". + + Reads PTB text files, converts strings to integer ids, + and performs mini-batching of the inputs. + + The PTB dataset comes from Tomas Mikolov's webpage: + + http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz + + Args: + data_path: string path to the directory where simple-examples.tgz has + been extracted. + + Returns: + tuple (train_data, valid_data, test_data, vocabulary) + where each of the data objects can be passed to PTBIterator. + """ + + train_path = os.path.join(data_path, "train.txt") + valid_path = os.path.join(data_path, "dev.txt") + test_path = os.path.join(data_path, "eval.txt") + + word_to_id = _build_vocab(train_path) + train_data = _file_to_word_ids(train_path, word_to_id) + valid_data = _file_to_word_ids(valid_path, word_to_id) + test_data = _file_to_word_ids(test_path, word_to_id) + vocabulary = len(word_to_id) + return train_data, valid_data, test_data, vocabulary + + +def ptb_producer(raw_data, batch_size, num_steps, name=None): + """Iterate on the raw PTB data. + + This chunks up raw_data into batches of examples and returns Tensors that + are drawn from these batches. + + Args: + raw_data: one of the raw data outputs from ptb_raw_data. + batch_size: int, the batch size. + num_steps: int, the number of unrolls. + name: the name of this operation (optional). + + Returns: + A pair of Tensors, each shaped [batch_size, num_steps]. The second element + of the tuple is the same data time-shifted to the right by one. + + Raises: + tf.errors.InvalidArgumentError: if batch_size or num_steps are too high. + """ + with tf.name_scope(name, "PTBProducer", [raw_data, batch_size, num_steps]): + raw_data = tf.convert_to_tensor(raw_data, name="raw_data", dtype=tf.int32) + + data_len = tf.size(raw_data) + batch_len = data_len // batch_size + data = tf.reshape(raw_data[0 : batch_size * batch_len], + [batch_size, batch_len]) + + epoch_size = (batch_len - 1) // num_steps + assertion = tf.assert_positive( + epoch_size, + message="epoch_size == 0, decrease batch_size or num_steps") + with tf.control_dependencies([assertion]): + epoch_size = tf.identity(epoch_size, name="epoch_size") + + i = tf.train.range_input_producer(epoch_size, shuffle=False).dequeue() + x = tf.strided_slice(data, [0, i * num_steps], + [batch_size, (i + 1) * num_steps]) + x.set_shape([batch_size, num_steps]) + y = tf.strided_slice(data, [0, i * num_steps + 1], + [batch_size, (i + 1) * num_steps + 1]) + y.set_shape([batch_size, num_steps]) + return x, y diff --git a/egs/ami/s5/local/tensorflow/run.sh b/egs/ami/s5/local/tensorflow/run.sh new file mode 100755 index 00000000000..7e868452989 --- /dev/null +++ b/egs/ami/s5/local/tensorflow/run.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +data_type=sdm1 +model_type=small + +dir=data/tensorflow/ +mkdir -p $dir + +cat data/$data_type/train/text | awk '{for(i=2;i<=NF;i++)print $i}' | sort | uniq -c | sort -k1nr | head -n 9998 | awk '{print $2}' > $dir/wordlist + +for i in train dev eval; do + cat data/$data_type/$i/text | awk -v w=$dir/wordlist 'BEGIN{while((getline0)d[$1]=1}{for(i=2;i<=NF;i++){if(d[$i]==1){s=$i}else{s=""} printf("%s ",s)} print""}' | sed "s=^= =g" > $dir/$i.txt +done + +python local/tensorflow/ptb_word_lm.py --data_path=$dir --model=$model_type --save_path=$dir/rnnlm.mdl From cae43c6b57e9582844ff3121f2fdbd20cabc86f5 Mon Sep 17 00:00:00 2001 From: Hainan Xu Date: Tue, 23 May 2017 18:31:54 -0400 Subject: [PATCH 02/30] successfully did python training plus c++ eval --- egs/ami/s5/local/tensorflow/load.py | 22 +++++++ egs/ami/s5/local/tensorflow/loader.cc | 88 +++++++++++++++++++++++++++ egs/ami/s5/local/tensorflow/reader.py | 5 -- 3 files changed, 110 insertions(+), 5 deletions(-) create mode 100644 egs/ami/s5/local/tensorflow/load.py create mode 100644 egs/ami/s5/local/tensorflow/loader.cc diff --git a/egs/ami/s5/local/tensorflow/load.py b/egs/ami/s5/local/tensorflow/load.py new file mode 100644 index 00000000000..0d0959aa746 --- /dev/null +++ b/egs/ami/s5/local/tensorflow/load.py @@ -0,0 +1,22 @@ +import sys + +sys.path.insert(0,"/home/hxu/.local/lib/python2.7/site-packages/") + +import tensorflow as tf +import numpy as np +#config = tf.ConfigProto(device_count = {'GPU': 0} ) + +#with tf.Session(config=config) as sess: +with tf.Session() as sess: + a = tf.Variable(5.5, name='a') + b = tf.Variable(6.6, name='b') + c = tf.multiply(a, b, name="c") + + sess.run(tf.global_variables_initializer()) + + print a.eval() # 5.0 + print b.eval() # 6.0 + print c.eval() # 30.0 + + tf.train.write_graph(sess.graph_def, 'models/', 'graph.pb', as_text=False) + diff --git a/egs/ami/s5/local/tensorflow/loader.cc b/egs/ami/s5/local/tensorflow/loader.cc new file mode 100644 index 00000000000..b02b1f4b853 --- /dev/null +++ b/egs/ami/s5/local/tensorflow/loader.cc @@ -0,0 +1,88 @@ +#include "tensorflow/core/public/session.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" + + +using namespace tensorflow; + +int main(int argc, char* argv[]) { + // Initialize a tensorflow session + Session* session; + Status status = NewSession(SessionOptions(), &session); + if (!status.ok()) { + std::cout << status.ToString() << "\n"; + return 1; + } + + const string pathToGraph = "/export/b02/hxu/TensorFlow/save_load/models/m.meta"; + const string checkpointPath = "/export/b02/hxu/TensorFlow/save_load/models/m"; + + // Read in the protobuf graph we exported + // (The path seems to be relative to the cwd. Keep this in mind + // when using `bazel run` since the cwd isn't where you call + // `bazel run` but from inside a temp folder.) + MetaGraphDef graph_def; + status = ReadBinaryProto(Env::Default(), pathToGraph, &graph_def); + if (!status.ok()) { + std::cout << status.ToString() << "\n"; + return 1; + } + + // Add the graph to the session + status = session->Create(graph_def.graph_def()); + if (!status.ok()) { + std::cout << status.ToString() << "\n"; + return 1; + } + + Tensor checkpointPathTensor(DT_STRING, TensorShape()); + checkpointPathTensor.scalar()() = checkpointPath; + + status = session->Run( + {{ graph_def.saver_def().filename_tensor_name(), checkpointPathTensor },}, + {}, + {graph_def.saver_def().restore_op_name()}, + nullptr); + if (!status.ok()) { + std::cout << status.ToString() << "\n"; + return 1; + } + + // Setup inputs and outputs: + + Tensor a(DT_FLOAT, TensorShape()); + a.scalar()() = 5.5; + + Tensor b(DT_FLOAT, TensorShape()); + b.scalar()() = 6.6; + + std::vector> inputs = { + { "a", a }, + { "b", b }, + }; + + // The session will initialize the outputs + std::vector outputs; + + // Run the session, evaluating our "c" operation from the graph + status = session->Run(inputs, {"output"}, {}, &outputs); + if (!status.ok()) { + std::cout << status.ToString() << "\n"; + return 1; + } + + // Grab the first output (we only evaluated one graph node: "c") + // and convert the node to a scalar representation. + auto output_c = outputs[0].scalar(); + + // (There are similar methods for vectors and matrices here: + // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/public/tensor.h) + + // Print the results + std::cout << outputs[0].DebugString() << "\n"; // Tensor + std::cout << output_c() << "\n"; // 30 + + // Free any resources used by the session + session->Close(); + return 0; +} diff --git a/egs/ami/s5/local/tensorflow/reader.py b/egs/ami/s5/local/tensorflow/reader.py index f60bb0d636b..5ec03b19b51 100644 --- a/egs/ami/s5/local/tensorflow/reader.py +++ b/egs/ami/s5/local/tensorflow/reader.py @@ -40,11 +40,6 @@ def _build_vocab(filename): words, _ = list(zip(*count_pairs)) word_to_id = dict(zip(words, range(len(words)))) -# -# print(word_to_id) -# -# print("") - return word_to_id From 1302854e86859f6cf19189d32dfe99fe51d6eb03 Mon Sep 17 00:00:00 2001 From: Hainan Xu Date: Fri, 2 Jun 2017 20:03:33 -0400 Subject: [PATCH 03/30] bigger graph idea is working --- egs/ami/s5/local/tensorflow/ptb_word_lm.py | 51 +++++++++++++++------- egs/ami/s5/local/tensorflow/run.sh | 14 +++--- 2 files changed, 43 insertions(+), 22 deletions(-) diff --git a/egs/ami/s5/local/tensorflow/ptb_word_lm.py b/egs/ami/s5/local/tensorflow/ptb_word_lm.py index e1e9673fea4..1c48632f8d1 100644 --- a/egs/ami/s5/local/tensorflow/ptb_word_lm.py +++ b/egs/ami/s5/local/tensorflow/ptb_word_lm.py @@ -14,11 +14,9 @@ # ============================================================================== """Example / benchmark for building a PTB LSTM model. - Trains the model described in: (Zaremba, et. al.) Recurrent Neural Network Regularization http://arxiv.org/abs/1409.2329 - There are 3 supported model configurations: =========================================== | config | epochs | train | valid | test @@ -27,7 +25,6 @@ | medium | 39 | 48.45 | 86.16 | 82.07 | large | 55 | 37.87 | 82.62 | 78.29 The exact results may vary depending on the random initialization. - The hyperparameters used in the model: - init_scale - the initial scale of the weights - learning_rate - the initial value of the learning rate @@ -40,29 +37,24 @@ - keep_prob - the probability of keeping weights in the dropout layer - lr_decay - the decay of the learning rate for each epoch after "max_epoch" - batch_size - the batch size - The data required for this example is in the data/ dir of the PTB dataset from Tomas Mikolov's webpage: - $ wget http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz $ tar xvf simple-examples.tgz - To run: - $ python ptb_word_lm.py --data_path=simple-examples/data/ - """ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import inspect -import time - import sys sys.path.insert(0,"/home/hxu/.local/lib/python2.7/site-packages/") +import inspect +import time + import numpy as np import tensorflow as tf @@ -136,10 +128,42 @@ def attn_cell(): self._initial_state = cell.zero_state(batch_size, data_type()) + + # first implement the less efficient version + test_word_in = tf.placeholder(tf.int32, [1, 1]) + test_word_out = tf.placeholder(tf.int32, [1, 1]) + test_input_state_c = tf.placeholder(tf.float32, [1, size]) + test_input_state_h = tf.placeholder(tf.float32, [1, size]) + test_input_state = tf.contrib.rnn.LSTMStateTuple(test_input_state_c, test_input_state_h) + +# print ("want to be", self._initial_state) +# print ("it actually is ", input_state) with tf.device("/cpu:0"): embedding = tf.get_variable( "embedding", [vocab_size, size], dtype=data_type()) + +# print("should be ", input_.input_data) +# print("is ", test_word) + inputs = tf.nn.embedding_lookup(embedding, input_.input_data) + test_inputs = tf.nn.embedding_lookup(embedding, test_word_in) +# print("should be ", inputs) +# print("is ", test_inputs) + + # test time + with tf.variable_scope("RNN"): +# tf.get_variable_scope().reuse_variables() + (test_cell_output, test_output_state) = cell(test_inputs[:, 0, :], [test_input_state]) + + softmax_w = tf.get_variable( + "softmax_w", [size, vocab_size], dtype=data_type()) + softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=data_type()) + + test_logits = tf.matmul(test_cell_output, softmax_w) + softmax_b + test_softmaxed = tf.nn.softmax(test_logits) + print("test softmaxed is ", test_softmaxed) + p_word = test_softmaxed[0, test_word_out[0,0]] +# p_word = tf.float32(test_softmaxed[:, test_word_out], name="p_out") if is_training and config.keep_prob < 1: inputs = tf.nn.dropout(inputs, config.keep_prob) @@ -157,14 +181,11 @@ def attn_cell(): state = self._initial_state with tf.variable_scope("RNN"): for time_step in range(num_steps): - if time_step > 0: tf.get_variable_scope().reuse_variables() + if time_step > -1: tf.get_variable_scope().reuse_variables() (cell_output, state) = cell(inputs[:, time_step, :], state) outputs.append(cell_output) output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, size]) - softmax_w = tf.get_variable( - "softmax_w", [size, vocab_size], dtype=data_type()) - softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=data_type()) logits = tf.matmul(output, softmax_w) + softmax_b loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example( [logits], diff --git a/egs/ami/s5/local/tensorflow/run.sh b/egs/ami/s5/local/tensorflow/run.sh index 7e868452989..e869f68873d 100755 --- a/egs/ami/s5/local/tensorflow/run.sh +++ b/egs/ami/s5/local/tensorflow/run.sh @@ -1,15 +1,15 @@ #!/bin/bash data_type=sdm1 -model_type=small +model_type=test dir=data/tensorflow/ mkdir -p $dir -cat data/$data_type/train/text | awk '{for(i=2;i<=NF;i++)print $i}' | sort | uniq -c | sort -k1nr | head -n 9998 | awk '{print $2}' > $dir/wordlist +#cat data/$data_type/train/text | awk '{for(i=2;i<=NF;i++)print $i}' | sort | uniq -c | sort -k1nr | head -n 9998 | awk '{print $2}' > $dir/wordlist +# +#for i in train dev eval; do +# cat data/$data_type/$i/text | awk -v w=$dir/wordlist 'BEGIN{while((getline0)d[$1]=1}{for(i=2;i<=NF;i++){if(d[$i]==1){s=$i}else{s=""} printf("%s ",s)} print""}' | sed "s=^= =g" > $dir/$i.txt +#done -for i in train dev eval; do - cat data/$data_type/$i/text | awk -v w=$dir/wordlist 'BEGIN{while((getline0)d[$1]=1}{for(i=2;i<=NF;i++){if(d[$i]==1){s=$i}else{s=""} printf("%s ",s)} print""}' | sed "s=^= =g" > $dir/$i.txt -done - -python local/tensorflow/ptb_word_lm.py --data_path=$dir --model=$model_type --save_path=$dir/rnnlm.mdl +python local/tensorflow/ptb_word_lm.py --data_path=$dir --model=$model_type --save_path=$dir/model From e148dbbc3edfb32711d13d0e24b653942624b8e1 Mon Sep 17 00:00:00 2001 From: Hainan Xu Date: Mon, 5 Jun 2017 18:02:05 -0400 Subject: [PATCH 04/30] an initial working version of lstm LM that is accessible from C++ --- egs/ami/s5/local/tensorflow/rnnlm.py | 417 +++++++++++++++++++++++++++ egs/ami/s5/local/tensorflow/run.sh | 3 +- 2 files changed, 419 insertions(+), 1 deletion(-) create mode 100644 egs/ami/s5/local/tensorflow/rnnlm.py diff --git a/egs/ami/s5/local/tensorflow/rnnlm.py b/egs/ami/s5/local/tensorflow/rnnlm.py new file mode 100644 index 00000000000..b3870cc0919 --- /dev/null +++ b/egs/ami/s5/local/tensorflow/rnnlm.py @@ -0,0 +1,417 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Example / benchmark for building a PTB LSTM model. +Trains the model described in: +(Zaremba, et. al.) Recurrent Neural Network Regularization +http://arxiv.org/abs/1409.2329 +There are 3 supported model configurations: +=========================================== +| config | epochs | train | valid | test +=========================================== +| small | 13 | 37.99 | 121.39 | 115.91 +| medium | 39 | 48.45 | 86.16 | 82.07 +| large | 55 | 37.87 | 82.62 | 78.29 +The exact results may vary depending on the random initialization. +The hyperparameters used in the model: +- init_scale - the initial scale of the weights +- learning_rate - the initial value of the learning rate +- max_grad_norm - the maximum permissible norm of the gradient +- num_layers - the number of LSTM layers +- num_steps - the number of unrolled steps of LSTM +- hidden_size - the number of LSTM units +- max_epoch - the number of epochs trained with the initial learning rate +- max_max_epoch - the total number of epochs for training +- keep_prob - the probability of keeping weights in the dropout layer +- lr_decay - the decay of the learning rate for each epoch after "max_epoch" +- batch_size - the batch size +The data required for this example is in the data/ dir of the +PTB dataset from Tomas Mikolov's webpage: +$ wget http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz +$ tar xvf simple-examples.tgz +To run: +$ python ptb_word_lm.py --data_path=simple-examples/data/ +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import sys + +sys.path.insert(0,"/home/hxu/.local/lib/python2.7/site-packages/") + +import inspect +import time + +import numpy as np +import tensorflow as tf + +import reader + +flags = tf.flags +logging = tf.logging + +flags.DEFINE_string( + "model", "small", + "A type of model. Possible options are: small, medium, large.") +flags.DEFINE_string("data_path", None, + "Where the training/test data is stored.") +flags.DEFINE_string("save_path", None, + "Model output directory.") +flags.DEFINE_bool("use_fp16", False, + "Train using 16-bit floats instead of 32bit floats") + +FLAGS = flags.FLAGS + + +def data_type(): + return tf.float16 if FLAGS.use_fp16 else tf.float32 + + +class PTBInput(object): + """The input data.""" + + def __init__(self, config, data, name=None): + self.batch_size = batch_size = config.batch_size + self.num_steps = num_steps = config.num_steps + self.epoch_size = ((len(data) // batch_size) - 1) // num_steps + self.input_data, self.targets = reader.ptb_producer( + data, batch_size, num_steps, name=name) + + +class PTBModel(object): + """The PTB model.""" + + def __init__(self, is_training, config, input_): + self._input = input_ + + batch_size = input_.batch_size + num_steps = input_.num_steps + size = config.hidden_size + vocab_size = config.vocab_size + + # Slightly better results can be obtained with forget gate biases + # initialized to 1 but the hyperparameters of the model would need to be + # different than reported in the paper. + def lstm_cell(): + # With the latest TensorFlow source code (as of Mar 27, 2017), + # the BasicLSTMCell will need a reuse parameter which is unfortunately not + # defined in TensorFlow 1.0. To maintain backwards compatibility, we add + # an argument check here: + if 'reuse' in inspect.getargspec( + tf.contrib.rnn.BasicLSTMCell.__init__).args: + return tf.contrib.rnn.BasicLSTMCell( + size, forget_bias=0.0, state_is_tuple=True, + reuse=tf.get_variable_scope().reuse) + else: + return tf.contrib.rnn.BasicLSTMCell( + size, forget_bias=0.0, state_is_tuple=True) + attn_cell = lstm_cell + if is_training and config.keep_prob < 1: + def attn_cell(): + return tf.contrib.rnn.DropoutWrapper( + lstm_cell(), output_keep_prob=config.keep_prob) + self.cell = tf.contrib.rnn.MultiRNNCell( + [attn_cell() for _ in range(config.num_layers)], state_is_tuple=True) + + self._initial_state = self.cell.zero_state(batch_size, data_type()) + + + # first implement the less efficient version + test_word_in = tf.placeholder(tf.int32, [1, 1], name="test_word_in") + test_word_out = tf.placeholder(tf.int32, [1, 1], name="test_word_out") + test_input_state_c = tf.placeholder(tf.float32, [1, size], name="test_state_c") + test_input_state_h = tf.placeholder(tf.float32, [1, size], name="test_state_h") + test_input_state = tf.contrib.rnn.LSTMStateTuple(test_input_state_c, test_input_state_h) + +# print ("want to be", self._initial_state) +# print ("it actually is ", input_state) + with tf.device("/cpu:0"): + self.embedding = tf.get_variable( + "embedding", [vocab_size, size], dtype=data_type()) + +# print("should be ", input_.input_data) +# print("is ", test_word) + + inputs = tf.nn.embedding_lookup(self.embedding, input_.input_data) + test_inputs = tf.nn.embedding_lookup(self.embedding, test_word_in) +# print("should be ", inputs) +# print("is ", test_inputs) + + # test time + with tf.variable_scope("RNN"): +# tf.get_variable_scope().reuse_variables() + (test_cell_output, test_output_state) = self.cell(test_inputs[:, 0, :], [test_input_state]) + + softmax_w = tf.get_variable( + "softmax_w", [size, vocab_size], dtype=data_type()) + softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=data_type()) + + test_logits = tf.matmul(test_cell_output, softmax_w) + softmax_b + test_softmaxed = tf.nn.softmax(test_logits) + print("test softmaxed is ", test_softmaxed) + p_word = test_softmaxed[0, test_word_out[0,0]] + test_out = tf.identity(p_word, name="test_out") +# p_word = tf.float32(test_softmaxed[:, test_word_out], name="p_out") + + if is_training and config.keep_prob < 1: + inputs = tf.nn.dropout(inputs, config.keep_prob) + + # Simplified version of models/tutorials/rnn/rnn.py's rnn(). + # This builds an unrolled LSTM for tutorial purposes only. + # In general, use the rnn() or state_saving_rnn() from rnn.py. + # + # The alternative version of the code below is: + # + # inputs = tf.unstack(inputs, num=num_steps, axis=1) + # outputs, state = tf.contrib.rnn.static_rnn( + # cell, inputs, initial_state=self._initial_state) + outputs = [] + state = self._initial_state + with tf.variable_scope("RNN"): + for time_step in range(num_steps): + if time_step > -1: tf.get_variable_scope().reuse_variables() + (cell_output, state) = self.cell(inputs[:, time_step, :], state) + outputs.append(cell_output) + + output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, size]) + logits = tf.matmul(output, softmax_w) + softmax_b + loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example( + [logits], + [tf.reshape(input_.targets, [-1])], + [tf.ones([batch_size * num_steps], dtype=data_type())]) + self._cost = cost = tf.reduce_sum(loss) / batch_size + self._final_state = state + + if not is_training: + return + + self._lr = tf.Variable(0.0, trainable=False) + tvars = tf.trainable_variables() + grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), + config.max_grad_norm) + optimizer = tf.train.GradientDescentOptimizer(self._lr) + self._train_op = optimizer.apply_gradients( + zip(grads, tvars), + global_step=tf.contrib.framework.get_or_create_global_step()) + + self._new_lr = tf.placeholder( + tf.float32, shape=[], name="new_learning_rate") + self._lr_update = tf.assign(self._lr, self._new_lr) + + def assign_lr(self, session, lr_value): + session.run(self._lr_update, feed_dict={self._new_lr: lr_value}) + + @property + def input(self): + return self._input + + @property + def initial_state(self): + return self._initial_state + + @property + def cost(self): + return self._cost + + @property + def final_state(self): + return self._final_state + + @property + def lr(self): + return self._lr + + @property + def train_op(self): + return self._train_op + + +class SmallConfig(object): + """Small config.""" + init_scale = 0.1 + learning_rate = 1.0 + max_grad_norm = 5 + num_layers = 2 + num_steps = 20 + hidden_size = 200 + max_epoch = 4 + max_max_epoch = 13 + keep_prob = 1.0 + lr_decay = 0.5 + batch_size = 20 + vocab_size = 10000 + + +class MediumConfig(object): + """Medium config.""" + init_scale = 0.05 + learning_rate = 1.0 + max_grad_norm = 5 + num_layers = 2 + num_steps = 35 + hidden_size = 650 + max_epoch = 6 + max_max_epoch = 39 + keep_prob = 0.5 + lr_decay = 0.8 + batch_size = 20 + vocab_size = 10000 + + +class LargeConfig(object): + """Large config.""" + init_scale = 0.04 + learning_rate = 1.0 + max_grad_norm = 10 + num_layers = 2 + num_steps = 35 + hidden_size = 1500 + max_epoch = 14 + max_max_epoch = 55 + keep_prob = 0.35 + lr_decay = 1 / 1.15 + batch_size = 20 + vocab_size = 10000 + + +class TestConfig(object): + """Tiny config, for testing.""" + init_scale = 0.1 + learning_rate = 1.0 + max_grad_norm = 1 + num_layers = 1 + num_steps = 2 + hidden_size = 2 + max_epoch = 1 + max_max_epoch = 1 + keep_prob = 1.0 + lr_decay = 0.5 + batch_size = 20 + vocab_size = 10000 + + +def run_epoch(session, model, eval_op=None, verbose=False): + """Runs the model on the given data.""" + start_time = time.time() + costs = 0.0 + iters = 0 + state = session.run(model.initial_state) + + fetches = { + "cost": model.cost, + "final_state": model.final_state, + } + if eval_op is not None: + fetches["eval_op"] = eval_op + + for step in range(model.input.epoch_size): + feed_dict = {} + for i, (c, h) in enumerate(model.initial_state): + feed_dict[c] = state[i].c + feed_dict[h] = state[i].h + + vals = session.run(fetches, feed_dict) + cost = vals["cost"] + state = vals["final_state"] + + costs += cost + iters += model.input.num_steps + + if verbose and step % (model.input.epoch_size // 10) == 10: + print("%.3f perplexity: %.3f speed: %.0f wps" % + (step * 1.0 / model.input.epoch_size, np.exp(costs / iters), + iters * model.input.batch_size / (time.time() - start_time))) + + return np.exp(costs / iters) + + +def get_config(): + if FLAGS.model == "small": + return SmallConfig() + elif FLAGS.model == "medium": + return MediumConfig() + elif FLAGS.model == "large": + return LargeConfig() + elif FLAGS.model == "test": + return TestConfig() + else: + raise ValueError("Invalid model: %s", FLAGS.model) + + +def main(_): + if not FLAGS.data_path: + raise ValueError("Must set --data_path to PTB data directory") + + raw_data = reader.ptb_raw_data(FLAGS.data_path) + train_data, valid_data, test_data, _ = raw_data + + config = get_config() + eval_config = get_config() + eval_config.batch_size = 1 + eval_config.num_steps = 1 + + with tf.Graph().as_default(): + initializer = tf.random_uniform_initializer(-config.init_scale, + config.init_scale) + + with tf.name_scope("Train"): + train_input = PTBInput(config=config, data=train_data, name="TrainInput") + with tf.variable_scope("Model", reuse=None, initializer=initializer): + m = PTBModel(is_training=True, config=config, input_=train_input) + tf.summary.scalar("Training Loss", m.cost) + tf.summary.scalar("Learning Rate", m.lr) + +# with tf.name_scope("Valid"): +# valid_input = PTBInput(config=config, data=valid_data, name="ValidInput") +# with tf.variable_scope("Model", reuse=True, initializer=initializer): +# mvalid = PTBModel(is_training=False, config=config, input_=valid_input) +# tf.summary.scalar("Validation Loss", mvalid.cost) +# +# with tf.name_scope("Test"): +# test_input = PTBInput(config=eval_config, data=test_data, name="TestInput") +# with tf.variable_scope("Model", reuse=True, initializer=initializer): +# mtest = PTBModel(is_training=False, config=eval_config, +# input_=test_input) + +# saver = tf.train.Saver({"embedding": m.embedding}) +# saver = tf.train.Saver({"embedding": m.embedding, "lstm": m.cell}) + sv = tf.train.Supervisor(logdir=FLAGS.save_path) + with sv.managed_session() as session: + for i in range(config.max_max_epoch): + lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0) + m.assign_lr(session, config.learning_rate * lr_decay) + + print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr))) + train_perplexity = run_epoch(session, m, eval_op=m.train_op, + verbose=True) + +# print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity)) +# valid_perplexity = run_epoch(session, mvalid) +# print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity)) + +# test_perplexity = run_epoch(session, mtest) +# print("Test Perplexity: %.3f" % test_perplexity) + + if FLAGS.save_path: +# saver = tf.train.Saver() + print("Saving model to %s." % FLAGS.save_path) +# saver.save(session, FLAGS.save_path, global_step=sv.global_step) + sv.saver.save(session, FLAGS.save_path) +# sv.saver.save(session, FLAGS.save_path, global_step=sv.global_step) + + +if __name__ == "__main__": + tf.app.run() diff --git a/egs/ami/s5/local/tensorflow/run.sh b/egs/ami/s5/local/tensorflow/run.sh index e869f68873d..22947616967 100755 --- a/egs/ami/s5/local/tensorflow/run.sh +++ b/egs/ami/s5/local/tensorflow/run.sh @@ -12,4 +12,5 @@ mkdir -p $dir # cat data/$data_type/$i/text | awk -v w=$dir/wordlist 'BEGIN{while((getline0)d[$1]=1}{for(i=2;i<=NF;i++){if(d[$i]==1){s=$i}else{s=""} printf("%s ",s)} print""}' | sed "s=^= =g" > $dir/$i.txt #done -python local/tensorflow/ptb_word_lm.py --data_path=$dir --model=$model_type --save_path=$dir/model +#python local/tensorflow/ptb_word_lm.py --data_path=$dir --model=$model_type --save_path=$dir/model +python local/tensorflow/rnnlm.py --data_path=$dir --model=$model_type --save_path=$dir/model From 56d9c890e7540eae4b9abf59eeec2c5657bf83b8 Mon Sep 17 00:00:00 2001 From: Hainan Xu Date: Mon, 5 Jun 2017 20:16:14 -0400 Subject: [PATCH 05/30] supports multilayer LSTM now --- egs/ami/s5/local/tensorflow/rnnlm.py | 58 ++++++++++++++++------------ egs/ami/s5/local/tensorflow/run.sh | 13 +++---- 2 files changed, 40 insertions(+), 31 deletions(-) diff --git a/egs/ami/s5/local/tensorflow/rnnlm.py b/egs/ami/s5/local/tensorflow/rnnlm.py index b3870cc0919..7fff2c7d2b4 100644 --- a/egs/ami/s5/local/tensorflow/rnnlm.py +++ b/egs/ami/s5/local/tensorflow/rnnlm.py @@ -127,14 +127,24 @@ def attn_cell(): [attn_cell() for _ in range(config.num_layers)], state_is_tuple=True) self._initial_state = self.cell.zero_state(batch_size, data_type()) + self._initial_state_single = self.cell.zero_state(1, data_type()) + + self.initial = tf.reshape(tf.stack(axis=0, values=self._initial_state_single), [config.num_layers, 2, 1, size], name="test_initial_state") # first implement the less efficient version test_word_in = tf.placeholder(tf.int32, [1, 1], name="test_word_in") test_word_out = tf.placeholder(tf.int32, [1, 1], name="test_word_out") - test_input_state_c = tf.placeholder(tf.float32, [1, size], name="test_state_c") - test_input_state_h = tf.placeholder(tf.float32, [1, size], name="test_state_h") - test_input_state = tf.contrib.rnn.LSTMStateTuple(test_input_state_c, test_input_state_h) +# test_input_state_c = tf.placeholder(tf.float32, [1, size], name="test_state_c") +# test_input_state_h = tf.placeholder(tf.float32, [1, size], name="test_state_h") + state_placeholder = tf.placeholder(tf.float32, [config.num_layers, 2, 1, size], name="test_state") + l = tf.unstack(state_placeholder, axis=0) + test_input_state = tuple( + [tf.contrib.rnn.LSTMStateTuple(l[idx][0],l[idx][1]) + for idx in range(config.num_layers)] + ) + +# test_input_state = tf.contrib.rnn.LSTMStateTuple(test_input_state_c, test_input_state_h) # print ("want to be", self._initial_state) # print ("it actually is ", input_state) @@ -153,15 +163,16 @@ def attn_cell(): # test time with tf.variable_scope("RNN"): # tf.get_variable_scope().reuse_variables() - (test_cell_output, test_output_state) = self.cell(test_inputs[:, 0, :], [test_input_state]) + (test_cell_output, test_output_state) = self.cell(test_inputs[:, 0, :], test_input_state) + test_out_state = tf.reshape(tf.stack(axis=1, values=test_output_state), [config.num_layers, 2, 1, size], name="test_state_out") softmax_w = tf.get_variable( "softmax_w", [size, vocab_size], dtype=data_type()) softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=data_type()) test_logits = tf.matmul(test_cell_output, softmax_w) + softmax_b test_softmaxed = tf.nn.softmax(test_logits) - print("test softmaxed is ", test_softmaxed) + p_word = test_softmaxed[0, test_word_out[0,0]] test_out = tf.identity(p_word, name="test_out") # p_word = tf.float32(test_softmaxed[:, test_word_out], name="p_out") @@ -247,8 +258,8 @@ class SmallConfig(object): num_layers = 2 num_steps = 20 hidden_size = 200 - max_epoch = 4 - max_max_epoch = 13 + max_epoch = 1 #4 + max_max_epoch = 1 #13 keep_prob = 1.0 lr_decay = 0.5 batch_size = 20 @@ -374,20 +385,19 @@ def main(_): tf.summary.scalar("Training Loss", m.cost) tf.summary.scalar("Learning Rate", m.lr) -# with tf.name_scope("Valid"): -# valid_input = PTBInput(config=config, data=valid_data, name="ValidInput") -# with tf.variable_scope("Model", reuse=True, initializer=initializer): -# mvalid = PTBModel(is_training=False, config=config, input_=valid_input) -# tf.summary.scalar("Validation Loss", mvalid.cost) -# -# with tf.name_scope("Test"): -# test_input = PTBInput(config=eval_config, data=test_data, name="TestInput") -# with tf.variable_scope("Model", reuse=True, initializer=initializer): -# mtest = PTBModel(is_training=False, config=eval_config, -# input_=test_input) + with tf.name_scope("Valid"): + valid_input = PTBInput(config=config, data=valid_data, name="ValidInput") + with tf.variable_scope("Model", reuse=True, initializer=initializer): + mvalid = PTBModel(is_training=False, config=config, input_=valid_input) + tf.summary.scalar("Validation Loss", mvalid.cost) + + with tf.name_scope("Test"): + test_input = PTBInput(config=eval_config, data=test_data, name="TestInput") + with tf.variable_scope("Model", reuse=True, initializer=initializer): + mtest = PTBModel(is_training=False, config=eval_config, + input_=test_input) # saver = tf.train.Saver({"embedding": m.embedding}) -# saver = tf.train.Saver({"embedding": m.embedding, "lstm": m.cell}) sv = tf.train.Supervisor(logdir=FLAGS.save_path) with sv.managed_session() as session: for i in range(config.max_max_epoch): @@ -398,12 +408,12 @@ def main(_): train_perplexity = run_epoch(session, m, eval_op=m.train_op, verbose=True) -# print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity)) -# valid_perplexity = run_epoch(session, mvalid) -# print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity)) + print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity)) + valid_perplexity = run_epoch(session, mvalid) + print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity)) -# test_perplexity = run_epoch(session, mtest) -# print("Test Perplexity: %.3f" % test_perplexity) + test_perplexity = run_epoch(session, mtest) + print("Test Perplexity: %.3f" % test_perplexity) if FLAGS.save_path: # saver = tf.train.Saver() diff --git a/egs/ami/s5/local/tensorflow/run.sh b/egs/ami/s5/local/tensorflow/run.sh index 22947616967..5baa2337741 100755 --- a/egs/ami/s5/local/tensorflow/run.sh +++ b/egs/ami/s5/local/tensorflow/run.sh @@ -1,16 +1,15 @@ #!/bin/bash data_type=sdm1 -model_type=test +model_type=small dir=data/tensorflow/ mkdir -p $dir -#cat data/$data_type/train/text | awk '{for(i=2;i<=NF;i++)print $i}' | sort | uniq -c | sort -k1nr | head -n 9998 | awk '{print $2}' > $dir/wordlist -# -#for i in train dev eval; do -# cat data/$data_type/$i/text | awk -v w=$dir/wordlist 'BEGIN{while((getline0)d[$1]=1}{for(i=2;i<=NF;i++){if(d[$i]==1){s=$i}else{s=""} printf("%s ",s)} print""}' | sed "s=^= =g" > $dir/$i.txt -#done +cat data/$data_type/train/text | awk '{for(i=2;i<=NF;i++)print $i}' | sort | uniq -c | sort -k1nr | head -n 9998 | awk '{print $2}' > $dir/wordlist + +for i in train dev eval; do + cat data/$data_type/$i/text | awk -v w=$dir/wordlist 'BEGIN{while((getline0)d[$1]=1}{for(i=2;i<=NF;i++){if(d[$i]==1){s=$i}else{s=""} printf("%s ",s)} print""}' | sed "s=^= =g" > $dir/$i.txt +done -#python local/tensorflow/ptb_word_lm.py --data_path=$dir --model=$model_type --save_path=$dir/model python local/tensorflow/rnnlm.py --data_path=$dir --model=$model_type --save_path=$dir/model From 1df10a84c3bcc47c76609ca636ed24dec758002b Mon Sep 17 00:00:00 2001 From: Hainan Xu Date: Tue, 6 Jun 2017 14:03:58 -0400 Subject: [PATCH 06/30] add script to install bazel --- tools/install_tensorflow.sh | 40 +++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100755 tools/install_tensorflow.sh diff --git a/tools/install_tensorflow.sh b/tools/install_tensorflow.sh new file mode 100755 index 00000000000..e5a7513063f --- /dev/null +++ b/tools/install_tensorflow.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +export JAVA_HOME=/export/b02/hxu/TensorFlow/java/jdk1.8.0_121 +export PATH=/export/b02/hxu/TensorFlow/java/jdk1.8.0_121/bin/:$PATH + +#git clone https://github.com/tensorflow/tensorflow + +#cd tensorflow +# +#git checkout r1.0 +# +#cd ../ +# +##git clone https://github.com/google/bazel/ + +[ ! -f bazel-0.4.5-dist.zip ] && wget https://github.com/bazelbuild/bazel/releases/download/0.4.5/bazel-0.4.5-dist.zip +mkdir -p bazel +cd bazel + +unzip ../bazel-0.4.5-dist.zip + +./compile.sh + +#mkdir build +#./compile.sh compile build/ + + + + + + + + + + + + + + + From bfb4ad2c9cd8bb5b861dc66e80917bd2573ee59e Mon Sep 17 00:00:00 2001 From: Hainan Xu Date: Tue, 6 Jun 2017 15:26:57 -0400 Subject: [PATCH 07/30] add script to compile tensorflow with simple RNN c++example --- egs/ami/s5/local/tensorflow/rnnlm.py | 4 +- egs/ami/s5/local/tensorflow/run.sh | 13 ++-- src/tensorflow/loader_rnn.cc | 100 +++++++++++++++++++++++++++ tools/install_tensorflow.sh | 37 ++++++---- 4 files changed, 135 insertions(+), 19 deletions(-) create mode 100644 src/tensorflow/loader_rnn.cc diff --git a/egs/ami/s5/local/tensorflow/rnnlm.py b/egs/ami/s5/local/tensorflow/rnnlm.py index 7fff2c7d2b4..dfc058e309d 100644 --- a/egs/ami/s5/local/tensorflow/rnnlm.py +++ b/egs/ami/s5/local/tensorflow/rnnlm.py @@ -258,8 +258,8 @@ class SmallConfig(object): num_layers = 2 num_steps = 20 hidden_size = 200 - max_epoch = 1 #4 - max_max_epoch = 1 #13 + max_epoch = 4 + max_max_epoch = 13 keep_prob = 1.0 lr_decay = 0.5 batch_size = 20 diff --git a/egs/ami/s5/local/tensorflow/run.sh b/egs/ami/s5/local/tensorflow/run.sh index 5baa2337741..dfc5a0749ba 100755 --- a/egs/ami/s5/local/tensorflow/run.sh +++ b/egs/ami/s5/local/tensorflow/run.sh @@ -6,10 +6,13 @@ model_type=small dir=data/tensorflow/ mkdir -p $dir -cat data/$data_type/train/text | awk '{for(i=2;i<=NF;i++)print $i}' | sort | uniq -c | sort -k1nr | head -n 9998 | awk '{print $2}' > $dir/wordlist +#cat data/$data_type/train/text | awk '{for(i=2;i<=NF;i++)print $i}' | sort | uniq -c | sort -k1nr | head -n 9998 | awk '{print $2}' > $dir/wordlist +# +#for i in train dev eval; do +# cat data/$data_type/$i/text | awk -v w=$dir/wordlist 'BEGIN{while((getline0)d[$1]=1}{for(i=2;i<=NF;i++){if(d[$i]==1){s=$i}else{s=""} printf("%s ",s)} print""}' | sed "s=^= =g" > $dir/$i.txt +#done -for i in train dev eval; do - cat data/$data_type/$i/text | awk -v w=$dir/wordlist 'BEGIN{while((getline0)d[$1]=1}{for(i=2;i<=NF;i++){if(d[$i]==1){s=$i}else{s=""} printf("%s ",s)} print""}' | sed "s=^= =g" > $dir/$i.txt -done -python local/tensorflow/rnnlm.py --data_path=$dir --model=$model_type --save_path=$dir/model +#python local/tensorflow/rnnlm.py --data_path=$dir --model=small --save_path=$dir/model.small +python local/tensorflow/rnnlm.py --data_path=$dir --model=medium --save_path=$dir/model.medium +#python local/tensorflow/rnnlm.py --data_path=$dir --model=large --save_path=$dir/model.large diff --git a/src/tensorflow/loader_rnn.cc b/src/tensorflow/loader_rnn.cc new file mode 100644 index 00000000000..33b6fcb3c5d --- /dev/null +++ b/src/tensorflow/loader_rnn.cc @@ -0,0 +1,100 @@ +#include "tensorflow/core/public/session.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" + +using namespace tensorflow; + +int main(int argc, char* argv[]) { + // Initialize a tensorflow session + Session* session; + Status status = NewSession(SessionOptions(), &session); + if (!status.ok()) { + std::cout << status.ToString() << "\n"; + return 1; + } + + const string pathToGraph = "/export/b02/hxu/TensorFlow/kaldi/egs/ami/s5/data/tensorflow/model.small.meta"; + const string checkpointPath = "/export/b02/hxu/TensorFlow/kaldi/egs/ami/s5/data/tensorflow/model.small"; + + // Read in the protobuf graph we exported + // (The path seems to be relative to the cwd. Keep this in mind + // when using `bazel run` since the cwd isn't where you call + // `bazel run` but from inside a temp folder.) + MetaGraphDef graph_def; + status = ReadBinaryProto(Env::Default(), pathToGraph, &graph_def); + if (!status.ok()) { + std::cout << status.ToString() << "\n"; + return 1; + } + + // Add the graph to the session + status = session->Create(graph_def.graph_def()); + if (!status.ok()) { + std::cout << status.ToString() << "\n"; + return 1; + } + + Tensor checkpointPathTensor(DT_STRING, TensorShape()); + checkpointPathTensor.scalar()() = checkpointPath; + + status = session->Run( + {{ graph_def.saver_def().filename_tensor_name(), checkpointPathTensor },}, + {}, + {graph_def.saver_def().restore_op_name()}, + nullptr); + if (!status.ok()) { + std::cout << status.ToString() << "\n"; + return 1; + } + + // Setup inputs and outputs: + std::vector state; +// std::vector state(DT_FLOAT, {2, 2, 1, 200}); + status = session->Run(std::vector>(), {"Train/Model/test_initial_state"}, {}, &state); + + for (int32 word_out = 0; word_out < 10000; word_out++) { + Tensor in_word(DT_INT32, {1, 1}); + in_word.scalar()() = (word_out + 9999) % 10000; + + Tensor out_word(DT_INT32, {1, 1}); + out_word.scalar()() = word_out; + + // num-layers + // 2 (c and h) + // 1 (batchsize) + // hidden-size + + std::vector> inputs = { + {"Train/Model/test_word_in", in_word}, + {"Train/Model/test_word_out", out_word}, + {"Train/Model/test_state", state[0]}, + }; + + // The session will initialize the outputs + std::vector outputs; + + // Run the session, evaluating our "c" operation from the graph + status = session->Run(inputs, {"Train/Model/test_out", "Train/Model/test_state_out"}, {}, &outputs); + + if (!status.ok()) { + std::cout << status.ToString() << "\n"; + return 1; + } + + // Grab the first output (we only evaluated one graph node: "c") + // and convert the node to a scalar representation. + + // (There are similar methods for vectors and matrices here: + // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/public/tensor.h) + + // Print the results + std::cout << word_out << ": " << outputs[0].DebugString() << "\n"; // Tensor + std::cout << word_out << ": " << outputs[1].DebugString() << "\n"; // Tensor + state[0] = outputs[1]; +// std::cout << output_c() << "\n"; // 30 + } + + // Free any resources used by the session + session->Close(); + return 0; +} diff --git a/tools/install_tensorflow.sh b/tools/install_tensorflow.sh index e5a7513063f..f2bc7e9a9e1 100755 --- a/tools/install_tensorflow.sh +++ b/tools/install_tensorflow.sh @@ -1,30 +1,43 @@ #!/bin/bash +set -e + +export HOME=/export/b02/hxu export JAVA_HOME=/export/b02/hxu/TensorFlow/java/jdk1.8.0_121 export PATH=/export/b02/hxu/TensorFlow/java/jdk1.8.0_121/bin/:$PATH -#git clone https://github.com/tensorflow/tensorflow +git clone https://github.com/tensorflow/tensorflow -#cd tensorflow -# -#git checkout r1.0 -# -#cd ../ -# -##git clone https://github.com/google/bazel/ [ ! -f bazel-0.4.5-dist.zip ] && wget https://github.com/bazelbuild/bazel/releases/download/0.4.5/bazel-0.4.5-dist.zip mkdir -p bazel cd bazel - unzip ../bazel-0.4.5-dist.zip - ./compile.sh +cd ../ + +# now bazel is built + +export PATH=$PWD/bazel/output/:$PATH + +cd tensorflow + +./configure + +cd ../ + +cd tensorflow/tensorflow +mkdir -p rnnlm +cd rnnlm -#mkdir build -#./compile.sh compile build/ +[ ! -f BUILD ] && ln -s ../../../../src/tensorflow/BUILD +[ ! -f loader_rnn.cc ] && ln -s ../../../../src/tensorflow/loader_rnn.cc +TEST_TMPDIR=tensorflow/build +echo bazel build :loader_rnn +bazel build --test_tmpdir=$TEST_TMPDIR :loader_rnn +bazel run -c opt :loader_rnn From 0c4b2b4e957b876df3b0e1d38e8bef0b5a2fa389 Mon Sep 17 00:00:00 2001 From: Hainan Xu Date: Wed, 7 Jun 2017 18:23:30 -0400 Subject: [PATCH 08/30] more files added --- src/tensorflow/BUILD | 9 +++++++ src/tensorflow/loader_rnn.cc | 10 ++++++++ tools/install_tensorflow.sh | 49 ++++++++++++++++-------------------- 3 files changed, 41 insertions(+), 27 deletions(-) create mode 100644 src/tensorflow/BUILD diff --git a/src/tensorflow/BUILD b/src/tensorflow/BUILD new file mode 100644 index 00000000000..a60fdbbc3ec --- /dev/null +++ b/src/tensorflow/BUILD @@ -0,0 +1,9 @@ +cc_binary( + name = "loader_rnn", + srcs = ["loader_rnn.cc"], + deps = [ + "//tensorflow/core:tensorflow", +# "//kaldi/base/libkaldi-base.so", + ] +) + diff --git a/src/tensorflow/loader_rnn.cc b/src/tensorflow/loader_rnn.cc index 33b6fcb3c5d..6f80fa17994 100644 --- a/src/tensorflow/loader_rnn.cc +++ b/src/tensorflow/loader_rnn.cc @@ -2,9 +2,18 @@ #include "tensorflow/core/platform/env.h" #include "tensorflow/core/protobuf/meta_graph.pb.h" +#include "base/kaldi-common.h" +//#include "fstext/fstext-lib.h" +//#include "lat/kaldi-lattice.h" +//#include "lat/lattice-functions.h" +//#include "lm/kaldi-rnnlm.h" +//#include "lm/mikolov-rnnlm-lib.h" +//#include "util/common-utils.h" + using namespace tensorflow; int main(int argc, char* argv[]) { + /* // Initialize a tensorflow session Session* session; Status status = NewSession(SessionOptions(), &session); @@ -96,5 +105,6 @@ int main(int argc, char* argv[]) { // Free any resources used by the session session->Close(); + // */ return 0; } diff --git a/tools/install_tensorflow.sh b/tools/install_tensorflow.sh index f2bc7e9a9e1..b07a636e3c2 100755 --- a/tools/install_tensorflow.sh +++ b/tools/install_tensorflow.sh @@ -5,49 +5,44 @@ set -e export HOME=/export/b02/hxu export JAVA_HOME=/export/b02/hxu/TensorFlow/java/jdk1.8.0_121 export PATH=/export/b02/hxu/TensorFlow/java/jdk1.8.0_121/bin/:$PATH +export PATH=$PWD/bazel/output/:$PATH +#export PATH=$PWD/tensorflow/bazel-out/host/bin/external/protobuf/:$PATH +export PATH=$PWD:$PATH -git clone https://github.com/tensorflow/tensorflow - +echo which protoc +which protoc -[ ! -f bazel-0.4.5-dist.zip ] && wget https://github.com/bazelbuild/bazel/releases/download/0.4.5/bazel-0.4.5-dist.zip -mkdir -p bazel -cd bazel -unzip ../bazel-0.4.5-dist.zip -./compile.sh -cd ../ +#git clone https://github.com/tensorflow/tensorflow +[ ! -f bazel.zip ] && wget https://github.com/bazelbuild/bazel/releases/download/0.5.1/bazel-0.5.1-dist.zip -O bazel.zip +#mkdir -p bazel +#cd bazel +#unzip ../bazel.zip +#./compile.sh +#cd ../ # now bazel is built - -export PATH=$PWD/bazel/output/:$PATH +git clone https://github.com/tensorflow/tensorflow cd tensorflow ./configure -cd ../ +#bazel build //tensorflow/core:framework_headers_lib +# +#bazel build //tensorflow:libtensorflow.so +bazel build //tensorflow:libtensorflow_cc.so + +exit cd tensorflow/tensorflow mkdir -p rnnlm cd rnnlm [ ! -f BUILD ] && ln -s ../../../../src/tensorflow/BUILD +[ ! -f WORKSPACE ] && ln -s ../../../../src/tensorflow/WORKSPACE [ ! -f loader_rnn.cc ] && ln -s ../../../../src/tensorflow/loader_rnn.cc +[ ! -d kaldi_src ] && ln -s ../../../../src/ kaldi_src -TEST_TMPDIR=tensorflow/build - -echo bazel build :loader_rnn bazel build --test_tmpdir=$TEST_TMPDIR :loader_rnn -bazel run -c opt :loader_rnn - - - - - - - - - - - - +#bazel run -c opt :loader_rnn From 066cc74dc6c68ffa6e7b717874c6d227cb463c7c Mon Sep 17 00:00:00 2001 From: Hainan Xu Date: Wed, 7 Jun 2017 18:38:31 -0400 Subject: [PATCH 09/30] change for spoken machines --- tools/install_tensorflow.sh | 43 +++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/tools/install_tensorflow.sh b/tools/install_tensorflow.sh index b07a636e3c2..ba933225162 100755 --- a/tools/install_tensorflow.sh +++ b/tools/install_tensorflow.sh @@ -2,18 +2,15 @@ set -e -export HOME=/export/b02/hxu -export JAVA_HOME=/export/b02/hxu/TensorFlow/java/jdk1.8.0_121 -export PATH=/export/b02/hxu/TensorFlow/java/jdk1.8.0_121/bin/:$PATH +export HOME=/home/hainanx/work +#export JAVA_HOME=/export/b02/hxu/TensorFlow/java/jdk1.8.0_121 +#export PATH=/export/b02/hxu/TensorFlow/java/jdk1.8.0_121/bin/:$PATH export PATH=$PWD/bazel/output/:$PATH #export PATH=$PWD/tensorflow/bazel-out/host/bin/external/protobuf/:$PATH export PATH=$PWD:$PATH -echo which protoc -which protoc - #git clone https://github.com/tensorflow/tensorflow -[ ! -f bazel.zip ] && wget https://github.com/bazelbuild/bazel/releases/download/0.5.1/bazel-0.5.1-dist.zip -O bazel.zip +#[ ! -f bazel.zip ] && wget https://github.com/bazelbuild/bazel/releases/download/0.5.1/bazel-0.5.1-dist.zip -O bazel.zip #mkdir -p bazel #cd bazel #unzip ../bazel.zip @@ -21,28 +18,28 @@ which protoc #cd ../ # now bazel is built -git clone https://github.com/tensorflow/tensorflow +#git clone https://github.com/tensorflow/tensorflow cd tensorflow -./configure +#./configure #bazel build //tensorflow/core:framework_headers_lib # #bazel build //tensorflow:libtensorflow.so bazel build //tensorflow:libtensorflow_cc.so -exit - -cd tensorflow/tensorflow -mkdir -p rnnlm -cd rnnlm - -[ ! -f BUILD ] && ln -s ../../../../src/tensorflow/BUILD -[ ! -f WORKSPACE ] && ln -s ../../../../src/tensorflow/WORKSPACE -[ ! -f loader_rnn.cc ] && ln -s ../../../../src/tensorflow/loader_rnn.cc -[ ! -d kaldi_src ] && ln -s ../../../../src/ kaldi_src - -bazel build --test_tmpdir=$TEST_TMPDIR :loader_rnn -#bazel run -c opt :loader_rnn - +#exit +# +#cd tensorflow/tensorflow +#mkdir -p rnnlm +#cd rnnlm +# +#[ ! -f BUILD ] && ln -s ../../../../src/tensorflow/BUILD +#[ ! -f WORKSPACE ] && ln -s ../../../../src/tensorflow/WORKSPACE +#[ ! -f loader_rnn.cc ] && ln -s ../../../../src/tensorflow/loader_rnn.cc +#[ ! -d kaldi_src ] && ln -s ../../../../src/ kaldi_src +# +#bazel build --test_tmpdir=$TEST_TMPDIR :loader_rnn +##bazel run -c opt :loader_rnn +# From 4d27c7d1a2df9a3763ff17403619a9e229f28ab0 Mon Sep 17 00:00:00 2001 From: Hainan Xu Date: Wed, 7 Jun 2017 19:10:08 -0400 Subject: [PATCH 10/30] add makefile --- src/tensorflow/Makefile | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 src/tensorflow/Makefile diff --git a/src/tensorflow/Makefile b/src/tensorflow/Makefile new file mode 100644 index 00000000000..fd0e02458dd --- /dev/null +++ b/src/tensorflow/Makefile @@ -0,0 +1,28 @@ +include ../kaldi.mk + +TENSORFLOW = ../../tools/tensorflow +BAZEL = ../../tools/bazel +KALDI_ROOT = ../../ +all: + +#EXTRA_CXXFLAGS = -Wno-sign-compare -fPIC -I ../ -I $(KALDI_ROOT)/src -DKALDI_NO_PORTAUDIO -I $(TENSORFLOW)/third_party/eigen3 -I $(BAZEL)/third_party/protobuf/3.0.0/src/ \ +# -I$(TENSORFLOW)/bazel-genfiles -I $(TENSORFLOW) \ +# -I $(TENSORFLOW)/tensorflow/contrib/makefile/gen/protobuf/include/ -I $(TENSORFLOW)/tensorflow/contrib/makefile/downloads/eigen \ + +EXTRA_CXXFLAGS = -Wno-sign-compare -fPIC -I ../ -I $(KALDI_ROOT)/src -DKALDI_NO_PORTAUDIO -I $(BAZEL)/third_party/protobuf/3.0.0/src \ + -I$(TENSORFLOW)/bazel-genfiles -I $(TENSORFLOW) \ + -I $(TENSORFLOW)/tensorflow/contrib/makefile/gen/protobuf/include/ -I $(TENSORFLOW)/tensorflow/contrib/makefile/downloads/eigen \ + -I $(TENSORFLOW)/bazel-out/host/bin/external/protobuf/ +BINFILES = loader_rnn + +OBJFILES = + +TESTFILES = + +ADDLIBS = ../lm/kaldi-lm.a ../util/kaldi-util.a ../thread/kaldi-thread.a \ + ../matrix/kaldi-matrix.a ../base/kaldi-base.a + +LDLIBS += -lz -ldl -fPIC -lrt +LDLIBS += $(OTHERLIBS) -L$(TENSORFLOW)/bazel-bin/tensorflow -ltensorflow + +include ../makefiles/default_rules.mk From 96e5a2b9f56f3f8dab9e647ced7dad7366bc5af1 Mon Sep 17 00:00:00 2001 From: Hainan Xu Date: Fri, 9 Jun 2017 18:38:02 -0400 Subject: [PATCH 11/30] tf compiles with kaldi --- src/tensorflow/BUILD | 9 ------- src/tensorflow/Makefile | 12 ++++----- src/tensorflow/loader_rnn.cc | 14 +++++----- tools/install_tensorflow.sh | 51 +++++++++++------------------------- 4 files changed, 27 insertions(+), 59 deletions(-) delete mode 100644 src/tensorflow/BUILD diff --git a/src/tensorflow/BUILD b/src/tensorflow/BUILD deleted file mode 100644 index a60fdbbc3ec..00000000000 --- a/src/tensorflow/BUILD +++ /dev/null @@ -1,9 +0,0 @@ -cc_binary( - name = "loader_rnn", - srcs = ["loader_rnn.cc"], - deps = [ - "//tensorflow/core:tensorflow", -# "//kaldi/base/libkaldi-base.so", - ] -) - diff --git a/src/tensorflow/Makefile b/src/tensorflow/Makefile index fd0e02458dd..f214a66f88a 100644 --- a/src/tensorflow/Makefile +++ b/src/tensorflow/Makefile @@ -1,7 +1,6 @@ include ../kaldi.mk TENSORFLOW = ../../tools/tensorflow -BAZEL = ../../tools/bazel KALDI_ROOT = ../../ all: @@ -9,10 +8,9 @@ all: # -I$(TENSORFLOW)/bazel-genfiles -I $(TENSORFLOW) \ # -I $(TENSORFLOW)/tensorflow/contrib/makefile/gen/protobuf/include/ -I $(TENSORFLOW)/tensorflow/contrib/makefile/downloads/eigen \ -EXTRA_CXXFLAGS = -Wno-sign-compare -fPIC -I ../ -I $(KALDI_ROOT)/src -DKALDI_NO_PORTAUDIO -I $(BAZEL)/third_party/protobuf/3.0.0/src \ - -I$(TENSORFLOW)/bazel-genfiles -I $(TENSORFLOW) \ - -I $(TENSORFLOW)/tensorflow/contrib/makefile/gen/protobuf/include/ -I $(TENSORFLOW)/tensorflow/contrib/makefile/downloads/eigen \ - -I $(TENSORFLOW)/bazel-out/host/bin/external/protobuf/ +EXTRA_CXXFLAGS = -Wno-sign-compare -fPIC -I$(TENSORFLOW)/bazel-tensorflow/external/protobuf/src -I$(TENSORFLOW)/bazel-genfiles -I$(TENSORFLOW) -I$(TENSORFLOW)/tensorflow/contrib/makefile/downloads/eigen/ +#EXTRA_CXXFLAGS = -Wno-sign-compare -fPIC -I$(TENSORFLOW)/bazel-tensorflow/external/protobuf/src -I$(TENSORFLOW)/bazel-genfiles -I$(TENSORFLOW) -I$(TENSORFLOW)/third_party/eigen3 + BINFILES = loader_rnn OBJFILES = @@ -20,9 +18,9 @@ OBJFILES = TESTFILES = ADDLIBS = ../lm/kaldi-lm.a ../util/kaldi-util.a ../thread/kaldi-thread.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../matrix/kaldi-matrix.a ../base/kaldi-base.a ../../tools/tensorflow/bazel-bin/tensorflow/tensorflow_cc.so LDLIBS += -lz -ldl -fPIC -lrt -LDLIBS += $(OTHERLIBS) -L$(TENSORFLOW)/bazel-bin/tensorflow -ltensorflow +LDLIBS += $(OTHERLIBS) -L$(TENSORFLOW)/bazel-bin/tensorflow -ltensorflow_cc include ../makefiles/default_rules.mk diff --git a/src/tensorflow/loader_rnn.cc b/src/tensorflow/loader_rnn.cc index 6f80fa17994..7034b920060 100644 --- a/src/tensorflow/loader_rnn.cc +++ b/src/tensorflow/loader_rnn.cc @@ -3,17 +3,17 @@ #include "tensorflow/core/protobuf/meta_graph.pb.h" #include "base/kaldi-common.h" -//#include "fstext/fstext-lib.h" -//#include "lat/kaldi-lattice.h" -//#include "lat/lattice-functions.h" -//#include "lm/kaldi-rnnlm.h" -//#include "lm/mikolov-rnnlm-lib.h" -//#include "util/common-utils.h" +#include "fstext/fstext-lib.h" +#include "lat/kaldi-lattice.h" +#include "lat/lattice-functions.h" +#include "lm/kaldi-rnnlm.h" +#include "lm/mikolov-rnnlm-lib.h" +#include "util/common-utils.h" using namespace tensorflow; int main(int argc, char* argv[]) { - /* +//* // Initialize a tensorflow session Session* session; Status status = NewSession(SessionOptions(), &session); diff --git a/tools/install_tensorflow.sh b/tools/install_tensorflow.sh index ba933225162..7593d486fa4 100755 --- a/tools/install_tensorflow.sh +++ b/tools/install_tensorflow.sh @@ -2,44 +2,23 @@ set -e -export HOME=/home/hainanx/work -#export JAVA_HOME=/export/b02/hxu/TensorFlow/java/jdk1.8.0_121 -#export PATH=/export/b02/hxu/TensorFlow/java/jdk1.8.0_121/bin/:$PATH +export HOME=/export/b02/hxu +export JAVA_HOME=/export/b02/hxu/TensorFlow/java/jdk1.8.0_121 +export PATH=/export/b02/hxu/TensorFlow/java/jdk1.8.0_121/bin/:$PATH export PATH=$PWD/bazel/output/:$PATH -#export PATH=$PWD/tensorflow/bazel-out/host/bin/external/protobuf/:$PATH -export PATH=$PWD:$PATH -#git clone https://github.com/tensorflow/tensorflow -#[ ! -f bazel.zip ] && wget https://github.com/bazelbuild/bazel/releases/download/0.5.1/bazel-0.5.1-dist.zip -O bazel.zip -#mkdir -p bazel -#cd bazel -#unzip ../bazel.zip -#./compile.sh -#cd ../ - -# now bazel is built -#git clone https://github.com/tensorflow/tensorflow +[ ! -f bazel.zip ] && wget https://github.com/bazelbuild/bazel/releases/download/0.5.1/bazel-0.5.1-dist.zip -O bazel.zip +mkdir -p bazel +cd bazel +unzip ../bazel.zip +./compile.sh +cd ../ +## now bazel is built +git clone https://github.com/tensorflow/tensorflow cd tensorflow +./configure -#./configure - -#bazel build //tensorflow/core:framework_headers_lib -# -#bazel build //tensorflow:libtensorflow.so -bazel build //tensorflow:libtensorflow_cc.so - -#exit -# -#cd tensorflow/tensorflow -#mkdir -p rnnlm -#cd rnnlm -# -#[ ! -f BUILD ] && ln -s ../../../../src/tensorflow/BUILD -#[ ! -f WORKSPACE ] && ln -s ../../../../src/tensorflow/WORKSPACE -#[ ! -f loader_rnn.cc ] && ln -s ../../../../src/tensorflow/loader_rnn.cc -#[ ! -d kaldi_src ] && ln -s ../../../../src/ kaldi_src -# -#bazel build --test_tmpdir=$TEST_TMPDIR :loader_rnn -##bazel run -c opt :loader_rnn -# +tensorflow/contrib/makefile/download_dependencies.sh +bazel build //tensorflow:libtensorflow.so +#bazel build //tensorflow:libtensorflow_cc.so From 85fd7b2287ec9576ed6eb7668e22c679d4f1b0ce Mon Sep 17 00:00:00 2001 From: Hainan Xu Date: Mon, 12 Jun 2017 15:50:17 -0400 Subject: [PATCH 12/30] starting to write the tensorflow wrappers --- egs/ami/s5/local/tensorflow/ptb_word_lm.py | 1 - egs/ami/s5/local/tensorflow/reader.py | 5 +++-- egs/ami/s5/local/tensorflow/rnnlm.py | 9 ++++++++- egs/ami/s5/local/tensorflow/run.sh | 19 ++++++++++++------- src/tensorflow/Makefile | 16 ++++++---------- src/{tensorflow => tfbin}/loader_rnn.cc | 4 ++-- 6 files changed, 31 insertions(+), 23 deletions(-) rename src/{tensorflow => tfbin}/loader_rnn.cc (97%) diff --git a/egs/ami/s5/local/tensorflow/ptb_word_lm.py b/egs/ami/s5/local/tensorflow/ptb_word_lm.py index 1c48632f8d1..15040fd30ea 100644 --- a/egs/ami/s5/local/tensorflow/ptb_word_lm.py +++ b/egs/ami/s5/local/tensorflow/ptb_word_lm.py @@ -49,7 +49,6 @@ from __future__ import print_function import sys - sys.path.insert(0,"/home/hxu/.local/lib/python2.7/site-packages/") import inspect diff --git a/egs/ami/s5/local/tensorflow/reader.py b/egs/ami/s5/local/tensorflow/reader.py index 5ec03b19b51..6e02fcc8be4 100644 --- a/egs/ami/s5/local/tensorflow/reader.py +++ b/egs/ami/s5/local/tensorflow/reader.py @@ -28,7 +28,8 @@ def _read_words(filename): with tf.gfile.GFile(filename, "r") as f: - return f.read().decode("utf-8").replace("\n", "").split() + return f.read().decode("utf-8").split() +# return f.read().decode("utf-8").replace("\n", "").split() def _build_vocab(filename): @@ -76,7 +77,7 @@ def ptb_raw_data(data_path=None): valid_data = _file_to_word_ids(valid_path, word_to_id) test_data = _file_to_word_ids(test_path, word_to_id) vocabulary = len(word_to_id) - return train_data, valid_data, test_data, vocabulary + return train_data, valid_data, test_data, vocabulary, word_to_id def ptb_producer(raw_data, batch_size, num_steps, name=None): diff --git a/egs/ami/s5/local/tensorflow/rnnlm.py b/egs/ami/s5/local/tensorflow/rnnlm.py index dfc058e309d..6707755c214 100644 --- a/egs/ami/s5/local/tensorflow/rnnlm.py +++ b/egs/ami/s5/local/tensorflow/rnnlm.py @@ -70,6 +70,8 @@ "Where the training/test data is stored.") flags.DEFINE_string("save_path", None, "Model output directory.") +flags.DEFINE_string("wordlist_save_path", None, + "wordmap output directory.") flags.DEFINE_bool("use_fp16", False, "Train using 16-bit floats instead of 32bit floats") @@ -367,7 +369,12 @@ def main(_): raise ValueError("Must set --data_path to PTB data directory") raw_data = reader.ptb_raw_data(FLAGS.data_path) - train_data, valid_data, test_data, _ = raw_data + train_data, valid_data, test_data, _, word_map = raw_data + + with open(FLAGS.wordlist_save_path, "w") as wmap_file: + count_pairs = sorted(word_map.items(), key=lambda x: (x[1], x[0])) + for k, v in count_pairs: + wmap_file.write(str(k) + " " + str(v) + "\n") config = get_config() eval_config = get_config() diff --git a/egs/ami/s5/local/tensorflow/run.sh b/egs/ami/s5/local/tensorflow/run.sh index dfc5a0749ba..0e576796338 100755 --- a/egs/ami/s5/local/tensorflow/run.sh +++ b/egs/ami/s5/local/tensorflow/run.sh @@ -6,13 +6,18 @@ model_type=small dir=data/tensorflow/ mkdir -p $dir -#cat data/$data_type/train/text | awk '{for(i=2;i<=NF;i++)print $i}' | sort | uniq -c | sort -k1nr | head -n 9998 | awk '{print $2}' > $dir/wordlist -# -#for i in train dev eval; do -# cat data/$data_type/$i/text | awk -v w=$dir/wordlist 'BEGIN{while((getline0)d[$1]=1}{for(i=2;i<=NF;i++){if(d[$i]==1){s=$i}else{s=""} printf("%s ",s)} print""}' | sed "s=^= =g" > $dir/$i.txt -#done +#echo "" > $dir/wordlist +#echo "" >> $dir/wordlist +# num-words is 10000 - 3 (bos, eos and ) -#python local/tensorflow/rnnlm.py --data_path=$dir --model=small --save_path=$dir/model.small -python local/tensorflow/rnnlm.py --data_path=$dir --model=medium --save_path=$dir/model.medium +cat data/$data_type/train/text | awk '{for(i=2;i<=NF;i++)print $i}' | sort | uniq -c | sort -k1nr | head -n 9997 | awk '{print $2}' > $dir/wordlist + +for i in train dev eval; do + cat data/$data_type/$i/text | awk -v w=$dir/wordlist 'BEGIN{while((getline0)d[$1]=1}{for(i=2;i<=NF;i++){if(d[$i]==1){s=$i}else{s=""} printf("%s ",s)} print""}' | sed "s=^= =g" | sed "s=$= =" > $dir/$i.txt +done + + +python local/tensorflow/rnnlm.py --data_path=$dir --model=small --save_path=$dir/model.small --wordlist_save_path=$dir/wordlist.rnn +#python local/tensorflow/rnnlm.py --data_path=$dir --model=medium --save_path=$dir/model.medium #python local/tensorflow/rnnlm.py --data_path=$dir --model=large --save_path=$dir/model.large diff --git a/src/tensorflow/Makefile b/src/tensorflow/Makefile index f214a66f88a..c9ae405d8f2 100644 --- a/src/tensorflow/Makefile +++ b/src/tensorflow/Makefile @@ -1,22 +1,18 @@ include ../kaldi.mk TENSORFLOW = ../../tools/tensorflow -KALDI_ROOT = ../../ -all: - -#EXTRA_CXXFLAGS = -Wno-sign-compare -fPIC -I ../ -I $(KALDI_ROOT)/src -DKALDI_NO_PORTAUDIO -I $(TENSORFLOW)/third_party/eigen3 -I $(BAZEL)/third_party/protobuf/3.0.0/src/ \ -# -I$(TENSORFLOW)/bazel-genfiles -I $(TENSORFLOW) \ -# -I $(TENSORFLOW)/tensorflow/contrib/makefile/gen/protobuf/include/ -I $(TENSORFLOW)/tensorflow/contrib/makefile/downloads/eigen \ -EXTRA_CXXFLAGS = -Wno-sign-compare -fPIC -I$(TENSORFLOW)/bazel-tensorflow/external/protobuf/src -I$(TENSORFLOW)/bazel-genfiles -I$(TENSORFLOW) -I$(TENSORFLOW)/tensorflow/contrib/makefile/downloads/eigen/ -#EXTRA_CXXFLAGS = -Wno-sign-compare -fPIC -I$(TENSORFLOW)/bazel-tensorflow/external/protobuf/src -I$(TENSORFLOW)/bazel-genfiles -I$(TENSORFLOW) -I$(TENSORFLOW)/third_party/eigen3 +all: -BINFILES = loader_rnn +EXTRA_CXXFLAGS = -Wno-sign-compare -I$(TENSORFLOW)/bazel-tensorflow/external/protobuf/src -I$(TENSORFLOW)/bazel-genfiles -I$(TENSORFLOW) -I$(TENSORFLOW)/tensorflow/contrib/makefile/downloads/eigen/ +#EXTRA_CXXFLAGS = -Wno-sign-compare -fPIC -I$(TENSORFLOW)/bazel-tensorflow/external/protobuf/src -I$(TENSORFLOW)/bazel-genfiles -I$(TENSORFLOW) -I$(TENSORFLOW)/tensorflow/contrib/makefile/downloads/eigen/ -OBJFILES = +OBJFILES = tensorflow-rnnlm-lib.o TESTFILES = +LIBNAME = kaldi-tensorflow-rnnlm + ADDLIBS = ../lm/kaldi-lm.a ../util/kaldi-util.a ../thread/kaldi-thread.a \ ../matrix/kaldi-matrix.a ../base/kaldi-base.a ../../tools/tensorflow/bazel-bin/tensorflow/tensorflow_cc.so diff --git a/src/tensorflow/loader_rnn.cc b/src/tfbin/loader_rnn.cc similarity index 97% rename from src/tensorflow/loader_rnn.cc rename to src/tfbin/loader_rnn.cc index 7034b920060..45ddc89fc64 100644 --- a/src/tensorflow/loader_rnn.cc +++ b/src/tfbin/loader_rnn.cc @@ -62,10 +62,10 @@ int main(int argc, char* argv[]) { status = session->Run(std::vector>(), {"Train/Model/test_initial_state"}, {}, &state); for (int32 word_out = 0; word_out < 10000; word_out++) { - Tensor in_word(DT_INT32, {1, 1}); + Tensor in_word(tensorflow::DT_INT32, {1, 1}); in_word.scalar()() = (word_out + 9999) % 10000; - Tensor out_word(DT_INT32, {1, 1}); + Tensor out_word(tensorflow::DT_INT32, {1, 1}); out_word.scalar()() = word_out; // num-layers From 5c19b09f45313df82e6de1f8ee90e2fd5f323691 Mon Sep 17 00:00:00 2001 From: Hainan Xu Date: Mon, 12 Jun 2017 15:53:46 -0400 Subject: [PATCH 13/30] include the h and cc files; delete some of the unuseful files --- egs/ami/s5/local/tensorflow/load.py | 22 -- egs/ami/s5/local/tensorflow/loader.cc | 88 ----- egs/ami/s5/local/tensorflow/ptb_word_lm.py | 409 --------------------- src/tensorflow/tensorflow-rnnlm-lib.cc | 195 ++++++++++ src/tensorflow/tensorflow-rnnlm-lib.h | 103 ++++++ 5 files changed, 298 insertions(+), 519 deletions(-) delete mode 100644 egs/ami/s5/local/tensorflow/load.py delete mode 100644 egs/ami/s5/local/tensorflow/loader.cc delete mode 100644 egs/ami/s5/local/tensorflow/ptb_word_lm.py create mode 100644 src/tensorflow/tensorflow-rnnlm-lib.cc create mode 100644 src/tensorflow/tensorflow-rnnlm-lib.h diff --git a/egs/ami/s5/local/tensorflow/load.py b/egs/ami/s5/local/tensorflow/load.py deleted file mode 100644 index 0d0959aa746..00000000000 --- a/egs/ami/s5/local/tensorflow/load.py +++ /dev/null @@ -1,22 +0,0 @@ -import sys - -sys.path.insert(0,"/home/hxu/.local/lib/python2.7/site-packages/") - -import tensorflow as tf -import numpy as np -#config = tf.ConfigProto(device_count = {'GPU': 0} ) - -#with tf.Session(config=config) as sess: -with tf.Session() as sess: - a = tf.Variable(5.5, name='a') - b = tf.Variable(6.6, name='b') - c = tf.multiply(a, b, name="c") - - sess.run(tf.global_variables_initializer()) - - print a.eval() # 5.0 - print b.eval() # 6.0 - print c.eval() # 30.0 - - tf.train.write_graph(sess.graph_def, 'models/', 'graph.pb', as_text=False) - diff --git a/egs/ami/s5/local/tensorflow/loader.cc b/egs/ami/s5/local/tensorflow/loader.cc deleted file mode 100644 index b02b1f4b853..00000000000 --- a/egs/ami/s5/local/tensorflow/loader.cc +++ /dev/null @@ -1,88 +0,0 @@ -#include "tensorflow/core/public/session.h" -#include "tensorflow/core/platform/env.h" -#include "tensorflow/core/protobuf/meta_graph.pb.h" - - -using namespace tensorflow; - -int main(int argc, char* argv[]) { - // Initialize a tensorflow session - Session* session; - Status status = NewSession(SessionOptions(), &session); - if (!status.ok()) { - std::cout << status.ToString() << "\n"; - return 1; - } - - const string pathToGraph = "/export/b02/hxu/TensorFlow/save_load/models/m.meta"; - const string checkpointPath = "/export/b02/hxu/TensorFlow/save_load/models/m"; - - // Read in the protobuf graph we exported - // (The path seems to be relative to the cwd. Keep this in mind - // when using `bazel run` since the cwd isn't where you call - // `bazel run` but from inside a temp folder.) - MetaGraphDef graph_def; - status = ReadBinaryProto(Env::Default(), pathToGraph, &graph_def); - if (!status.ok()) { - std::cout << status.ToString() << "\n"; - return 1; - } - - // Add the graph to the session - status = session->Create(graph_def.graph_def()); - if (!status.ok()) { - std::cout << status.ToString() << "\n"; - return 1; - } - - Tensor checkpointPathTensor(DT_STRING, TensorShape()); - checkpointPathTensor.scalar()() = checkpointPath; - - status = session->Run( - {{ graph_def.saver_def().filename_tensor_name(), checkpointPathTensor },}, - {}, - {graph_def.saver_def().restore_op_name()}, - nullptr); - if (!status.ok()) { - std::cout << status.ToString() << "\n"; - return 1; - } - - // Setup inputs and outputs: - - Tensor a(DT_FLOAT, TensorShape()); - a.scalar()() = 5.5; - - Tensor b(DT_FLOAT, TensorShape()); - b.scalar()() = 6.6; - - std::vector> inputs = { - { "a", a }, - { "b", b }, - }; - - // The session will initialize the outputs - std::vector outputs; - - // Run the session, evaluating our "c" operation from the graph - status = session->Run(inputs, {"output"}, {}, &outputs); - if (!status.ok()) { - std::cout << status.ToString() << "\n"; - return 1; - } - - // Grab the first output (we only evaluated one graph node: "c") - // and convert the node to a scalar representation. - auto output_c = outputs[0].scalar(); - - // (There are similar methods for vectors and matrices here: - // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/public/tensor.h) - - // Print the results - std::cout << outputs[0].DebugString() << "\n"; // Tensor - std::cout << output_c() << "\n"; // 30 - - // Free any resources used by the session - session->Close(); - return 0; -} diff --git a/egs/ami/s5/local/tensorflow/ptb_word_lm.py b/egs/ami/s5/local/tensorflow/ptb_word_lm.py deleted file mode 100644 index 15040fd30ea..00000000000 --- a/egs/ami/s5/local/tensorflow/ptb_word_lm.py +++ /dev/null @@ -1,409 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -"""Example / benchmark for building a PTB LSTM model. -Trains the model described in: -(Zaremba, et. al.) Recurrent Neural Network Regularization -http://arxiv.org/abs/1409.2329 -There are 3 supported model configurations: -=========================================== -| config | epochs | train | valid | test -=========================================== -| small | 13 | 37.99 | 121.39 | 115.91 -| medium | 39 | 48.45 | 86.16 | 82.07 -| large | 55 | 37.87 | 82.62 | 78.29 -The exact results may vary depending on the random initialization. -The hyperparameters used in the model: -- init_scale - the initial scale of the weights -- learning_rate - the initial value of the learning rate -- max_grad_norm - the maximum permissible norm of the gradient -- num_layers - the number of LSTM layers -- num_steps - the number of unrolled steps of LSTM -- hidden_size - the number of LSTM units -- max_epoch - the number of epochs trained with the initial learning rate -- max_max_epoch - the total number of epochs for training -- keep_prob - the probability of keeping weights in the dropout layer -- lr_decay - the decay of the learning rate for each epoch after "max_epoch" -- batch_size - the batch size -The data required for this example is in the data/ dir of the -PTB dataset from Tomas Mikolov's webpage: -$ wget http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz -$ tar xvf simple-examples.tgz -To run: -$ python ptb_word_lm.py --data_path=simple-examples/data/ -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import sys -sys.path.insert(0,"/home/hxu/.local/lib/python2.7/site-packages/") - -import inspect -import time - -import numpy as np -import tensorflow as tf - -import reader - -flags = tf.flags -logging = tf.logging - -flags.DEFINE_string( - "model", "small", - "A type of model. Possible options are: small, medium, large.") -flags.DEFINE_string("data_path", None, - "Where the training/test data is stored.") -flags.DEFINE_string("save_path", None, - "Model output directory.") -flags.DEFINE_bool("use_fp16", False, - "Train using 16-bit floats instead of 32bit floats") - -FLAGS = flags.FLAGS - - -def data_type(): - return tf.float16 if FLAGS.use_fp16 else tf.float32 - - -class PTBInput(object): - """The input data.""" - - def __init__(self, config, data, name=None): - self.batch_size = batch_size = config.batch_size - self.num_steps = num_steps = config.num_steps - self.epoch_size = ((len(data) // batch_size) - 1) // num_steps - self.input_data, self.targets = reader.ptb_producer( - data, batch_size, num_steps, name=name) - - -class PTBModel(object): - """The PTB model.""" - - def __init__(self, is_training, config, input_): - self._input = input_ - - batch_size = input_.batch_size - num_steps = input_.num_steps - size = config.hidden_size - vocab_size = config.vocab_size - - # Slightly better results can be obtained with forget gate biases - # initialized to 1 but the hyperparameters of the model would need to be - # different than reported in the paper. - def lstm_cell(): - # With the latest TensorFlow source code (as of Mar 27, 2017), - # the BasicLSTMCell will need a reuse parameter which is unfortunately not - # defined in TensorFlow 1.0. To maintain backwards compatibility, we add - # an argument check here: - if 'reuse' in inspect.getargspec( - tf.contrib.rnn.BasicLSTMCell.__init__).args: - return tf.contrib.rnn.BasicLSTMCell( - size, forget_bias=0.0, state_is_tuple=True, - reuse=tf.get_variable_scope().reuse) - else: - return tf.contrib.rnn.BasicLSTMCell( - size, forget_bias=0.0, state_is_tuple=True) - attn_cell = lstm_cell - if is_training and config.keep_prob < 1: - def attn_cell(): - return tf.contrib.rnn.DropoutWrapper( - lstm_cell(), output_keep_prob=config.keep_prob) - cell = tf.contrib.rnn.MultiRNNCell( - [attn_cell() for _ in range(config.num_layers)], state_is_tuple=True) - - self._initial_state = cell.zero_state(batch_size, data_type()) - - - # first implement the less efficient version - test_word_in = tf.placeholder(tf.int32, [1, 1]) - test_word_out = tf.placeholder(tf.int32, [1, 1]) - test_input_state_c = tf.placeholder(tf.float32, [1, size]) - test_input_state_h = tf.placeholder(tf.float32, [1, size]) - test_input_state = tf.contrib.rnn.LSTMStateTuple(test_input_state_c, test_input_state_h) - -# print ("want to be", self._initial_state) -# print ("it actually is ", input_state) - with tf.device("/cpu:0"): - embedding = tf.get_variable( - "embedding", [vocab_size, size], dtype=data_type()) - -# print("should be ", input_.input_data) -# print("is ", test_word) - - inputs = tf.nn.embedding_lookup(embedding, input_.input_data) - test_inputs = tf.nn.embedding_lookup(embedding, test_word_in) -# print("should be ", inputs) -# print("is ", test_inputs) - - # test time - with tf.variable_scope("RNN"): -# tf.get_variable_scope().reuse_variables() - (test_cell_output, test_output_state) = cell(test_inputs[:, 0, :], [test_input_state]) - - softmax_w = tf.get_variable( - "softmax_w", [size, vocab_size], dtype=data_type()) - softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=data_type()) - - test_logits = tf.matmul(test_cell_output, softmax_w) + softmax_b - test_softmaxed = tf.nn.softmax(test_logits) - print("test softmaxed is ", test_softmaxed) - p_word = test_softmaxed[0, test_word_out[0,0]] -# p_word = tf.float32(test_softmaxed[:, test_word_out], name="p_out") - - if is_training and config.keep_prob < 1: - inputs = tf.nn.dropout(inputs, config.keep_prob) - - # Simplified version of models/tutorials/rnn/rnn.py's rnn(). - # This builds an unrolled LSTM for tutorial purposes only. - # In general, use the rnn() or state_saving_rnn() from rnn.py. - # - # The alternative version of the code below is: - # - # inputs = tf.unstack(inputs, num=num_steps, axis=1) - # outputs, state = tf.contrib.rnn.static_rnn( - # cell, inputs, initial_state=self._initial_state) - outputs = [] - state = self._initial_state - with tf.variable_scope("RNN"): - for time_step in range(num_steps): - if time_step > -1: tf.get_variable_scope().reuse_variables() - (cell_output, state) = cell(inputs[:, time_step, :], state) - outputs.append(cell_output) - - output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, size]) - logits = tf.matmul(output, softmax_w) + softmax_b - loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example( - [logits], - [tf.reshape(input_.targets, [-1])], - [tf.ones([batch_size * num_steps], dtype=data_type())]) - self._cost = cost = tf.reduce_sum(loss) / batch_size - self._final_state = state - - if not is_training: - return - - self._lr = tf.Variable(0.0, trainable=False) - tvars = tf.trainable_variables() - grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), - config.max_grad_norm) - optimizer = tf.train.GradientDescentOptimizer(self._lr) - self._train_op = optimizer.apply_gradients( - zip(grads, tvars), - global_step=tf.contrib.framework.get_or_create_global_step()) - - self._new_lr = tf.placeholder( - tf.float32, shape=[], name="new_learning_rate") - self._lr_update = tf.assign(self._lr, self._new_lr) - - def assign_lr(self, session, lr_value): - session.run(self._lr_update, feed_dict={self._new_lr: lr_value}) - - @property - def input(self): - return self._input - - @property - def initial_state(self): - return self._initial_state - - @property - def cost(self): - return self._cost - - @property - def final_state(self): - return self._final_state - - @property - def lr(self): - return self._lr - - @property - def train_op(self): - return self._train_op - - -class SmallConfig(object): - """Small config.""" - init_scale = 0.1 - learning_rate = 1.0 - max_grad_norm = 5 - num_layers = 2 - num_steps = 20 - hidden_size = 200 - max_epoch = 4 - max_max_epoch = 13 - keep_prob = 1.0 - lr_decay = 0.5 - batch_size = 20 - vocab_size = 10000 - - -class MediumConfig(object): - """Medium config.""" - init_scale = 0.05 - learning_rate = 1.0 - max_grad_norm = 5 - num_layers = 2 - num_steps = 35 - hidden_size = 650 - max_epoch = 6 - max_max_epoch = 39 - keep_prob = 0.5 - lr_decay = 0.8 - batch_size = 20 - vocab_size = 10000 - - -class LargeConfig(object): - """Large config.""" - init_scale = 0.04 - learning_rate = 1.0 - max_grad_norm = 10 - num_layers = 2 - num_steps = 35 - hidden_size = 1500 - max_epoch = 14 - max_max_epoch = 55 - keep_prob = 0.35 - lr_decay = 1 / 1.15 - batch_size = 20 - vocab_size = 10000 - - -class TestConfig(object): - """Tiny config, for testing.""" - init_scale = 0.1 - learning_rate = 1.0 - max_grad_norm = 1 - num_layers = 1 - num_steps = 2 - hidden_size = 2 - max_epoch = 1 - max_max_epoch = 1 - keep_prob = 1.0 - lr_decay = 0.5 - batch_size = 20 - vocab_size = 10000 - - -def run_epoch(session, model, eval_op=None, verbose=False): - """Runs the model on the given data.""" - start_time = time.time() - costs = 0.0 - iters = 0 - state = session.run(model.initial_state) - - fetches = { - "cost": model.cost, - "final_state": model.final_state, - } - if eval_op is not None: - fetches["eval_op"] = eval_op - - for step in range(model.input.epoch_size): - feed_dict = {} - for i, (c, h) in enumerate(model.initial_state): - feed_dict[c] = state[i].c - feed_dict[h] = state[i].h - - vals = session.run(fetches, feed_dict) - cost = vals["cost"] - state = vals["final_state"] - - costs += cost - iters += model.input.num_steps - - if verbose and step % (model.input.epoch_size // 10) == 10: - print("%.3f perplexity: %.3f speed: %.0f wps" % - (step * 1.0 / model.input.epoch_size, np.exp(costs / iters), - iters * model.input.batch_size / (time.time() - start_time))) - - return np.exp(costs / iters) - - -def get_config(): - if FLAGS.model == "small": - return SmallConfig() - elif FLAGS.model == "medium": - return MediumConfig() - elif FLAGS.model == "large": - return LargeConfig() - elif FLAGS.model == "test": - return TestConfig() - else: - raise ValueError("Invalid model: %s", FLAGS.model) - - -def main(_): - if not FLAGS.data_path: - raise ValueError("Must set --data_path to PTB data directory") - - raw_data = reader.ptb_raw_data(FLAGS.data_path) - train_data, valid_data, test_data, _ = raw_data - - config = get_config() - eval_config = get_config() - eval_config.batch_size = 1 - eval_config.num_steps = 1 - - with tf.Graph().as_default(): - initializer = tf.random_uniform_initializer(-config.init_scale, - config.init_scale) - - with tf.name_scope("Train"): - train_input = PTBInput(config=config, data=train_data, name="TrainInput") - with tf.variable_scope("Model", reuse=None, initializer=initializer): - m = PTBModel(is_training=True, config=config, input_=train_input) - tf.summary.scalar("Training Loss", m.cost) - tf.summary.scalar("Learning Rate", m.lr) - - with tf.name_scope("Valid"): - valid_input = PTBInput(config=config, data=valid_data, name="ValidInput") - with tf.variable_scope("Model", reuse=True, initializer=initializer): - mvalid = PTBModel(is_training=False, config=config, input_=valid_input) - tf.summary.scalar("Validation Loss", mvalid.cost) - - with tf.name_scope("Test"): - test_input = PTBInput(config=eval_config, data=test_data, name="TestInput") - with tf.variable_scope("Model", reuse=True, initializer=initializer): - mtest = PTBModel(is_training=False, config=eval_config, - input_=test_input) - - sv = tf.train.Supervisor(logdir=FLAGS.save_path) - with sv.managed_session() as session: - for i in range(config.max_max_epoch): - lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0) - m.assign_lr(session, config.learning_rate * lr_decay) - - print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr))) - train_perplexity = run_epoch(session, m, eval_op=m.train_op, - verbose=True) - print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity)) - valid_perplexity = run_epoch(session, mvalid) - print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity)) - - test_perplexity = run_epoch(session, mtest) - print("Test Perplexity: %.3f" % test_perplexity) - - if FLAGS.save_path: - print("Saving model to %s." % FLAGS.save_path) - sv.saver.save(session, FLAGS.save_path, global_step=sv.global_step) - - -if __name__ == "__main__": - tf.app.run() diff --git a/src/tensorflow/tensorflow-rnnlm-lib.cc b/src/tensorflow/tensorflow-rnnlm-lib.cc new file mode 100644 index 00000000000..9db6645c3e3 --- /dev/null +++ b/src/tensorflow/tensorflow-rnnlm-lib.cc @@ -0,0 +1,195 @@ +// lm/kaldi-rnnlm.cc + +#include +#include + +#include "tensorflow/tensorflow-rnnlm-lib.h" +#include "util/stl-utils.h" +#include "util/text-utils.h" + +using tensorflow::Status; + +namespace kaldi { +using tf_rnnlm::KaldiTfRnnlmWrapper; +using tf_rnnlm::TfRnnlmDeterministicFst; +using std::ifstream; + +KaldiTfRnnlmWrapper::KaldiTfRnnlmWrapper( + const KaldiTfRnnlmWrapperOpts &opts, + const std::string &rnn_wordlist, + const std::string &word_symbol_table_rxfilename, // TODO(hxu) will do this later + const std::string &unk_prob_rspecifier, + Session* session) { + session_ = session; + + fst::SymbolTable *fst_word_symbols = NULL; + if (!(fst_word_symbols = + fst::SymbolTable::ReadText(word_symbol_table_rxfilename))) { + KALDI_ERR << "Could not read symbol table from file " + << word_symbol_table_rxfilename; + } + + fst_label_to_word_.resize(fst_word_symbols->NumSymbols()); + + for (int32 i = 0; i < fst_label_to_word_.size(); ++i) { + fst_label_to_word_[i] = fst_word_symbols->Find(i); + if (fst_label_to_word_[i] == "") { + KALDI_ERR << "Could not find word for integer " << i << "in the word " + << "symbol table, mismatched symbol table or you have discoutinuous " + << "integers in your symbol table?"; + } + } + + fst_label_to_rnn_label_.resize(fst_word_symbols->NumSymbols(), -1); + + { // input. + ifstream ifile(rnn_wordlist.c_str()); + int id; + string word; + int i = 0; + while (ifile >> id >> word) { // TODO(hxu) ugly fix for cued-rnnlm's bug + // will implement a better fix later + if (word == "[UNK]") { + word = ""; + } else if (word == "") { + continue; + } + i++; + assert(i == id + 1); + rnn_label_to_word_.push_back(word); + + int fst_label = fst_word_symbols->Find(rnn_label_to_word_[i]); + KALDI_ASSERT(fst::SymbolTable::kNoSymbol != fst_label); + fst_label_to_rnn_label_[fst_label] = i; + } + bos_ = 1; + eos_ = 0; // TODO(hxu) + } + rnn_label_to_word_.push_back(""); + + for (int i = 0; i < fst_label_to_rnn_label_.size(); i++) { + if (fst_label_to_rnn_label_[i] == -1) { + fst_label_to_rnn_label_[i] = rnn_label_to_word_.size() - 1; + } + } + + +} + +BaseFloat KaldiTfRnnlmWrapper::GetLogProb( + int32 word, const std::vector &wseq, + const Tensor &context_in, + tensorflow::Tensor *context_out) { + + std::vector wseq_symbols(wseq.size()); + for (int32 i = 0; i < wseq_symbols.size(); ++i) { + KALDI_ASSERT(wseq[i] < label_to_word_.size()); + wseq_symbols[i] = label_to_word_[wseq[i]]; + } + + std::vector> inputs; + + Tensor lastword(tensorflow::DT_INT32, {1, 1}); + Tensor thisword(tensorflow::DT_INT32, {1, 1}); + + lastword.scalar()() = (wseq.size() == 0? bos_: wseq.back()); + thisword.scalar()() = word; + + inputs = { + {"Train/Model/test_word_in", lastword}, + {"Train/Model/test_word_out", thisword}, + {"Train/Model/test_state", context_in}, + }; + + // The session will initialize the outputs + std::vector outputs; + + // Run the session, evaluating our "c" operation from the graph + Status status = session_->Run(inputs, {"Train/Model/test_out", "Train/Model/test_state_out"}, {}, &outputs); + +// return rnnlm_.computeConditionalLogprob(label_to_word_[word], wseq_symbols, +// context_in, context_out); + if (context_out != NULL) + *context_out = outputs[1]; + return outputs[0].scalar()(); +} + +void KaldiTfRnnlmWrapper::GetInitialContext(Tensor *c) const { + std::vector state; + Status status = session_->Run(std::vector>(), {"Train/Model/test_initial_state"}, {}, &state); + *c = state[0]; +} + +TfRnnlmDeterministicFst::TfRnnlmDeterministicFst(int32 max_ngram_order, + KaldiTfRnnlmWrapper *rnnlm) { + KALDI_ASSERT(rnnlm != NULL); + max_ngram_order_ = max_ngram_order; + rnnlm_ = rnnlm; + + // Uses empty history for . + std::vector' | grep -v '' | \ + awk '{ for(x=1;x<=NF;x++) count[$x]++; } END{for(w in count){print count[w], w;}}' | \ + sort -nr > $dir/unigram.counts + +total_nwords=`wc -l $dir/unigram.counts | awk '{print $1}'` + +head -$nwords $dir/unigram.counts | awk '{print $2}' | tee $dir/wordlist.rnn | awk '{print NR-1, $1}' > $dir/wordlist.rnn.id + +tail -n +$nwords $dir/unigram.counts > $dir/unk_class.counts + +for type in train valid; do + mv $dir/$type.in $dir/$type +done + +# Now randomize the order of the training data. +cat $dir/train | awk -v rand_seed=$rand_seed 'BEGIN{srand(rand_seed);} {printf("%f\t%s\n", rand(), $0);}' | \ + sort | cut -f 2 > $dir/foo +mv $dir/foo $dir/train + +# OK we'll train the RNNLM on this data. + +touch $dir/unk.probs # dummy file, not used for cued-rnnlm + +echo "data preparation finished" + diff --git a/egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh b/egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh index c3070183619..ceac46b5eb9 100755 --- a/egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh +++ b/egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh @@ -58,7 +58,7 @@ fi if [ "$rnnlm_ver" == "tensorflow" ]; then rescoring_binary="lattice-lmrescore-tf-rnnlm" - first_arg=$rnnlm_dir/wordlist.rnn + first_arg=$rnnlm_dir/wordlist.rnn.final fi oldlm=$oldlang/G.fst @@ -70,7 +70,7 @@ elif [ ! -f $oldlm ]; then fi [ ! -f $oldlm ] && echo "$0: Missing file $oldlm" && exit 1; -[ ! -f $rnnlm_dir/rnnlm ] && [ ! -d $rnnlm_dir/rnnlm ] && [ ! && echo "$0: Missing file $rnnlm_dir/rnnlm" && exit 1; +[ ! -f $rnnlm_dir/rnnlm ] && [ ! -d $rnnlm_dir/rnnlm ] && echo "$0: Missing file $rnnlm_dir/rnnlm" && exit 1; [ ! -f $rnnlm_dir/unk.probs ] &&\ echo "$0: Missing file $rnnlm_dir/unk.probs" && exit 1; [ ! -f $oldlang/words.txt ] &&\ diff --git a/src/tensorflow/tensorflow-rnnlm-lib.cc b/src/tensorflow/tensorflow-rnnlm-lib.cc index 529cf260fff..9fe46df339f 100644 --- a/src/tensorflow/tensorflow-rnnlm-lib.cc +++ b/src/tensorflow/tensorflow-rnnlm-lib.cc @@ -23,9 +23,8 @@ KaldiTfRnnlmWrapper::KaldiTfRnnlmWrapper( const std::string &rnn_wordlist, const std::string &word_symbol_table_rxfilename, // TODO(hxu) will do this later const std::string &unk_prob_rspecifier, -// Session* session) { const std::string &tf_model_path) { -// session_ = session; + // read the tf model { string graph_path = tf_model_path + ".meta"; @@ -58,8 +57,14 @@ KaldiTfRnnlmWrapper::KaldiTfRnnlmWrapper( KALDI_ERR << status.ToString(); } + // get the initial context + std::vector state; + session_->Run(std::vector>(), {"Train/Model/test_initial_state"}, {}, &state); + initial_context_ = state[0]; } +// GetInitialContext(&initial_context_); + fst::SymbolTable *fst_word_symbols = NULL; if (!(fst_word_symbols = fst::SymbolTable::ReadText(word_symbol_table_rxfilename))) { @@ -82,25 +87,22 @@ KaldiTfRnnlmWrapper::KaldiTfRnnlmWrapper( num_total_words = fst_word_symbols->NumSymbols(); + oos_ = -1; { // input. ifstream ifile(rnn_wordlist.c_str()); int id; string word; int i = -1; - while (ifile >> word >> id) { // TODO(hxu) ugly fix for cued-rnnlm's bug - // will implement a better fix later -// if (word == "") { -// continue; -// } + while (ifile >> word >> id) { i++; assert(i == id); rnn_label_to_word_.push_back(word); int fst_label = fst_word_symbols->Find(rnn_label_to_word_[i]); if (fst::SymbolTable::kNoSymbol == fst_label) { - if (i < 2) continue; + if (i < 2) continue; // and - KALDI_ASSERT(word == ""); + KALDI_ASSERT(word == "" && oos_ == -1); oos_ = i; continue; } @@ -108,14 +110,15 @@ KaldiTfRnnlmWrapper::KaldiTfRnnlmWrapper( fst_label_to_rnn_label_[fst_label] = i; } bos_ = 1; - eos_ = 0; // TODO(hxu) + eos_ = 0; // TODO(hxu) need to think carefully about these.. } + KALDI_ASSERT(oos_ != -1); // rnn_label_to_word_.push_back(""); num_rnn_words = rnn_label_to_word_.size(); for (int i = 0; i < fst_label_to_rnn_label_.size(); i++) { if (fst_label_to_rnn_label_[i] == -1) { - fst_label_to_rnn_label_[i] = rnn_label_to_word_.size() - 1; + fst_label_to_rnn_label_[i] = oos_; } } } @@ -123,14 +126,8 @@ KaldiTfRnnlmWrapper::KaldiTfRnnlmWrapper( BaseFloat KaldiTfRnnlmWrapper::GetLogProb( int32 word, const std::vector &wseq, const Tensor &context_in, - tensorflow::Tensor *context_out) { - - std::vector wseq_symbols(wseq.size()); - for (int32 i = 0; i < wseq_symbols.size(); ++i) { - KALDI_ASSERT(wseq[i] < label_to_word_.size()); - wseq_symbols[i] = label_to_word_[wseq[i]]; - } - + Tensor *context_out) { + KALDI_ASSERT(word >= 0); std::vector> inputs; Tensor lastword(tensorflow::DT_INT32, {1, 1}); @@ -153,8 +150,10 @@ BaseFloat KaldiTfRnnlmWrapper::GetLogProb( // return rnnlm_.computeConditionalLogprob(label_to_word_[word], wseq_symbols, // context_in, context_out); - if (context_out != NULL) + if (context_out != NULL) { + KALDI_ASSERT(outputs.size() == 2); *context_out = outputs[1]; + } if (word != oos_) { return outputs[0].scalar()(); } else { @@ -162,10 +161,8 @@ BaseFloat KaldiTfRnnlmWrapper::GetLogProb( } } -void KaldiTfRnnlmWrapper::GetInitialContext(Tensor *c) const { - std::vector state; - Status status = session_->Run(std::vector>(), {"Train/Model/test_initial_state"}, {}, &state); - *c = state[0]; +const Tensor& KaldiTfRnnlmWrapper::GetInitialContext() const { + return initial_context_; } TfRnnlmDeterministicFst::TfRnnlmDeterministicFst(int32 max_ngram_order, @@ -178,8 +175,7 @@ TfRnnlmDeterministicFst::TfRnnlmDeterministicFst(int32 max_ngram_order, std::vector=g' \ | perl -e ' use List::Util qw(shuffle); @A=<>; print join("", shuffle(@A)); ' \ | gzip -c > $dir/all.gz @@ -37,7 +37,7 @@ gunzip -c $dir/all.gz | head -n $heldout_sent > $dir/valid.in # validation data gunzip -c $dir/all.gz | tail -n +$heldout_sent > $dir/train.in # training data -cat $dir/train.in $dir/wordlist.all | grep -v '' | grep -v '' | \ +cat $dir/train.in $dir/wordlist.all | \ awk '{ for(x=1;x<=NF;x++) count[$x]++; } END{for(w in count){print count[w], w;}}' | \ sort -nr > $dir/unigram.counts @@ -48,12 +48,22 @@ head -$nwords $dir/unigram.counts | awk '{print $2}' | tee $dir/wordlist.rnn | a tail -n +$nwords $dir/unigram.counts > $dir/unk_class.counts for type in train valid; do - cat $dir/$type.in | awk -v w=$dir/wordlist.rnn 'BEGIN{while((getline0)d[$1]=1}{for(i=1;i<=NF;i++){if(d[$i]==1){s=$i}else{s=""} printf("%s ",s)} print""}' | sed "s=^= =g" | sed "s=$= =" > $dir/$type + cat $dir/$type.in | awk -v w=$dir/wordlist.rnn 'BEGIN{while((getline0)d[$1]=1}{for(i=1;i<=NF;i++){if(d[$i]==1){s=$i}else{s=""} printf("%s ",s)} print""}' > $dir/$type done # OK we'll train the RNNLM on this data. touch $dir/unk.probs # dummy file, not used for cued-rnnlm +cp $dir/wordlist.rnn $dir/wordlist.rnn.final + +has_oos=`grep "" $dir/wordlist.rnn.final | wc -l | awk '{print $1}'` +if [ $has_oos == "0" ]; then +# n=`wc -l $dir/wordlist.rnn.final | awk '{print $1}'` +# echo n is $n + echo "" >> $dir/wordlist.rnn.final +fi + + echo "data preparation finished" diff --git a/egs/ami/s5/local/tensorflow/reader.py b/egs/ami/s5/local/tensorflow/reader.py index 964a7b5e949..5458b93ea31 100644 --- a/egs/ami/s5/local/tensorflow/reader.py +++ b/egs/ami/s5/local/tensorflow/reader.py @@ -1,4 +1,5 @@ # Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# Modified by Hainan Xu to be used in Kaldi for lattice rescoring 2017 # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,7 +15,7 @@ # ============================================================================== -"""Utilities for parsing PTB text files.""" +"""Utilities for parsing RNNLM text files.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -24,23 +25,30 @@ import tensorflow as tf - - def _read_words(filename): with tf.gfile.GFile(filename, "r") as f: return f.read().decode("utf-8").split() # return f.read().decode("utf-8").replace("\n", "").split() - def _build_vocab(filename): - data = _read_words(filename) - - counter = collections.Counter(data) - count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0])) - - words, _ = list(zip(*count_pairs)) +# data = _read_words(filename) +# +# counter = collections.Counter(data) +# count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0])) +# +# words, _ = list(zip(*count_pairs)) +# word_to_id = dict(zip(words, range(len(words)))) + +# word_to_id = {} +# new_id = 0 +# with open(filename, "r") as f: +# for word in f: +# word_to_id[word] = new_id +# new_id = new_id + 1 +# return word_to_id + + words = _read_words(filename) word_to_id = dict(zip(words, range(len(words)))) - return word_to_id @@ -49,13 +57,13 @@ def _file_to_word_ids(filename, word_to_id): return [word_to_id[word] for word in data if word in word_to_id] -def ptb_raw_data(data_path=None): - """Load PTB raw data from data directory "data_path". +def rnnlm_raw_data(data_path, vocab_path): + """Load RNNLM raw data from data directory "data_path". - Reads PTB text files, converts strings to integer ids, + Reads RNNLM text files, converts strings to integer ids, and performs mini-batching of the inputs. - The PTB dataset comes from Tomas Mikolov's webpage: + The RNNLM dataset comes from Tomas Mikolov's webpage: http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz @@ -65,14 +73,14 @@ def ptb_raw_data(data_path=None): Returns: tuple (train_data, valid_data, test_data, vocabulary) - where each of the data objects can be passed to PTBIterator. + where each of the data objects can be passed to RNNLMIterator. """ train_path = os.path.join(data_path, "train") valid_path = os.path.join(data_path, "valid") # test_path = os.path.join(data_path, "eval.txt") - word_to_id = _build_vocab(train_path) + word_to_id = _build_vocab(vocab_path) train_data = _file_to_word_ids(train_path, word_to_id) valid_data = _file_to_word_ids(valid_path, word_to_id) # test_data = _file_to_word_ids(test_path, word_to_id) @@ -81,14 +89,14 @@ def ptb_raw_data(data_path=None): # return train_data, valid_data, test_data, vocabulary, word_to_id -def ptb_producer(raw_data, batch_size, num_steps, name=None): - """Iterate on the raw PTB data. +def rnnlm_producer(raw_data, batch_size, num_steps, name=None): + """Iterate on the raw RNNLM data. This chunks up raw_data into batches of examples and returns Tensors that are drawn from these batches. Args: - raw_data: one of the raw data outputs from ptb_raw_data. + raw_data: one of the raw data outputs from rnnlm_raw_data. batch_size: int, the batch size. num_steps: int, the number of unrolls. name: the name of this operation (optional). @@ -100,7 +108,7 @@ def ptb_producer(raw_data, batch_size, num_steps, name=None): Raises: tf.errors.InvalidArgumentError: if batch_size or num_steps are too high. """ - with tf.name_scope(name, "PTBProducer", [raw_data, batch_size, num_steps]): + with tf.name_scope(name, "RNNLMProducer", [raw_data, batch_size, num_steps]): raw_data = tf.convert_to_tensor(raw_data, name="raw_data", dtype=tf.int32) data_len = tf.size(raw_data) diff --git a/egs/ami/s5/local/tensorflow/run.sh b/egs/ami/s5/local/tensorflow/run.sh index ac016200524..7e3150482cc 100755 --- a/egs/ami/s5/local/tensorflow/run.sh +++ b/egs/ami/s5/local/tensorflow/run.sh @@ -1,6 +1,6 @@ #!/bin/bash mic=ihm -ngram_order=3 +ngram_order=4 model_type=small stage=1 weight=0.5 @@ -11,29 +11,21 @@ weight=0.5 set -e -dir=data/new_tensorflow/$model_type +dir=data/tensorflow/$model_type mkdir -p $dir if [ $stage -le 1 ]; then - local/tensorflow/train_rnnlm.sh $dir + local/tensorflow/prep_data.sh $dir fi +mkdir -p $dir/ if [ $stage -le 2 ]; then - mkdir -p $dir/ - python local/tensorflow/rnnlm.py --data_path=$dir --model=$model_type --save_path=$dir/rnnlm --wordlist_save_path=$dir/wordlist.rnn.final -fi - -has_oos=`grep "" $dir/wordlist.rnn.final | wc -l | awk '{print $1}'` -if [ $has_oos == "0" ]; then - n=`wc -l $dir/wordlist.rnn.final | awk '{print $1}'` - echo n is $n - echo " $n" >> $dir/wordlist.rnn.final + python local/tensorflow/rnnlm.py --data_path=$dir --model=$model_type --save_path=$dir/rnnlm --vocab_path=$dir/wordlist.rnn.final fi final_lm=ami_fsh.o3g.kn LM=$final_lm.pr1-7 -date if [ $stage -le 3 ]; then # for decode_set in dev; do for decode_set in dev eval; do @@ -46,10 +38,9 @@ if [ $stage -le 3 ]; then --rnnlm-ver tensorflow --weight $weight --max-ngram-order $ngram_order \ data/lang_$LM $dir \ data/$mic/${decode_set}_hires ${decode_dir} \ - ${decode_dir}.new.tfrnnlm.lat.${ngram_order}gram.$weight & + ${decode_dir}.tfrnnlm.lat.${ngram_order}gram.$weight & done fi wait -date diff --git a/egs/ami/s5/local/tensorflow/vanilla_rnnlm.py b/egs/ami/s5/local/tensorflow/vanilla_rnnlm.py index 642d7b01172..f6bfeae8b75 100644 --- a/egs/ami/s5/local/tensorflow/vanilla_rnnlm.py +++ b/egs/ami/s5/local/tensorflow/vanilla_rnnlm.py @@ -183,8 +183,8 @@ def attn_cell(): grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), config.max_grad_norm) # optimizer = tf.train.AdamOptimizer() # TODO - optimizer = tf.train.MomentumOptimizer(self._lr, 0.9) # TODO -# optimizer = tf.train.GradientDescentOptimizer(self._lr) # TODO +# optimizer = tf.train.MomentumOptimizer(self._lr, 0.9) # TODO + optimizer = tf.train.GradientDescentOptimizer(self._lr) # TODO self._train_op = optimizer.apply_gradients( zip(grads, tvars), global_step=tf.contrib.framework.get_or_create_global_step()) diff --git a/src/tensorflow/tensorflow-rnnlm-lib.cc b/src/tensorflow/tensorflow-rnnlm-lib.cc index b82a3476c52..b1d7ae7eaa1 100644 --- a/src/tensorflow/tensorflow-rnnlm-lib.cc +++ b/src/tensorflow/tensorflow-rnnlm-lib.cc @@ -85,27 +85,24 @@ KaldiTfRnnlmWrapper::KaldiTfRnnlmWrapper( oos_ = -1; { // input. ifstream ifile(rnn_wordlist.c_str()); - int id; string word; - int i = -1; - while (ifile >> word >> id) { - i++; - assert(i == id); + int id = -1; + eos_ = 0; + while (ifile >> word) { + id++; rnn_label_to_word_.push_back(word); // vector[i] = word int fst_label = fst_word_symbols->Find(word); if (fst::SymbolTable::kNoSymbol == fst_label) { - if (i < 2) continue; // and + if (id == eos_) continue; KALDI_ASSERT(word == "" && oos_ == -1); - oos_ = i; + oos_ = id; continue; } KALDI_ASSERT(fst_label >= 0); - fst_label_to_rnn_label_[fst_label] = i; + fst_label_to_rnn_label_[fst_label] = id; } - bos_ = 1; - eos_ = 0; // TODO(hxu) need to think carefully about these.. } if (fst_label_to_word_.size() > rnn_label_to_word_.size()) { KALDI_ASSERT(oos_ != -1); @@ -137,7 +134,7 @@ KaldiTfRnnlmWrapper::KaldiTfRnnlmWrapper( { std::vector state; Tensor bosword(tensorflow::DT_INT32, {1, 1}); - bosword.scalar()() = bos_; + bosword.scalar()() = eos_; // eos_ is more like a sentence boundary std::vector> inputs = { {"Train/Model/test_word_in", bosword}, diff --git a/src/tensorflow/tensorflow-rnnlm-lib.h b/src/tensorflow/tensorflow-rnnlm-lib.h index 5011b0b138a..5af2ee37a70 100644 --- a/src/tensorflow/tensorflow-rnnlm-lib.h +++ b/src/tensorflow/tensorflow-rnnlm-lib.h @@ -21,16 +21,13 @@ namespace tf_rnnlm { struct KaldiTfRnnlmWrapperOpts { std::string unk_symbol; - std::string bos_symbol; std::string eos_symbol; - KaldiTfRnnlmWrapperOpts() : unk_symbol(""), bos_symbol(""), eos_symbol("") {} + KaldiTfRnnlmWrapperOpts() : unk_symbol(""), eos_symbol("") {} void Register(OptionsItf *opts) { opts->Register("unk-symbol", &unk_symbol, "Symbol for out-of-vocabulary " "words in rnnlm."); - opts->Register("bos-symbol", &eos_symbol, "Beginning of setence symbol in " - "rnnlm."); opts->Register("eos-symbol", &eos_symbol, "End of setence symbol in " "rnnlm."); } @@ -50,7 +47,6 @@ class KaldiTfRnnlmWrapper { } int32 GetEos() const { return eos_; } - int32 GetBos() const { return bos_; } const Tensor& GetInitialContext() const; const Tensor& GetInitialCell() const; @@ -78,7 +74,6 @@ class KaldiTfRnnlmWrapper { Session* session_; // ptf owned here int32 eos_; - int32 bos_; int32 oos_; KALDI_DISALLOW_COPY_AND_ASSIGN(KaldiTfRnnlmWrapper); From 7ef2de4affb004f573c5e4edb9d44ceb41baff9e Mon Sep 17 00:00:00 2001 From: Hainan Xu Date: Tue, 20 Jun 2017 12:39:12 -0400 Subject: [PATCH 23/30] recipe draft finished --- egs/ami/s5/local/tensorflow/lstm.py | 5 ----- egs/ami/s5/local/tensorflow/run_vannila.sh | 2 +- egs/ami/s5/local/tensorflow/vanilla_rnnlm.py | 22 ++++++++------------ 3 files changed, 10 insertions(+), 19 deletions(-) diff --git a/egs/ami/s5/local/tensorflow/lstm.py b/egs/ami/s5/local/tensorflow/lstm.py index 590744ed4fb..8c6a0765e70 100644 --- a/egs/ami/s5/local/tensorflow/lstm.py +++ b/egs/ami/s5/local/tensorflow/lstm.py @@ -339,11 +339,6 @@ def main(_): raw_data = reader.rnnlm_raw_data(FLAGS.data_path, FLAGS.vocab_path) train_data, valid_data, _, word_map = raw_data -# with open(FLAGS.wordlist_save_path, "w") as wmap_file: -# count_pairs = sorted(word_map.items(), key=lambda x: (x[1], x[0])) -# for k, v in count_pairs: -# wmap_file.write(str(k) + " " + str(v) + "\n") - config = get_config() config.vocab_size = len(word_map) eval_config = get_config() diff --git a/egs/ami/s5/local/tensorflow/run_vannila.sh b/egs/ami/s5/local/tensorflow/run_vannila.sh index bdef6f2c42d..71ecd7340ba 100755 --- a/egs/ami/s5/local/tensorflow/run_vannila.sh +++ b/egs/ami/s5/local/tensorflow/run_vannila.sh @@ -11,7 +11,7 @@ weight=0.5 set -e -dir=data/vannila_tensorflow/$model_type +dir=data/vannila_tensorflow_200/$model_type mkdir -p $dir if [ $stage -le 1 ]; then diff --git a/egs/ami/s5/local/tensorflow/vanilla_rnnlm.py b/egs/ami/s5/local/tensorflow/vanilla_rnnlm.py index f6bfeae8b75..2fe11222c73 100644 --- a/egs/ami/s5/local/tensorflow/vanilla_rnnlm.py +++ b/egs/ami/s5/local/tensorflow/vanilla_rnnlm.py @@ -183,8 +183,8 @@ def attn_cell(): grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), config.max_grad_norm) # optimizer = tf.train.AdamOptimizer() # TODO -# optimizer = tf.train.MomentumOptimizer(self._lr, 0.9) # TODO - optimizer = tf.train.GradientDescentOptimizer(self._lr) # TODO + optimizer = tf.train.MomentumOptimizer(self._lr, 0.9) # TODO +# optimizer = tf.train.GradientDescentOptimizer(self._lr) # TODO self._train_op = optimizer.apply_gradients( zip(grads, tvars), global_step=tf.contrib.framework.get_or_create_global_step()) @@ -237,15 +237,15 @@ class TestConfig(object): class SmallConfig(object): """Small config.""" init_scale = 0.1 - learning_rate = 0.1 - max_grad_norm = 0.5 - num_layers = 2 + learning_rate = 0.2 + max_grad_norm = 1 + num_layers = 1 num_steps = 20 hidden_size = 200 - max_epoch = 10 - max_max_epoch = 40 + max_epoch = 4 + max_max_epoch = 20 keep_prob = 1 - lr_decay = 0.5 + lr_decay = 0.95 batch_size = 64 class MediumConfig(object): @@ -330,11 +330,6 @@ def main(_): raw_data = reader.rnnlm_raw_data(FLAGS.data_path, FLAGS.vocab_path) train_data, valid_data, _, word_map = raw_data -# with open(FLAGS.wordlist_save_path, "w") as wmap_file: -# count_pairs = sorted(word_map.items(), key=lambda x: (x[1], x[0])) -# for k, v in count_pairs: -# wmap_file.write(str(k) + " " + str(v) + "\n") - config = get_config() config.vocab_size = len(word_map) eval_config = get_config() @@ -362,6 +357,7 @@ def main(_): with sv.managed_session() as session: for i in range(config.max_max_epoch): lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0) + m.assign_lr(session, config.learning_rate * lr_decay) print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr))) From 8787364ec9191f37f25dcb408f55144ed33b637b Mon Sep 17 00:00:00 2001 From: Hainan Xu Date: Tue, 20 Jun 2017 19:03:46 -0400 Subject: [PATCH 24/30] add new objf; still debugging --- egs/ami/s5/local/tensorflow/lstm_fast.py | 409 +++++++++++++++++++++++ egs/ami/s5/local/tensorflow/run.sh | 4 +- egs/ami/s5/local/tensorflow/run_fast.sh | 49 +++ 3 files changed, 460 insertions(+), 2 deletions(-) create mode 100644 egs/ami/s5/local/tensorflow/lstm_fast.py create mode 100755 egs/ami/s5/local/tensorflow/run_fast.sh diff --git a/egs/ami/s5/local/tensorflow/lstm_fast.py b/egs/ami/s5/local/tensorflow/lstm_fast.py new file mode 100644 index 00000000000..45533eee958 --- /dev/null +++ b/egs/ami/s5/local/tensorflow/lstm_fast.py @@ -0,0 +1,409 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# Modified by Hainan Xu to be used in Kaldi for lattice rescoring 2017 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import sys + +sys.path.insert(0,"/home/hxu/.local/lib/python2.7/site-packages/") + +import inspect +import time + +import numpy as np +import tensorflow as tf + +import reader + +flags = tf.flags +logging = tf.logging + +flags.DEFINE_string( + "model", "small", + "A type of model. Possible options are: small, medium, large.") +flags.DEFINE_string("data_path", None, + "Where the training/test data is stored.") +flags.DEFINE_string("vocab_path", None, + "Where the wordlist file is stored.") +flags.DEFINE_string("save_path", None, + "Model output directory.") +flags.DEFINE_bool("use_fp16", False, + "Train using 16-bit floats instead of 32bit floats") + +FLAGS = flags.FLAGS + + +def data_type(): + return tf.float16 if FLAGS.use_fp16 else tf.float32 + +def new_softmax(labels, logits): +# logits = -logits; +# logits = tf.nn.relu(logits) +# logits = -logits; +# print (labels, logits) + logits = tf.minimum(logits, 0) + target = tf.reshape(labels, [-1]) + exp_logits = tf.exp(logits) + row_sums = tf.reduce_sum(exp_logits, 1) # this is the negative part of the objf +# print (sums) + + t2 = tf.expand_dims(target, 1) + range = tf.expand_dims(tf.range(tf.shape(target)[0]), 1) + ind = tf.concat([range, t2], 1) + res = tf.gather_nd(logits, ind) +# print (res) +# positive_part = tf.reduce_sum(res, 1) +# print (positive_part) + + return -res + row_sums - 1 +# return -res + tf.log(row_sums) # this is the original softmax + +class RNNLMInput(object): + """The input data.""" + + def __init__(self, config, data, name=None): + self.batch_size = batch_size = config.batch_size + self.num_steps = num_steps = config.num_steps + self.epoch_size = ((len(data) // batch_size) - 1) // num_steps + self.input_data, self.targets = reader.rnnlm_producer( + data, batch_size, num_steps, name=name) + + +class RNNLMModel(object): + """The RNNLM model.""" + + def __init__(self, is_training, config, input_): + self._input = input_ + + batch_size = input_.batch_size + num_steps = input_.num_steps + size = config.hidden_size + vocab_size = config.vocab_size + + # Slightly better results can be obtained with forget gate biases + # initialized to 1 but the hyperparameters of the model would need to be + # different than reported in the paper. + def lstm_cell(): + # With the latest TensorFlow source code (as of Mar 27, 2017), + # the BasicLSTMCell will need a reuse parameter which is unfortunately not + # defined in TensorFlow 1.0. To maintain backwards compatibility, we add + # an argument check here: + if 'reuse' in inspect.getargspec( + tf.contrib.rnn.BasicLSTMCell.__init__).args: + return tf.contrib.rnn.BasicLSTMCell( + size, forget_bias=0.0, state_is_tuple=True, + reuse=tf.get_variable_scope().reuse) + else: + return tf.contrib.rnn.BasicLSTMCell( + size, forget_bias=0.0, state_is_tuple=True) + attn_cell = lstm_cell + if is_training and config.keep_prob < 1: + def attn_cell(): + return tf.contrib.rnn.DropoutWrapper( + lstm_cell(), output_keep_prob=config.keep_prob) + self.cell = tf.contrib.rnn.MultiRNNCell( + [attn_cell() for _ in range(config.num_layers)], state_is_tuple=True) + + self._initial_state = self.cell.zero_state(batch_size, data_type()) + self._initial_state_single = self.cell.zero_state(1, data_type()) + + self.initial = tf.reshape(tf.stack(axis=0, values=self._initial_state_single), [config.num_layers, 2, 1, size], name="test_initial_state") + + + # first implement the less efficient version + test_word_in = tf.placeholder(tf.int32, [1, 1], name="test_word_in") + + state_placeholder = tf.placeholder(tf.float32, [config.num_layers, 2, 1, size], name="test_state_in") + # unpacking the input state context + l = tf.unstack(state_placeholder, axis=0) + test_input_state = tuple( + [tf.contrib.rnn.LSTMStateTuple(l[idx][0],l[idx][1]) + for idx in range(config.num_layers)] + ) + + with tf.device("/cpu:0"): + self.embedding = tf.get_variable( + "embedding", [vocab_size, size], dtype=data_type()) + + inputs = tf.nn.embedding_lookup(self.embedding, input_.input_data) + test_inputs = tf.nn.embedding_lookup(self.embedding, test_word_in) + + # test time + with tf.variable_scope("RNN"): + (test_cell_output, test_output_state) = self.cell(test_inputs[:, 0, :], test_input_state) + + test_state_out = tf.reshape(tf.stack(axis=0, values=test_output_state), [config.num_layers, 2, 1, size], name="test_state_out") + test_cell_out = tf.reshape(test_cell_output, [1, size], name="test_cell_out") + # above is the first part of the graph for test + # test-word-in + # > ---- > test-state-out + # test-state-in > test-cell-out + + + # below is the 2nd part of the graph for test + # test-word-out + # > prob(word | test-word-out) + # test-cell-in + + test_word_out = tf.placeholder(tf.int32, [1, 1], name="test_word_out") + cellout_placeholder = tf.placeholder(tf.float32, [1, size], name="test_cell_in") + + softmax_w = tf.get_variable( + "softmax_w", [size, vocab_size], dtype=data_type()) + softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=data_type()) + + test_logits = tf.matmul(cellout_placeholder, softmax_w) + softmax_b + test_softmaxed = tf.nn.softmax(test_logits) + + p_word = test_softmaxed[0, test_word_out[0,0]] + test_out = tf.identity(p_word, name="test_out") + + if is_training and config.keep_prob < 1: + inputs = tf.nn.dropout(inputs, config.keep_prob) + + # Simplified version of models/tutorials/rnn/rnn.py's rnn(). + # This builds an unrolled LSTM for tutorial purposes only. + # In general, use the rnn() or state_saving_rnn() from rnn.py. + # + # The alternative version of the code below is: + # + # inputs = tf.unstack(inputs, num=num_steps, axis=1) + # outputs, state = tf.contrib.rnn.static_rnn( + # cell, inputs, initial_state=self._initial_state) + outputs = [] + state = self._initial_state + with tf.variable_scope("RNN"): + for time_step in range(num_steps): + if time_step > -1: tf.get_variable_scope().reuse_variables() + (cell_output, state) = self.cell(inputs[:, time_step, :], state) + outputs.append(cell_output) + + output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, size]) + logits = tf.matmul(output, softmax_w) + softmax_b + loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example( + [logits], + [tf.reshape(input_.targets, [-1])], + [tf.ones([batch_size * num_steps], dtype=data_type())], + softmax_loss_function=new_softmax) + self._cost = cost = tf.reduce_sum(loss) / batch_size + self._final_state = state + + if not is_training: + return + + self._lr = tf.Variable(0.0, trainable=False) + tvars = tf.trainable_variables() + grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), + config.max_grad_norm) + optimizer = tf.train.GradientDescentOptimizer(self._lr) + self._train_op = optimizer.apply_gradients( + zip(grads, tvars), + global_step=tf.contrib.framework.get_or_create_global_step()) + + self._new_lr = tf.placeholder( + tf.float32, shape=[], name="new_learning_rate") + self._lr_update = tf.assign(self._lr, self._new_lr) + + def assign_lr(self, session, lr_value): + session.run(self._lr_update, feed_dict={self._new_lr: lr_value}) + + @property + def input(self): + return self._input + + @property + def initial_state(self): + return self._initial_state + + @property + def cost(self): + return self._cost + + @property + def final_state(self): + return self._final_state + + @property + def lr(self): + return self._lr + + @property + def train_op(self): + return self._train_op + +class TestConfig(object): + """Tiny config, for testing.""" + init_scale = 0.1 + learning_rate = 1.0 + max_grad_norm = 1 + num_layers = 1 + num_steps = 2 + hidden_size = 2 + max_epoch = 1 + max_max_epoch = 1 + keep_prob = 1.0 + lr_decay = 0.5 + batch_size = 20 + +class SmallConfig(object): + """Small config.""" + init_scale = 0.1 + learning_rate = 1.0 + max_grad_norm = 5 + num_layers = 2 + num_steps = 20 + hidden_size = 200 + max_epoch = 4 + max_max_epoch = 13 + keep_prob = 1.0 + lr_decay = 0.5 + batch_size = 64 + + +class MediumConfig(object): + """Medium config.""" + init_scale = 0.05 + learning_rate = 1.0 + max_grad_norm = 5 + num_layers = 2 + num_steps = 35 + hidden_size = 650 + max_epoch = 6 + max_max_epoch = 39 + keep_prob = 0.5 + lr_decay = 0.8 + batch_size = 20 + + +class LargeConfig(object): + """Large config.""" + init_scale = 0.04 + learning_rate = 1.0 + max_grad_norm = 10 + num_layers = 2 + num_steps = 35 + hidden_size = 1500 + max_epoch = 14 + max_max_epoch = 55 + keep_prob = 0.35 + lr_decay = 1 / 1.15 + batch_size = 20 + + + +def run_epoch(session, model, eval_op=None, verbose=False): + """Runs the model on the given data.""" + start_time = time.time() + costs = 0.0 + iters = 0 + state = session.run(model.initial_state) + + fetches = { + "cost": model.cost, + "final_state": model.final_state, + } + if eval_op is not None: + fetches["eval_op"] = eval_op + + for step in range(model.input.epoch_size): + feed_dict = {} + for i, (c, h) in enumerate(model.initial_state): + feed_dict[c] = state[i].c + feed_dict[h] = state[i].h + + vals = session.run(fetches, feed_dict) + cost = vals["cost"] + state = vals["final_state"] + + + costs += cost + iters += model.input.num_steps + + if verbose and step % (model.input.epoch_size // 10) == 10: + print ("cost is ", costs) + print ("avg cost is ", costs / iters) + print("%.3f perplexity: %.3f speed: %.0f wps" % + (step * 1.0 / model.input.epoch_size, np.exp(costs / iters), + iters * model.input.batch_size / (time.time() - start_time))) + + return np.exp(costs / iters) + + +def get_config(): + if FLAGS.model == "small": + return SmallConfig() + elif FLAGS.model == "medium": + return MediumConfig() + elif FLAGS.model == "large": + return LargeConfig() + elif FLAGS.model == "test": + return TestConfig() + else: + raise ValueError("Invalid model: %s", FLAGS.model) + + +def main(_): + if not FLAGS.data_path: + raise ValueError("Must set --data_path to RNNLM data directory") + + raw_data = reader.rnnlm_raw_data(FLAGS.data_path, FLAGS.vocab_path) + train_data, valid_data, _, word_map = raw_data + + config = get_config() + config.vocab_size = len(word_map) + eval_config = get_config() + eval_config.batch_size = 1 + eval_config.num_steps = 1 + + with tf.Graph().as_default(): + initializer = tf.random_uniform_initializer(-config.init_scale, + config.init_scale) + + with tf.name_scope("Train"): + train_input = RNNLMInput(config=config, data=train_data, name="TrainInput") + with tf.variable_scope("Model", reuse=None, initializer=initializer): + m = RNNLMModel(is_training=True, config=config, input_=train_input) + tf.summary.scalar("Training Loss", m.cost) + tf.summary.scalar("Learning Rate", m.lr) + + with tf.name_scope("Valid"): + valid_input = RNNLMInput(config=config, data=valid_data, name="ValidInput") + with tf.variable_scope("Model", reuse=True, initializer=initializer): + mvalid = RNNLMModel(is_training=False, config=config, input_=valid_input) + tf.summary.scalar("Validation Loss", mvalid.cost) + + sv = tf.train.Supervisor(logdir=FLAGS.save_path) + with sv.managed_session() as session: + for i in range(config.max_max_epoch): + lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0) + m.assign_lr(session, config.learning_rate * lr_decay) + + print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr))) + train_perplexity = run_epoch(session, m, eval_op=m.train_op, + verbose=True) + + print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity)) + valid_perplexity = run_epoch(session, mvalid) + print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity)) + + if FLAGS.save_path: + print("Saving model to %s." % FLAGS.save_path) + sv.saver.save(session, FLAGS.save_path) + +if __name__ == "__main__": + tf.app.run() diff --git a/egs/ami/s5/local/tensorflow/run.sh b/egs/ami/s5/local/tensorflow/run.sh index 7e3150482cc..52989a73ca6 100755 --- a/egs/ami/s5/local/tensorflow/run.sh +++ b/egs/ami/s5/local/tensorflow/run.sh @@ -1,7 +1,7 @@ #!/bin/bash mic=ihm ngram_order=4 -model_type=small +model_type=test stage=1 weight=0.5 @@ -20,7 +20,7 @@ fi mkdir -p $dir/ if [ $stage -le 2 ]; then - python local/tensorflow/rnnlm.py --data_path=$dir --model=$model_type --save_path=$dir/rnnlm --vocab_path=$dir/wordlist.rnn.final + $decode_cmd $dir/train.log python local/tensorflow/rnnlm.py --data_path=$dir --model=$model_type --save_path=$dir/rnnlm --vocab_path=$dir/wordlist.rnn.final fi final_lm=ami_fsh.o3g.kn diff --git a/egs/ami/s5/local/tensorflow/run_fast.sh b/egs/ami/s5/local/tensorflow/run_fast.sh new file mode 100755 index 00000000000..f0d3753ff58 --- /dev/null +++ b/egs/ami/s5/local/tensorflow/run_fast.sh @@ -0,0 +1,49 @@ +#!/bin/bash +mic=ihm +ngram_order=4 +model_type=small +stage=1 +weight=0.5 + +. ./utils/parse_options.sh +. ./cmd.sh +. ./path.sh + +set -e + +dir=data/auto_tensorflow/$model_type +mkdir -p $dir + +if [ $stage -le 1 ]; then + local/tensorflow/prep_data.sh $dir +fi + +mkdir -p $dir/ +if [ $stage -le 2 ]; then + python local/tensorflow/lstm_fast.py --data_path=$dir --model=$model_type --save_path=$dir/rnnlm --vocab_path=$dir/wordlist.rnn.final +# $decode_cmd $dir/train.log python local/tensorflow/rnnlm.py --data_path=$dir --model=$model_type --save_path=$dir/rnnlm --vocab_path=$dir/wordlist.rnn.final +fi + +exit + +final_lm=ami_fsh.o3g.kn +LM=$final_lm.pr1-7 + +if [ $stage -le 3 ]; then +# for decode_set in dev; do + for decode_set in dev eval; do + basedir=exp/$mic/nnet3/tdnn_sp/ + decode_dir=${basedir}/decode_${decode_set} + + # Lattice rescoring + steps/lmrescore_rnnlm_lat.sh \ + --cmd "$tensorflow_cmd --mem 16G" \ + --rnnlm-ver tensorflow --weight $weight --max-ngram-order $ngram_order \ + data/lang_$LM $dir \ + data/$mic/${decode_set}_hires ${decode_dir} \ + ${decode_dir}.tfrnnlm.lat.${ngram_order}gram.$weight & + + done +fi + +wait From f83c0063071972a3cdce44de0a897deee4ab9513 Mon Sep 17 00:00:00 2001 From: Hainan Xu Date: Wed, 21 Jun 2017 12:58:16 -0400 Subject: [PATCH 25/30] new objf working --- egs/ami/s5/local/tensorflow/lstm.py | 2 +- egs/ami/s5/local/tensorflow/lstm_fast.py | 29 +++++++++----------- egs/ami/s5/local/tensorflow/run.sh | 3 +- egs/ami/s5/local/tensorflow/run_fast.sh | 6 ++-- egs/ami/s5/local/tensorflow/vanilla_rnnlm.py | 2 +- src/tensorflow/tensorflow-rnnlm-lib.cc | 11 ++++++-- 6 files changed, 27 insertions(+), 26 deletions(-) diff --git a/egs/ami/s5/local/tensorflow/lstm.py b/egs/ami/s5/local/tensorflow/lstm.py index 8c6a0765e70..1aba92b129b 100644 --- a/egs/ami/s5/local/tensorflow/lstm.py +++ b/egs/ami/s5/local/tensorflow/lstm.py @@ -147,7 +147,7 @@ def attn_cell(): softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=data_type()) test_logits = tf.matmul(cellout_placeholder, softmax_w) + softmax_b - test_softmaxed = tf.nn.softmax(test_logits) + test_softmaxed = tf.nn.log_softmax(test_logits) p_word = test_softmaxed[0, test_word_out[0,0]] test_out = tf.identity(p_word, name="test_out") diff --git a/egs/ami/s5/local/tensorflow/lstm_fast.py b/egs/ami/s5/local/tensorflow/lstm_fast.py index 45533eee958..836ec6d45b8 100644 --- a/egs/ami/s5/local/tensorflow/lstm_fast.py +++ b/egs/ami/s5/local/tensorflow/lstm_fast.py @@ -51,24 +51,23 @@ def data_type(): return tf.float16 if FLAGS.use_fp16 else tf.float32 +def f(x): + x1 = tf.minimum(0.0, x) + + x2 = tf.maximum(0.0, x) + + return tf.exp(x1) + x2 + def new_softmax(labels, logits): -# logits = -logits; -# logits = tf.nn.relu(logits) -# logits = -logits; -# print (labels, logits) logits = tf.minimum(logits, 0) target = tf.reshape(labels, [-1]) - exp_logits = tf.exp(logits) - row_sums = tf.reduce_sum(exp_logits, 1) # this is the negative part of the objf -# print (sums) + f_logits = f(logits) + row_sums = tf.reduce_sum(f_logits, 1) # this is the negative part of the objf t2 = tf.expand_dims(target, 1) range = tf.expand_dims(tf.range(tf.shape(target)[0]), 1) ind = tf.concat([range, t2], 1) res = tf.gather_nd(logits, ind) -# print (res) -# positive_part = tf.reduce_sum(res, 1) -# print (positive_part) return -res + row_sums - 1 # return -res + tf.log(row_sums) # this is the original softmax @@ -166,11 +165,11 @@ def attn_cell(): softmax_w = tf.get_variable( "softmax_w", [size, vocab_size], dtype=data_type()) softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=data_type()) + softmax_b = softmax_b - 9.0 - test_logits = tf.matmul(cellout_placeholder, softmax_w) + softmax_b - test_softmaxed = tf.nn.softmax(test_logits) + test_logits = tf.matmul(cellout_placeholder, tf.transpose(tf.nn.embedding_lookup(tf.transpose(softmax_w), test_word_out[0]))) + softmax_b[test_word_out[0,0]] - p_word = test_softmaxed[0, test_word_out[0,0]] + p_word = test_logits[0, 0] test_out = tf.identity(p_word, name="test_out") if is_training and config.keep_prob < 1: @@ -263,7 +262,7 @@ class TestConfig(object): class SmallConfig(object): """Small config.""" init_scale = 0.1 - learning_rate = 1.0 + learning_rate = 1 max_grad_norm = 5 num_layers = 2 num_steps = 20 @@ -335,8 +334,6 @@ def run_epoch(session, model, eval_op=None, verbose=False): iters += model.input.num_steps if verbose and step % (model.input.epoch_size // 10) == 10: - print ("cost is ", costs) - print ("avg cost is ", costs / iters) print("%.3f perplexity: %.3f speed: %.0f wps" % (step * 1.0 / model.input.epoch_size, np.exp(costs / iters), iters * model.input.batch_size / (time.time() - start_time))) diff --git a/egs/ami/s5/local/tensorflow/run.sh b/egs/ami/s5/local/tensorflow/run.sh index 52989a73ca6..b1aa2d06614 100755 --- a/egs/ami/s5/local/tensorflow/run.sh +++ b/egs/ami/s5/local/tensorflow/run.sh @@ -1,7 +1,7 @@ #!/bin/bash mic=ihm ngram_order=4 -model_type=test +model_type=small stage=1 weight=0.5 @@ -27,7 +27,6 @@ final_lm=ami_fsh.o3g.kn LM=$final_lm.pr1-7 if [ $stage -le 3 ]; then -# for decode_set in dev; do for decode_set in dev eval; do basedir=exp/$mic/nnet3/tdnn_sp/ decode_dir=${basedir}/decode_${decode_set} diff --git a/egs/ami/s5/local/tensorflow/run_fast.sh b/egs/ami/s5/local/tensorflow/run_fast.sh index f0d3753ff58..890119a7006 100755 --- a/egs/ami/s5/local/tensorflow/run_fast.sh +++ b/egs/ami/s5/local/tensorflow/run_fast.sh @@ -20,12 +20,10 @@ fi mkdir -p $dir/ if [ $stage -le 2 ]; then - python local/tensorflow/lstm_fast.py --data_path=$dir --model=$model_type --save_path=$dir/rnnlm --vocab_path=$dir/wordlist.rnn.final + $decode_cmd $dir/train.log python local/tensorflow/lstm_fast.py --data_path=$dir --model=$model_type --save_path=$dir/rnnlm --vocab_path=$dir/wordlist.rnn.final # $decode_cmd $dir/train.log python local/tensorflow/rnnlm.py --data_path=$dir --model=$model_type --save_path=$dir/rnnlm --vocab_path=$dir/wordlist.rnn.final fi -exit - final_lm=ami_fsh.o3g.kn LM=$final_lm.pr1-7 @@ -41,7 +39,7 @@ if [ $stage -le 3 ]; then --rnnlm-ver tensorflow --weight $weight --max-ngram-order $ngram_order \ data/lang_$LM $dir \ data/$mic/${decode_set}_hires ${decode_dir} \ - ${decode_dir}.tfrnnlm.lat.${ngram_order}gram.$weight & + ${decode_dir}.fast.tfrnnlm.lat.${ngram_order}gram.$weight & done fi diff --git a/egs/ami/s5/local/tensorflow/vanilla_rnnlm.py b/egs/ami/s5/local/tensorflow/vanilla_rnnlm.py index 2fe11222c73..6e5c72f6adb 100644 --- a/egs/ami/s5/local/tensorflow/vanilla_rnnlm.py +++ b/egs/ami/s5/local/tensorflow/vanilla_rnnlm.py @@ -141,7 +141,7 @@ def attn_cell(): softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=data_type()) test_logits = tf.matmul(cellout_placeholder, softmax_w) + softmax_b - test_softmaxed = tf.nn.softmax(test_logits) + test_softmaxed = tf.nn.log_softmax(test_logits) p_word = test_softmaxed[0, test_word_out[0,0]] test_out = tf.identity(p_word, name="test_out") diff --git a/src/tensorflow/tensorflow-rnnlm-lib.cc b/src/tensorflow/tensorflow-rnnlm-lib.cc index b1d7ae7eaa1..6c84ded5702 100644 --- a/src/tensorflow/tensorflow-rnnlm-lib.cc +++ b/src/tensorflow/tensorflow-rnnlm-lib.cc @@ -203,10 +203,16 @@ BaseFloat KaldiTfRnnlmWrapper::GetLogProb( float ans; if (word != oos_) { - ans = log(outputs[0].scalar()()); + ans = outputs[0].scalar()(); } else { - ans = log(outputs[0].scalar()() / (num_total_words - num_rnn_words)); + ans = outputs[0].scalar()() - log (num_total_words - num_rnn_words); } + +// if (word != oos_) { +// ans = log(outputs[0].scalar()()); +// } else { +// ans = log(outputs[0].scalar()() / (num_total_words - num_rnn_words)); +// } // std::ostringstream his_str; // for (int i = 0; i < wseq.size(); i++) { // his_str << rnn_label_to_word_[wseq[i]] << "(" << wseq[i] << ") "; @@ -214,6 +220,7 @@ BaseFloat KaldiTfRnnlmWrapper::GetLogProb( // KALDI_LOG << "Computing logprob of word " << rnn_label_to_word_[word] << "(" << word << ")" // << " given history " << his_str.str() << " is " << exp(ans); +// KALDI_LOG << "prob is " << outputs[0].scalar()(); return ans; } From beeb56c6f3a733e516a330a595ccbbb85d9d7996 Mon Sep 17 00:00:00 2001 From: Hainan Xu Date: Thu, 22 Jun 2017 14:29:06 -0400 Subject: [PATCH 26/30] fix small issue --- egs/ami/s5/local/tensorflow/lstm_fast.py | 11 +- egs/ami/s5/local/tensorflow/run_fast.sh | 6 +- src/tensorflow/tensorflow-rnnlm-lib.cc | 149 +++++++++++------------ src/tensorflow/tensorflow-rnnlm-lib.h | 22 ++-- src/tfbin/lattice-lmrescore-tf-rnnlm.cc | 2 +- 5 files changed, 95 insertions(+), 95 deletions(-) diff --git a/egs/ami/s5/local/tensorflow/lstm_fast.py b/egs/ami/s5/local/tensorflow/lstm_fast.py index 836ec6d45b8..e5b7bcc91a2 100644 --- a/egs/ami/s5/local/tensorflow/lstm_fast.py +++ b/egs/ami/s5/local/tensorflow/lstm_fast.py @@ -51,17 +51,18 @@ def data_type(): return tf.float16 if FLAGS.use_fp16 else tf.float32 +# this function does the following: +# return exp(x) if x < 0 +# x if x >= 0 def f(x): x1 = tf.minimum(0.0, x) - x2 = tf.maximum(0.0, x) - return tf.exp(x1) + x2 def new_softmax(labels, logits): - logits = tf.minimum(logits, 0) target = tf.reshape(labels, [-1]) - f_logits = f(logits) + f_logits = tf.exp(logits) +# f_logits = f(logits) row_sums = tf.reduce_sum(f_logits, 1) # this is the negative part of the objf t2 = tf.expand_dims(target, 1) @@ -270,7 +271,7 @@ class SmallConfig(object): max_epoch = 4 max_max_epoch = 13 keep_prob = 1.0 - lr_decay = 0.5 + lr_decay = 0.8 batch_size = 64 diff --git a/egs/ami/s5/local/tensorflow/run_fast.sh b/egs/ami/s5/local/tensorflow/run_fast.sh index 890119a7006..86007258d41 100755 --- a/egs/ami/s5/local/tensorflow/run_fast.sh +++ b/egs/ami/s5/local/tensorflow/run_fast.sh @@ -11,7 +11,7 @@ weight=0.5 set -e -dir=data/auto_tensorflow/$model_type +dir=data/fast_tensorflow/$model_type mkdir -p $dir if [ $stage -le 1 ]; then @@ -20,8 +20,8 @@ fi mkdir -p $dir/ if [ $stage -le 2 ]; then - $decode_cmd $dir/train.log python local/tensorflow/lstm_fast.py --data_path=$dir --model=$model_type --save_path=$dir/rnnlm --vocab_path=$dir/wordlist.rnn.final -# $decode_cmd $dir/train.log python local/tensorflow/rnnlm.py --data_path=$dir --model=$model_type --save_path=$dir/rnnlm --vocab_path=$dir/wordlist.rnn.final + python local/tensorflow/lstm_fast.py --data_path=$dir --model=$model_type --save_path=$dir/rnnlm --vocab_path=$dir/wordlist.rnn.final +# $decode_cmd $dir/train.log python local/tensorflow/lstm_fast.py --data_path=$dir --model=$model_type --save_path=$dir/rnnlm --vocab_path=$dir/wordlist.rnn.final fi final_lm=ami_fsh.o3g.kn diff --git a/src/tensorflow/tensorflow-rnnlm-lib.cc b/src/tensorflow/tensorflow-rnnlm-lib.cc index 6c84ded5702..4e2d6bc6695 100644 --- a/src/tensorflow/tensorflow-rnnlm-lib.cc +++ b/src/tensorflow/tensorflow-rnnlm-lib.cc @@ -1,4 +1,5 @@ -// lm/kaldi-rnnlm.cc +// Copyright 2017 Hainan Xu +// wrapper for tensorflow rnnlm #include #include @@ -11,54 +12,52 @@ #include "util/stl-utils.h" #include "util/text-utils.h" -using tensorflow::Status; - namespace kaldi { +using std::ifstream; using tf_rnnlm::KaldiTfRnnlmWrapper; using tf_rnnlm::TfRnnlmDeterministicFst; -using std::ifstream; +using tensorflow::Status; -KaldiTfRnnlmWrapper::KaldiTfRnnlmWrapper( - const KaldiTfRnnlmWrapperOpts &opts, - const std::string &rnn_wordlist, - const std::string &word_symbol_table_rxfilename, - const std::string &unk_prob_rspecifier, - const std::string &tf_model_path) { - // read the tf model - { - string graph_path = tf_model_path + ".meta"; +void KaldiTfRnnlmWrapper::ReadTfModel(const std::string &tf_model_path) { + string graph_path = tf_model_path + ".meta"; - Status status = tensorflow::NewSession(tensorflow::SessionOptions(), &session_); - if (!status.ok()) { - KALDI_ERR << status.ToString(); - } + Status status = tensorflow::NewSession(tensorflow::SessionOptions(), &session_); + if (!status.ok()) { + KALDI_ERR << status.ToString(); + } - tensorflow::MetaGraphDef graph_def; - status = tensorflow::ReadBinaryProto(tensorflow::Env::Default(), graph_path, &graph_def); - if (!status.ok()) { - KALDI_ERR << status.ToString(); - } + tensorflow::MetaGraphDef graph_def; + status = tensorflow::ReadBinaryProto(tensorflow::Env::Default(), graph_path, &graph_def); + if (!status.ok()) { + KALDI_ERR << status.ToString(); + } - // Add the graph to the session - status = session_->Create(graph_def.graph_def()); - if (!status.ok()) { - KALDI_ERR << status.ToString(); - } + // Add the graph to the session + status = session_->Create(graph_def.graph_def()); + if (!status.ok()) { + KALDI_ERR << status.ToString(); + } - Tensor checkpointPathTensor(tensorflow::DT_STRING, tensorflow::TensorShape()); - checkpointPathTensor.scalar()() = tf_model_path; - - status = session_->Run( - {{ graph_def.saver_def().filename_tensor_name(), checkpointPathTensor },}, - {}, - {graph_def.saver_def().restore_op_name()}, - nullptr); - if (!status.ok()) { - KALDI_ERR << status.ToString(); - } + Tensor checkpointPathTensor(tensorflow::DT_STRING, tensorflow::TensorShape()); + checkpointPathTensor.scalar()() = tf_model_path; + + status = session_->Run( + {{ graph_def.saver_def().filename_tensor_name(), checkpointPathTensor },}, + {}, + {graph_def.saver_def().restore_op_name()}, + nullptr); + if (!status.ok()) { + KALDI_ERR << status.ToString(); } +} -// GetInitialContext(&initial_context_); +KaldiTfRnnlmWrapper::KaldiTfRnnlmWrapper( + const KaldiTfRnnlmWrapperOpts &opts, + const std::string &rnn_wordlist, + const std::string &word_symbol_table_rxfilename, + const std::string &unk_prob_rspecifier, + const std::string &tf_model_path): opts_(opts) { + ReadTfModel(tf_model_path); fst::SymbolTable *fst_word_symbols = NULL; if (!(fst_word_symbols = @@ -79,9 +78,9 @@ KaldiTfRnnlmWrapper::KaldiTfRnnlmWrapper( } fst_label_to_rnn_label_.resize(fst_word_symbols->NumSymbols(), -1); - num_total_words = fst_word_symbols->NumSymbols(); + // read rnn wordlist and then generate ngram-label-to-rnn-label map oos_ = -1; { // input. ifstream ifile(rnn_wordlist.c_str()); @@ -94,9 +93,12 @@ KaldiTfRnnlmWrapper::KaldiTfRnnlmWrapper( int fst_label = fst_word_symbols->Find(word); if (fst::SymbolTable::kNoSymbol == fst_label) { - if (id == eos_) continue; - - KALDI_ASSERT(word == "" && oos_ == -1); + if (id == eos_) { + KALDI_ASSERT(word == opts_.eos_symbol); + continue; + } +// KALDI_LOG << word << " " << opts_.unk_symbol << " " << oos_; + KALDI_ASSERT(word == opts_.unk_symbol && oos_ == -1); oos_ = id; continue; } @@ -107,9 +109,9 @@ KaldiTfRnnlmWrapper::KaldiTfRnnlmWrapper( if (fst_label_to_word_.size() > rnn_label_to_word_.size()) { KALDI_ASSERT(oos_ != -1); } -// rnn_label_to_word_.push_back(""); num_rnn_words = rnn_label_to_word_.size(); + // we must have a oos symbol in the wordlist if (oos_ == -1) { return; } @@ -119,34 +121,36 @@ KaldiTfRnnlmWrapper::KaldiTfRnnlmWrapper( } } + AcquireInitialTensors(); +} + +void KaldiTfRnnlmWrapper::AcquireInitialTensors() { + Status status; + // get the initial context { - Status status; - // get the initial context - { - std::vector state; - status = session_->Run(std::vector>(), {"Train/Model/test_initial_state"}, {}, &state); - if (!status.ok()) { - KALDI_ERR << status.ToString(); - } - initial_context_ = state[0]; + std::vector state; + status = session_->Run(std::vector>(), {"Train/Model/test_initial_state"}, {}, &state); + if (!status.ok()) { + KALDI_ERR << status.ToString(); } + initial_context_ = state[0]; + } - { - std::vector state; - Tensor bosword(tensorflow::DT_INT32, {1, 1}); - bosword.scalar()() = eos_; // eos_ is more like a sentence boundary + { + std::vector state; + Tensor bosword(tensorflow::DT_INT32, {1, 1}); + bosword.scalar()() = eos_; // eos_ is more like a sentence boundary - std::vector> inputs = { - {"Train/Model/test_word_in", bosword}, - {"Train/Model/test_state_in", initial_context_}, - }; + std::vector> inputs = { + {"Train/Model/test_word_in", bosword}, + {"Train/Model/test_state_in", initial_context_}, + }; - status = session_->Run(inputs, {"Train/Model/test_cell_out"}, {}, &state); - if (!status.ok()) { - KALDI_ERR << status.ToString(); - } - initial_cell_ = state[0]; + status = session_->Run(inputs, {"Train/Model/test_cell_out"}, {}, &state); + if (!status.ok()) { + KALDI_ERR << status.ToString(); } + initial_cell_ = state[0]; } } @@ -171,11 +175,9 @@ BaseFloat KaldiTfRnnlmWrapper::GetLogProb( {"Train/Model/test_word_out", thisword}, {"Train/Model/test_state_in", context_in}, {"Train/Model/test_cell_in", cell_in}, -// {"Train/Model/test_cell_in", cell_in}, }; // The session will initialize the outputs - // Run the session, evaluating our "c" operation from the graph Status status = session_->Run(inputs, {"Train/Model/test_out", @@ -208,16 +210,6 @@ BaseFloat KaldiTfRnnlmWrapper::GetLogProb( ans = outputs[0].scalar()() - log (num_total_words - num_rnn_words); } -// if (word != oos_) { -// ans = log(outputs[0].scalar()()); -// } else { -// ans = log(outputs[0].scalar()() / (num_total_words - num_rnn_words)); -// } -// std::ostringstream his_str; -// for (int i = 0; i < wseq.size(); i++) { -// his_str << rnn_label_to_word_[wseq[i]] << "(" << wseq[i] << ") "; -// } - // KALDI_LOG << "Computing logprob of word " << rnn_label_to_word_[word] << "(" << word << ")" // << " given history " << his_str.str() << " is " << exp(ans); // KALDI_LOG << "prob is " << outputs[0].scalar()(); @@ -240,7 +232,6 @@ TfRnnlmDeterministicFst::TfRnnlmDeterministicFst(int32 max_ngram_order, // Uses empty history for . std::vector") {} + KaldiTfRnnlmWrapperOpts() : unk_symbol(""), eos_symbol("") {} void Register(OptionsItf *opts) { opts->Register("unk-symbol", &unk_symbol, "Symbol for out-of-vocabulary " @@ -40,7 +40,6 @@ class KaldiTfRnnlmWrapper { const std::string &word_symbol_table_rxfilename, const std::string &unk_prob_rspecifier, const std::string &tf_model_path); -// Session* session); ~KaldiTfRnnlmWrapper() { session_->Close(); @@ -48,18 +47,21 @@ class KaldiTfRnnlmWrapper { int32 GetEos() const { return eos_; } + // get an all-zero Tensor of the size that matches the hidden state of the TF model const Tensor& GetInitialContext() const; + + // get the 2nd-to-last layer of RNN when feeding input of + // (initial-context, sentence-boundary) const Tensor& GetInitialCell() const; // compute p(word | wseq) and return the log of that // the computation used the input cell, // which is the 2nd-to-last layer of the RNNLM associated with history wseq; // - // and we generate (context_out, new_cell) by passing (context_in, word) into the nnet + // and we generate (context_out, new_cell) by passing (context_in, word) into the model BaseFloat GetLogProb(int32 word, -/// const std::vector &wseq, - const Tensor &context_in, - const Tensor &cell_in, + const Tensor &context_in, // context to pass into RNN + const Tensor &cell_in, // 2nd-to-last layer Tensor *context_out, Tensor *new_cell); @@ -67,12 +69,18 @@ class KaldiTfRnnlmWrapper { std::vector rnn_label_to_word_; std::vector fst_label_to_word_; private: + void ReadTfModel(const std::string &tf_model_path); + + // do queries on the session to get the initial tensors (cell + context) + void AcquireInitialTensors(); + + KaldiTfRnnlmWrapperOpts opts_; Tensor initial_context_; Tensor initial_cell_; int32 num_total_words; int32 num_rnn_words; - Session* session_; // ptf owned here + Session* session_; // owned here int32 eos_; int32 oos_; diff --git a/src/tfbin/lattice-lmrescore-tf-rnnlm.cc b/src/tfbin/lattice-lmrescore-tf-rnnlm.cc index 0278759151f..171654f7efb 100644 --- a/src/tfbin/lattice-lmrescore-tf-rnnlm.cc +++ b/src/tfbin/lattice-lmrescore-tf-rnnlm.cc @@ -38,7 +38,7 @@ int main(int argc, char *argv[]) { "composing with the wrapped LM using a special type of composition\n" "algorithm. Determinization will be applied on the composed lattice.\n" "\n" - "Usage: lattice-lmrescore-rnnlm [options] \\\n" + "Usage: lattice-lmrescore-tf-rnnlm [options] \\\n" " \\\n" " \n" " e.g.: lattice-lmrescore-rnnlm --lm-scale=-1.0 words.txt \\\n" From 023f2baf71f2541415b9a0c7365906be023f5ab0 Mon Sep 17 00:00:00 2001 From: Hainan Xu Date: Fri, 23 Jun 2017 14:28:59 -0400 Subject: [PATCH 27/30] add better handling of OOS words --- egs/ami/s5/local/tensorflow/prep_data.sh | 2 +- egs/ami/s5/local/tensorflow/run_fast.sh | 6 +++-- egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh | 2 +- src/tensorflow/tensorflow-rnnlm-lib.cc | 34 ++++++++++++++++++++++-- src/tensorflow/tensorflow-rnnlm-lib.h | 5 +++- src/tfbin/lattice-lmrescore-tf-rnnlm.cc | 28 +++++++++++-------- 6 files changed, 59 insertions(+), 18 deletions(-) diff --git a/egs/ami/s5/local/tensorflow/prep_data.sh b/egs/ami/s5/local/tensorflow/prep_data.sh index a763aaf15bd..49825781c7c 100755 --- a/egs/ami/s5/local/tensorflow/prep_data.sh +++ b/egs/ami/s5/local/tensorflow/prep_data.sh @@ -53,7 +53,7 @@ done # OK we'll train the RNNLM on this data. -touch $dir/unk.probs # dummy file, not used for cued-rnnlm +cat $dir/unk_class.counts | awk '{print $2, $1}' > $dir/unk.probs # dummy file, not used for cued-rnnlm cp $dir/wordlist.rnn $dir/wordlist.rnn.final diff --git a/egs/ami/s5/local/tensorflow/run_fast.sh b/egs/ami/s5/local/tensorflow/run_fast.sh index 86007258d41..629a7e064fc 100755 --- a/egs/ami/s5/local/tensorflow/run_fast.sh +++ b/egs/ami/s5/local/tensorflow/run_fast.sh @@ -1,6 +1,6 @@ #!/bin/bash mic=ihm -ngram_order=4 +ngram_order=3 model_type=small stage=1 weight=0.5 @@ -27,6 +27,7 @@ fi final_lm=ami_fsh.o3g.kn LM=$final_lm.pr1-7 +date if [ $stage -le 3 ]; then # for decode_set in dev; do for decode_set in dev eval; do @@ -39,9 +40,10 @@ if [ $stage -le 3 ]; then --rnnlm-ver tensorflow --weight $weight --max-ngram-order $ngram_order \ data/lang_$LM $dir \ data/$mic/${decode_set}_hires ${decode_dir} \ - ${decode_dir}.fast.tfrnnlm.lat.${ngram_order}gram.$weight & + ${decode_dir}.unk.fast.tfrnnlm.lat.${ngram_order}gram.$weight & done fi wait +date diff --git a/egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh b/egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh index ceac46b5eb9..d3e6ca73dd4 100755 --- a/egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh +++ b/egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh @@ -58,7 +58,7 @@ fi if [ "$rnnlm_ver" == "tensorflow" ]; then rescoring_binary="lattice-lmrescore-tf-rnnlm" - first_arg=$rnnlm_dir/wordlist.rnn.final + first_arg="$first_arg $rnnlm_dir/wordlist.rnn.final" fi oldlm=$oldlang/G.fst diff --git a/src/tensorflow/tensorflow-rnnlm-lib.cc b/src/tensorflow/tensorflow-rnnlm-lib.cc index 4e2d6bc6695..7513b9207a7 100644 --- a/src/tensorflow/tensorflow-rnnlm-lib.cc +++ b/src/tensorflow/tensorflow-rnnlm-lib.cc @@ -18,6 +18,28 @@ using tf_rnnlm::KaldiTfRnnlmWrapper; using tf_rnnlm::TfRnnlmDeterministicFst; using tensorflow::Status; +void SetUnkPenalties(const string &filename, const fst::SymbolTable& fst_word_symbols, + std::vector *out) { + if (filename == "") + return; + out->resize(fst_word_symbols.NumSymbols(), 0); // default is 0 + ifstream ifile(filename.c_str()); + string word; + float count, total_count = 0; + while (ifile >> word >> count) { + int id = fst_word_symbols.Find(word); + KALDI_ASSERT(id != fst::SymbolTable::kNoSymbol); + (*out)[id] = count; + total_count += count; + } + + for (int i = 0; i < out->size(); i++) { + if ((*out)[i] != 0) { + (*out)[i] = log ((*out)[i] / total_count); + } + } +} + void KaldiTfRnnlmWrapper::ReadTfModel(const std::string &tf_model_path) { string graph_path = tf_model_path + ".meta"; @@ -55,7 +77,7 @@ KaldiTfRnnlmWrapper::KaldiTfRnnlmWrapper( const KaldiTfRnnlmWrapperOpts &opts, const std::string &rnn_wordlist, const std::string &word_symbol_table_rxfilename, - const std::string &unk_prob_rspecifier, + const std::string &unk_prob_file, const std::string &tf_model_path): opts_(opts) { ReadTfModel(tf_model_path); @@ -122,6 +144,7 @@ KaldiTfRnnlmWrapper::KaldiTfRnnlmWrapper( } AcquireInitialTensors(); + SetUnkPenalties(unk_prob_file, *fst_word_symbols, &unk_probs_); } void KaldiTfRnnlmWrapper::AcquireInitialTensors() { @@ -156,6 +179,7 @@ void KaldiTfRnnlmWrapper::AcquireInitialTensors() { BaseFloat KaldiTfRnnlmWrapper::GetLogProb( int32 word, + int32 fst_word, // const std::vector &wseq, const Tensor &context_in, const Tensor &cell_in, @@ -207,7 +231,11 @@ BaseFloat KaldiTfRnnlmWrapper::GetLogProb( if (word != oos_) { ans = outputs[0].scalar()(); } else { - ans = outputs[0].scalar()() - log (num_total_words - num_rnn_words); + if (unk_probs_.size() == 0) { + ans = outputs[0].scalar()() - log (num_total_words - num_rnn_words); + } else { + ans = outputs[0].scalar()() + unk_probs_[fst_word]; + } } // KALDI_LOG << "Computing logprob of word " << rnn_label_to_word_[word] << "(" << word << ")" @@ -249,6 +277,7 @@ fst::StdArc::Weight TfRnnlmDeterministicFst::Final(StateId s) { std::vector