Commit 8ee8fd5

Lattice-to-sequence (neulab#547)
* add __getitem__ and get_unpadded_sent to Sentence
* started integrating / updating lattices
* series of bug fixes to lattice encoder
* fixed config file and serializable interface
* added documentation
* removed last remaining from_spec from preproc code
* WIP: added LatticeFromPlfExtractor
* extracting lattices from PLF works
* implement lattice reader
* config file with lattice reader working
* removed some specialized code
* remove broken arc dropout
* move Lattice class to sent module
* moved lattice reader to input_readers
* simplified lattice embedder by delegating to base embedder
* simplify config
* moved lattice embedder to embedders module
* move lattice lstm out of specialized encoders package
* minor cleanup
* add link
* fix inconsistency in preproc code
* remove unused config file
* add Lattice.__len__
* simplified code by passing on expr seqs instead of lattices
* lattice plotting
* remove legacy comment
* made lattice plotting more flexible
* fix lattice.reversed()
* remove unused fields
* lattice padding
* fix access to bwd prob
* prepared LatticeBiasedMlpAttender
* finished LatticeBiasedMlpAttender
* text_input feature for LatticeReader
* fix reading in of bwd probs
* unpadded sent handling for lattice
* 'flatten' option for lattice reader
* remove duplicated classes
1 parent c17b2de commit 8ee8fd5

16 files changed: +857 -106 lines

docs/getting_started.rst (+1 -1)

@@ -11,7 +11,7 @@ Prerequisites
 Before running *xnmt* you must install the required packages, including Python bindings for
 `DyNet <https://github.com/clab/dynet>`_.
 This can be done by running ``pip install -r requirements.txt``.
-(There is also ``requirements-extra.txt`` that has some requirements for utility scripts that are not part of *xnmt* itself.)
+(There are also optional package requirements under ``requirements-extra/`` for features that are non-central to *xnmt*.)
 
 Next, install *xnmt* by running ``python setup.py install`` for normal usage or ``python setup.py develop`` for
 development.

examples/05_preproc.yaml (+3 -3)

@@ -60,9 +60,9 @@
       - '{DATA_OUT}/train.tok.norm.filter.ja'
       - '{DATA_OUT}/train.tok.norm.filter.en'
     specs:
-    - type: length
-      min: 1
-      max: 60
+    - !SentenceFiltererLength
+      min_all: 1
+      max_all: 60
   - !PreprocVocab
     in_files:
     - '{DATA_OUT}/train.tok.norm.ja'

examples/data/fisher_dev.en (+5, new file)

@@ -0,0 +1,5 @@
+afternoon .
+good afternoon
+my name is carmen , in chicago . you ?
+oh , my name is ricardo .
+of

examples/data/fisher_dev.en.vocab (+15, new file)

@@ -0,0 +1,15 @@
+afternoon
+carmen
+chicago
+good
+in
+is
+my
+name
+of
+oh
+ricardo
+you
+,
+?
+.

examples/data/fisher_dev.es.plf (+5, new file)

@@ -0,0 +1,5 @@
+((('tal', -0.727828979, 1),('tardes', -2.55085754, 2),('tarde', -0.823196411, 2),),(('ves', -2.08010864, 1),('vez', -0.731903076, 1),('de', -0.931167603, 1),),)
+((('buenas', 0, 1),),(('tardes', 0, 1),),)
+((('mi', 0, 1),),(('nombre', 0, 1),),(('es', 0, 1),),(('carmen', 0, 1),),(('de', 0, 1),),(('chicago', 0, 1),),(('y', 0, 1),),(('tu', 0, 1),),)
+((('no', -0.760681152, 1),('o', -1.75738525, 5),('oh', -1.02124023, 7),),(('me', 0, 1),),(('no', 0, 1),),(('me', 0, 1),),(('ricardo', 0, 11),),(('me', 0, 1),),(('no', 0, 3),),(('me', -0.817199707, 1),('mi', -0.582763672, 4),),(('no', 0, 1),),(('me', 0, 1),),(('ricardo', 0, 5),),(('nombre', -0.5859375, 1),('no', -0.813262939, 2),),(('ricardo', 0, 3),),(('me', 0, 1),),(('ricardo', 0, 1),),)
+((('yea', -2.66070557, 1),('sí', -1.18453979, 1),('ya', -1.33209229, 1),('yeah', -1.02085876, 1),),)
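
These lines are in PLF ("Python Lattice Format", as used by lattice-aware MT tools such as Moses and cdec): each line is a nested Python tuple with one inner tuple per lattice state, and each arc is a (token, score, distance) triple, where distance is the number of states the arc jumps ahead. As a rough illustration of how such a line decodes into arcs, here is a minimal sketch (not the actual LatticeFromPlfExtractor code; plf_to_arcs is a hypothetical helper):

import ast

def plf_to_arcs(plf_line):
    # Each PLF line is a tuple of states; each state is a tuple of
    # (token, score, distance) arcs, where `distance` says how many
    # states ahead the arc points.
    states = ast.literal_eval(plf_line)
    arcs = []
    for state_i, outgoing in enumerate(states):
        for token, score, distance in outgoing:
            arcs.append((state_i, state_i + distance, token, score))
    return arcs

# The second lattice above is a plain two-word chain:
print(plf_to_arcs("((('buenas', 0, 1),),(('tardes', 0, 1),),)"))
# -> [(0, 1, 'buenas', 0), (1, 2, 'tardes', 0)]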

examples/data/fisher_dev.es.vocab (+23, new file)

@@ -0,0 +1,23 @@
+buenas
+carmen
+chicago
+de
+es
+me
+mi
+no
+nombre
+o
+oh
+ricardo
+sí
+tal
+tarde
+tardes
+tu
+ves
+vez
+y
+ya
+yea
+yeah

requirements-extra/lattice.txt (+1, new file)

@@ -0,0 +1 @@
+graphviz

test/config/lattice.yaml (+60, new file)

@@ -0,0 +1,60 @@
+lattice: !Experiment
+  exp_global: !ExpGlobal
+    default_layer_dim: 32
+    dropout: 0.3
+  preproc: !PreprocRunner
+    overwrite: False
+    tasks:
+    - !PreprocExtract
+      in_files:
+      - examples/data/fisher_dev.es.plf
+      out_files:
+      - examples/output/fisher_dev.es.xlat
+      specs: !LatticeFromPlfExtractor {}
+  model: !DefaultTranslator
+    src_embedder: !SimpleWordEmbedder {}
+    encoder: !BiLatticeLSTMTransducer
+      layers: 2
+    attender: !LatticeBiasedMlpAttender {}
+    trg_embedder: !SimpleWordEmbedder {}
+    decoder: !AutoRegressiveDecoder
+      rnn: !UniLSTMSeqTransducer
+        layers: 1
+      transform: !AuxNonLinear
+        output_dim: 512
+        activation: 'tanh'
+      bridge: !CopyBridge {}
+      scorer: !Softmax {}
+    src_reader: !LatticeReader
+      vocab: !Vocab
+        vocab_file: examples/data/fisher_dev.es.vocab
+    trg_reader: !PlainTextReader
+      vocab: !Vocab
+        _xnmt_id: trg_vocab
+        vocab_file: examples/data/fisher_dev.en.vocab
+  train: !SimpleTrainingRegimen
+    trainer: !AdamTrainer
+      alpha: 0.0003
+    run_for_epochs: 10
+    batcher: !SrcBatcher
+      batch_size: 1
+    restart_trainer: True
+    lr_decay: 0.8
+    patience: 5
+    src_file: examples/output/fisher_dev.es.xlat
+    trg_file: examples/data/fisher_dev.en
+    dev_tasks:
+    - !AccuracyEvalTask
+      eval_metrics: bleu
+      src_file: examples/output/fisher_dev.es.xlat
+      ref_file: examples/data/fisher_dev.en
+      hyp_file: examples/output/{EXP}.dev_hyp
+    - !LossEvalTask
+      src_file: examples/output/fisher_dev.es.xlat
+      ref_file: examples/data/fisher_dev.en
+  evaluate:
+  - !AccuracyEvalTask
+    eval_metrics: bleu
+    src_file: examples/output/fisher_dev.es.xlat
+    ref_file: examples/data/fisher_dev.en
+    hyp_file: examples/output/{EXP}.test_hyp
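
Taken together, this config exercises the full pipeline added in this commit: !PreprocExtract with !LatticeFromPlfExtractor converts the PLF file into the serialized node/arc lattice format, !LatticeReader reads it back as source sentences, !BiLatticeLSTMTransducer encodes the lattice, and !LatticeBiasedMlpAttender biases the decoder's attention toward confident nodes. The unit test added below runs it end to end.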

test/config/preproc.yaml (+3 -3)

@@ -36,9 +36,9 @@ standard-preproc: !Experiment
       - test/tmp/head.tok.norm.filter.ja
       - test/tmp/head.tok.norm.filter.en
     specs:
-    - type: length
-      min: 1
-      max: 50
+    - !SentenceFiltererLength
+      min_all: 1
+      max_all: 50
   - !PreprocVocab
     in_files:
     - test/tmp/head.tok.norm.ja

test/test_run.py (+3)

@@ -31,6 +31,9 @@ def test_ensembling(self):
   def test_forced(self):
     run.main(["test/config/forced.yaml"])
 
+  def test_lattice(self):
+    run.main(["test/config/lattice.yaml"])
+
   def test_lm(self):
     run.main(["test/config/lm.yaml"])

xnmt/__init__.py (+1)

@@ -51,6 +51,7 @@
 import xnmt.train.regimens
 import xnmt.train.tasks
 import xnmt.transducers.convolution
+import xnmt.transducers.lattice
 import xnmt.transducers.network_in_network
 import xnmt.transducers.positional
 import xnmt.transducers.pyramidal

xnmt/input_readers.py (+70 -1)

@@ -1,6 +1,6 @@
+import ast
 from itertools import zip_longest
 from functools import lru_cache
-import ast
 from typing import Iterator, Optional, Sequence, Union
 import numbers
 
@@ -468,6 +468,75 @@ def read_sent(self, line, idx):
   def read_sents(self, filename, filter_ids=None):
     return [l for l in self.iterate_filtered(filename, filter_ids)]
 
+
+class LatticeReader(BaseTextReader, Serializable):
+  """
+  Reads lattices from a text file.
+
+  The expected lattice file format is as follows:
+
+  * 1 line per lattice
+  * lines are serialized python lists / tuples
+  * 2 lists per lattice:
+    - list of nodes, with every node a 4-tuple: (lexicon_entry, fwd_log_prob, marginal_log_prob, bwd_log_prob)
+    - list of arcs, each arc a tuple: (node_id_start, node_id_end)
+      - node_id references the nodes and is 0-indexed
+      - node_id_start < node_id_end
+  * All paths must share a common start and end node, i.e. <s> and </s> need to be contained in the lattice
+
+  A simple example lattice:
+    [('<s>', 0.0, 0.0, 0.0), ('buenas', 0, 0.0, 0.0), ('tardes', 0, 0.0, 0.0), ('</s>', 0.0, 0.0, 0.0)],[(0, 1), (1, 2), (2, 3)]
+
+  Args:
+    vocab: Vocabulary to convert string tokens to integer ids. If not given, plain text will be assumed to contain
+           space-separated integer ids.
+    text_input: If ``True``, assume a standard text file as input and convert it to a flat lattice.
+    flatten: If ``True``, convert to a flat lattice, with all probabilities set to 1.
+  """
+  yaml_tag = '!LatticeReader'
+
+  @serializable_init
+  def __init__(self, vocab: Vocab, text_input: bool = False, flatten: bool = False) -> None:
+    self.vocab = vocab
+    self.text_input = text_input
+    self.flatten = flatten
+
+  def read_sent(self, line, idx):
+    if self.text_input:
+      nodes = [sent.LatticeNode(nodes_prev=[], nodes_next=[1], value=Vocab.SS,
+                                fwd_log_prob=0.0, marginal_log_prob=0.0, bwd_log_prob=0.0)]
+      for word in line.strip().split():
+        nodes.append(
+          sent.LatticeNode(nodes_prev=[len(nodes)-1], nodes_next=[len(nodes)+1], value=self.vocab.convert(word),
+                           fwd_log_prob=0.0, marginal_log_prob=0.0, bwd_log_prob=0.0))
+      nodes.append(
+        sent.LatticeNode(nodes_prev=[len(nodes)-1], nodes_next=[], value=Vocab.ES,
+                         fwd_log_prob=0.0, marginal_log_prob=0.0, bwd_log_prob=0.0))
+    else:
+      node_list, arc_list = ast.literal_eval(line)
+      nodes = [sent.LatticeNode(nodes_prev=[], nodes_next=[],
+                                value=self.vocab.convert(item[0]),
+                                fwd_log_prob=item[1], marginal_log_prob=item[2], bwd_log_prob=item[3])
+               for item in node_list]
+      if self.flatten:
+        for node_i in range(len(nodes)):
+          if node_i < len(nodes)-1: nodes[node_i].nodes_next.append(node_i+1)
+          if node_i > 0: nodes[node_i].nodes_prev.append(node_i-1)
+          nodes[node_i].fwd_log_prob = nodes[node_i].bwd_log_prob = nodes[node_i].marginal_log_prob = 0.0
+      else:
+        for from_index, to_index in arc_list:
+          nodes[from_index].nodes_next.append(to_index)
+          nodes[to_index].nodes_prev.append(from_index)
+
+    assert nodes[0].value == self.vocab.SS
+    assert nodes[-1].value == self.vocab.ES
+
+    return sent.Lattice(idx=idx, nodes=nodes, vocab=self.vocab)
+
+  def vocab_size(self):
+    return len(self.vocab)
+
+
 ###### A utility function to read a parallel corpus
 def read_parallel_corpus(src_reader: InputReader,
                          trg_reader: InputReader,
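
To make the expected serialized format concrete, the following standalone sketch mimics the parsing in read_sent's non-text branch on the docstring's example line, using plain lists in place of xnmt's sent.LatticeNode (illustrative only, not part of the commit):

import ast

line = ("[('<s>', 0.0, 0.0, 0.0), ('buenas', 0.0, 0.0, 0.0), "
        "('tardes', 0.0, 0.0, 0.0), ('</s>', 0.0, 0.0, 0.0)], "
        "[(0, 1), (1, 2), (2, 3)]")

# A bare `a, b` expression evaluates to a tuple, so a single
# literal_eval call yields both the node list and the arc list.
node_list, arc_list = ast.literal_eval(line)

# Wire up successor/predecessor lists the same way read_sent does.
nodes_next = [[] for _ in node_list]
nodes_prev = [[] for _ in node_list]
for from_index, to_index in arc_list:
    nodes_next[from_index].append(to_index)
    nodes_prev[to_index].append(from_index)

assert node_list[0][0] == '<s>' and node_list[-1][0] == '</s>'
print(nodes_next)  # [[1], [2], [3], []]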

xnmt/modelparts/attenders.py (+54 -1)

@@ -1,10 +1,11 @@
 import math
 import numbers
 
+import numpy as np
 import dynet as dy
 
 from xnmt import logger
-from xnmt import batchers, expression_seqs, param_collections, param_initializers
+from xnmt import batchers, expression_seqs, events, param_collections, param_initializers
 from xnmt.persistence import serializable_init, Serializable, Ref, bare
 
 class Attender(object):
@@ -203,4 +204,56 @@ def calc_context(self, state: dy.Expression) -> dy.Expression:
     attention = self.calc_attention(state)
     return self.I * attention
 
+class LatticeBiasedMlpAttender(MlpAttender, Serializable):
+  """
+  Modified MLP attention, where lattices are assumed as input and the attention is biased toward confident nodes.
+
+  Args:
+    input_dim: input dimension
+    state_dim: dimension of state inputs
+    hidden_dim: hidden MLP dimension
+    param_init: how to initialize weight matrices
+    bias_init: how to initialize bias vectors
+    truncate_dec_batches: whether the decoder drops batch elements as soon as these are masked at some time step.
+  """
+
+  yaml_tag = '!LatticeBiasedMlpAttender'
+
+  @events.register_xnmt_handler
+  @serializable_init
+  def __init__(self,
+               input_dim: numbers.Integral = Ref("exp_global.default_layer_dim"),
+               state_dim: numbers.Integral = Ref("exp_global.default_layer_dim"),
+               hidden_dim: numbers.Integral = Ref("exp_global.default_layer_dim"),
+               param_init: param_initializers.ParamInitializer = Ref("exp_global.param_init", default=bare(param_initializers.GlorotInitializer)),
+               bias_init: param_initializers.ParamInitializer = Ref("exp_global.bias_init", default=bare(param_initializers.ZeroInitializer)),
+               truncate_dec_batches: bool = Ref("exp_global.truncate_dec_batches", default=False)) -> None:
+    super().__init__(input_dim=input_dim, state_dim=state_dim, hidden_dim=hidden_dim, param_init=param_init,
+                     bias_init=bias_init, truncate_dec_batches=truncate_dec_batches)
+
+  @events.handle_xnmt_event
+  def on_start_sent(self, src):
+    self.cur_sent_bias = np.full((src.sent_len(), 1, src.batch_size()), -1e10)
+    for batch_i, lattice_batch_elem in enumerate(src):
+      for node_i, node in enumerate(lattice_batch_elem.nodes):
+        self.cur_sent_bias[node_i, 0, batch_i] = node.marginal_log_prob
+    self.cur_sent_bias_expr = None
+
+  def calc_attention(self, state):
+    V = dy.parameter(self.pV)
+    U = dy.parameter(self.pU)
+
+    WI = self.WI
+    curr_sent_mask = self.curr_sent.mask
+    if self.truncate_dec_batches:
+      if curr_sent_mask: state, WI, curr_sent_mask = batchers.truncate_batches(state, WI, curr_sent_mask)
+      else: state, WI = batchers.truncate_batches(state, WI)
+    h = dy.tanh(dy.colwise_add(WI, V * state))
+    scores = dy.transpose(U * h)
+    if curr_sent_mask is not None:
+      scores = curr_sent_mask.add_to_tensor_expr(scores, multiplicator=-1e10)
+    if self.cur_sent_bias_expr is None: self.cur_sent_bias_expr = dy.inputTensor(self.cur_sent_bias, batched=True)
+    normalized = dy.softmax(scores + self.cur_sent_bias_expr)
+    self.attention_vecs.append(normalized)
+    return normalized
