This repository was archived by the owner on Jul 7, 2023. It is now read-only.

Commit ca628e4

Lukasz Kaiser authored and Copybara-Service committed
Clean up stochastic discrete video model by moving code and using discretization layers.
PiperOrigin-RevId: 217386874
1 parent be78912 commit ca628e4

4 files changed: +104 additions, −69 deletions

tensor2tensor/layers/common_layers.py

Lines changed: 4 additions & 2 deletions
@@ -3019,8 +3019,10 @@ def get_res():
   if is_xla_compiled():
     return get_res()
   else:
-    return tf.cond(
-        tf.less(tf.train.get_global_step(), steps), get_res, lambda: x1)
+    cur_step = tf.train.get_global_step()
+    if cur_step is None:
+      return x1  # Step not available, probably eval mode, don't mix.
+    return tf.cond(tf.less(cur_step, steps), get_res, lambda: x1)
 
 
 def brelu(x):
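Why the guard: tf.train.get_global_step() returns None when no global-step variable exists in the graph (typical of standalone eval or inference graphs), and the old code then fed None straight into tf.less, which fails. A minimal sketch of the same pattern in TF 1.x; mix_after_warmup is a hypothetical stand-in for common_layers.mix:

import tensorflow as tf

def mix_after_warmup(x_mixed, x_plain, warmup_steps):
  # Same guard as in common_layers.mix: if no global step exists
  # (e.g. a pure eval graph), skip mixing and return the plain value.
  cur_step = tf.train.get_global_step()
  if cur_step is None:
    return x_plain
  # Before warmup_steps, use the mixed value; afterwards, the plain one.
  return tf.cond(tf.less(cur_step, warmup_steps),
                 lambda: x_mixed, lambda: x_plain)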

tensor2tensor/layers/discretization.py

Lines changed: 79 additions & 2 deletions
@@ -783,6 +783,81 @@ def discrete_bottleneck(inputs,
   return outputs_dense, outputs_discrete, extra_loss, embed_fn, neg_q_entropy
 
 
+def predict_bits_with_lstm(prediction_source, state_size, total_num_bits,
+                           target_bits=None, bits_at_once=8, temperature=1.0):
+  """Predict a sequence of bits (a latent) with LSTM, both training and infer.
+
+  Given a tensor on which the predictions are based (prediction_source), we use
+  a single-layer LSTM with state of size state_size to predict total_num_bits,
+  which we predict in groups of size bits_at_once. During training, we use
+  target_bits as input to the LSTM (teacher forcing) and return the target_bits
+  together with the prediction loss. During inference, we sample with the given
+  temperature and return the predicted sequence and loss 0.
+
+  Args:
+    prediction_source: a Tensor of shape [batch_size, ...] used to create
+      the initial state and the first input to the LSTM.
+    state_size: python integer, the size of the LSTM state.
+    total_num_bits: python integer, how many bits in total to predict.
+    target_bits: a tensor of shape [batch_size, total_num_bits] used during
+      training as the target to predict; each element should be -1 or 1.
+    bits_at_once: python integer, how many bits to predict at once.
+    temperature: python float, temperature used for sampling during inference.
+
+  Returns:
+    a pair (bits, loss) with the predicted bit sequence, which is a Tensor of
+    shape [batch_size, total_num_bits] with elements either -1 or 1, and a loss
+    used to train the predictions against the provided target_bits.
+  """
+
+  with tf.variable_scope("predict_bits_with_lstm"):
+    # Layers and cell state creation.
+    lstm_cell = tf.contrib.rnn.LSTMCell(state_size)
+    discrete_predict = tf.layers.Dense(2**bits_at_once, name="discrete_predict")
+    discrete_embed = tf.layers.Dense(state_size, name="discrete_embed")
+    batch_size = common_layers.shape_list(prediction_source)[0]
+    layer_pred = tf.layers.flatten(prediction_source)
+    prediction = tf.layers.dense(layer_pred, state_size, name="istate")
+    c_state = tf.layers.dense(layer_pred, state_size, name="cstate")
+    m_state = tf.layers.dense(layer_pred, state_size, name="mstate")
+    state = (c_state, m_state)
+
+    # Prediction mode if no targets are given.
+    if target_bits is None:
+      outputs = []
+      for i in range(total_num_bits // bits_at_once):
+        output, state = lstm_cell(prediction, state)
+        discrete_logits = discrete_predict(output)
+        discrete_samples = common_layers.sample_with_temperature(
+            discrete_logits, temperature)
+        outputs.append(tf.expand_dims(discrete_samples, axis=1))
+        prediction = discrete_embed(tf.one_hot(discrete_samples, 256))
+      outputs = tf.concat(outputs, axis=1)
+      outputs = int_to_bit(outputs, bits_at_once)
+      outputs = tf.reshape(outputs, [batch_size, total_num_bits])
+      return 2 * outputs - 1, 0.0
+
+    # Training mode, calculating loss.
+    assert total_num_bits % bits_at_once == 0
+    d_pred = tf.reshape(tf.maximum(tf.stop_gradient(target_bits), 0), [
+        batch_size, total_num_bits // bits_at_once, bits_at_once])
+    d_int = bit_to_int(d_pred, bits_at_once)
+    tf.summary.histogram("target_integers", tf.reshape(d_int, [-1]))
+    d_hot = tf.one_hot(d_int, 2**bits_at_once, axis=-1)
+    d_pred = discrete_embed(d_hot)
+    pred = tf.concat([tf.expand_dims(prediction, axis=1), d_pred], axis=1)
+    outputs = []
+    for i in range(total_num_bits // bits_at_once):
+      output, state = lstm_cell(pred[:, i, :], state)
+      outputs.append(tf.expand_dims(output, axis=1))
+    outputs = tf.concat(outputs, axis=1)
+    d_int_pred = discrete_predict(outputs)
+    pred_loss = tf.losses.sparse_softmax_cross_entropy(
+        logits=d_int_pred, labels=d_int)
+    pred_loss = tf.reduce_mean(pred_loss)
+    return target_bits, pred_loss
+
+
 # New API for discretization bottlenecks:
 # * Each method is separate and provides 2 functions:
 #   * The [method]_bottleneck function returns discretized state.
@@ -1281,6 +1356,7 @@ def tanh_discrete_bottleneck(x, bottleneck_bits, bottleneck_noise,
                              discretize_warmup_steps, mode):
   """Simple discretization through tanh, flip bottleneck_noise many bits."""
   x = tf.layers.dense(x, bottleneck_bits, name="tanh_discrete_bottleneck")
+  d0 = tf.stop_gradient(2.0 * tf.to_float(tf.less(0.0, x))) - 1.0
   if mode == tf.estimator.ModeKeys.TRAIN:
     x += tf.truncated_normal(
         common_layers.shape_list(x), mean=0.0, stddev=0.2)
@@ -1292,7 +1368,7 @@ def tanh_discrete_bottleneck(x, bottleneck_bits, bottleneck_noise,
   d *= noise
   d = common_layers.mix(d, x, discretize_warmup_steps,
                         mode == tf.estimator.ModeKeys.TRAIN)
-  return d, 0.0
+  return d, d0
 
 
 def tanh_discrete_unbottleneck(x, hidden_size):
@@ -1345,9 +1421,10 @@ def isemhash_unbottleneck(x, hidden_size, isemhash_filter_size_multiplier=1.0):
 def parametrized_bottleneck(x, hparams):
   """Meta-function calling all the above bottlenecks with hparams."""
   if hparams.bottleneck_kind == "tanh_discrete":
-    return tanh_discrete_bottleneck(
+    d, _ = tanh_discrete_bottleneck(
         x, hparams.bottleneck_bits, hparams.bottleneck_noise * 0.5,
         hparams.discretize_warmup_steps, hparams.mode)
+    return d, 0.0
   if hparams.bottleneck_kind == "isemhash":
     return isemhash_bottleneck(
         x, hparams.bottleneck_bits, hparams.bottleneck_noise * 0.5,
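For orientation, here is how the two modes of predict_bits_with_lstm are driven, following the docstring above. This is a sketch under TF 1.x assumptions; the batch size, source shape, state size, and bit count are illustrative, and each mode gets its own graph because the function creates variables under a fixed scope:

import tensorflow as tf
from tensor2tensor.layers import discretization

# Inference-time graph: no targets, bits are sampled autoregressively.
with tf.Graph().as_default():
  source = tf.zeros([8, 16, 16, 32])  # any [batch_size, ...] tensor
  sampled_bits, zero_loss = discretization.predict_bits_with_lstm(
      source, 128, 64, temperature=0.5)
  # sampled_bits: [8, 64] with elements in {-1, 1}; zero_loss is 0.0.

# Training-time graph: teacher-force on known bits, get a scalar loss.
with tf.Graph().as_default():
  source = tf.zeros([8, 16, 16, 32])
  target = 2.0 * tf.to_float(
      tf.less(0.5, tf.random_uniform([8, 64]))) - 1.0  # stand-in bits
  _, pred_loss = discretization.predict_bits_with_lstm(
      source, 128, 64, target_bits=target)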

tensor2tensor/models/video/basic_stochastic.py

Lines changed: 20 additions & 64 deletions
@@ -72,43 +72,26 @@ def inject_latent(self, layer, inputs, target):
     filters = hparams.hidden_size
     kernel = (4, 4)
     layer_shape = common_layers.shape_list(layer)
-    batch_size = layer_shape[0]
-    state_size = hparams.latent_predictor_state_size
-    lstm_cell = tf.contrib.rnn.LSTMCell(state_size)
-    discrete_predict = tfl.Dense(256, name="discrete_predict")
-    discrete_embed = tfl.Dense(state_size, name="discrete_embed")
-
-    def add_d(layer, d):
-      z_mul = tfl.dense(d, final_filters, name="unbottleneck_mul")
+
+    def add_bits(layer, bits):
+      z_mul = tfl.dense(bits, final_filters, name="unbottleneck_mul")
       if not hparams.complex_addn:
         return layer + z_mul
       layer *= tf.nn.sigmoid(z_mul)
-      z_add = tfl.dense(d, final_filters, name="unbottleneck_add")
+      z_add = tfl.dense(bits, final_filters, name="unbottleneck_add")
       layer += z_add
       return layer
 
     if self.is_predicting:
       if hparams.full_latent_tower:
         rand = tf.random_uniform(layer_shape[:-1] + [hparams.bottleneck_bits])
+        bits = 2.0 * tf.to_float(tf.less(0.5, rand)) - 1.0
       else:
-        layer_pred = tfl.flatten(layer)
-        prediction = tfl.dense(layer_pred, state_size, name="istate")
-        c_state = tfl.dense(layer_pred, state_size, name="cstate")
-        m_state = tfl.dense(layer_pred, state_size, name="mstate")
-        state = (c_state, m_state)
-        outputs = []
-        for i in range(hparams.bottleneck_bits // 8):
-          output, state = lstm_cell(prediction, state)
-          discrete_logits = discrete_predict(output)
-          discrete_samples = common_layers.sample_with_temperature(
-              discrete_logits, hparams.latent_predictor_temperature)
-          outputs.append(tf.expand_dims(discrete_samples, axis=1))
-          prediction = discrete_embed(tf.one_hot(discrete_samples, 256))
-        outputs = tf.concat(outputs, axis=1)
-        outputs = discretization.int_to_bit(outputs, 8)
-        rand = tf.reshape(outputs, [batch_size, 1, 1, hparams.bottleneck_bits])
-      d = 2.0 * tf.to_float(tf.less(0.5, rand)) - 1.0
-      return add_d(layer, d), 0.0
+        bits, _ = discretization.predict_bits_with_lstm(
+            layer, hparams.latent_predictor_state_size, hparams.bottleneck_bits,
+            temperature=hparams.latent_predictor_temperature)
+        bits = tf.expand_dims(tf.expand_dims(bits, axis=1), axis=2)
+      return add_bits(layer, bits), 0.0
 
     # Embed.
     frames = tf.concat(inputs + [target], axis=-1)
@@ -131,43 +114,16 @@ def add_d(layer, d):
     else:
       x = common_layers.double_discriminator(x)
       x = tf.expand_dims(tf.expand_dims(x, axis=1), axis=1)
-    x = tfl.dense(x, hparams.bottleneck_bits, name="bottleneck")
-    x0 = tf.tanh(x)
-    d = x0 + tf.stop_gradient(2.0 * tf.to_float(tf.less(0.0, x0)) - 1.0 - x0)
-    pred_loss = 0.0
+
+    bits, bits_clean = discretization.tanh_discrete_bottleneck(
+        x, hparams.bottleneck_bits, hparams.bottleneck_noise,
+        hparams.discretize_warmup_steps, hparams.mode)
     if not hparams.full_latent_tower:
-      d_pred = tf.reshape(tf.maximum(tf.stop_gradient(d), 0), [
-          batch_size, hparams.bottleneck_bits // 8, 8])
-      d_int = discretization.bit_to_int(d_pred, 8)
-      tf.summary.histogram("d_int", tf.reshape(d_int, [-1]))
-      d_hot = tf.one_hot(d_int, 256, axis=-1)
-      d_pred = discrete_embed(d_hot)
-      layer_pred = tfl.flatten(layer)
-      prediction0 = tfl.dense(layer_pred, state_size, name="istate")
-      c_state = tfl.dense(layer_pred, state_size, name="cstate")
-      m_state = tfl.dense(layer_pred, state_size, name="mstate")
-      pred = tf.concat([tf.expand_dims(prediction0, axis=1), d_pred], axis=1)
-      state = (c_state, m_state)
-      outputs = []
-      for i in range(hparams.bottleneck_bits // 8):
-        output, state = lstm_cell(pred[:, i, :], state)
-        outputs.append(tf.expand_dims(output, axis=1))
-      outputs = tf.concat(outputs, axis=1)
-      d_int_pred = discrete_predict(outputs)
-      pred_loss = tf.losses.sparse_softmax_cross_entropy(
-          logits=d_int_pred, labels=d_int)
-      pred_loss = tf.reduce_mean(pred_loss)
-    if hparams.mode == tf.estimator.ModeKeys.TRAIN:
-      x += tf.truncated_normal(
-          common_layers.shape_list(x), mean=0.0, stddev=0.2)
-    x = tf.tanh(x)
-    noise = tf.random_uniform(common_layers.shape_list(x))
-    noise = 2.0 * tf.to_float(tf.less(hparams.bottleneck_noise, noise)) - 1.0
-    x *= noise
-    d = x + tf.stop_gradient(2.0 * tf.to_float(tf.less(0.0, x)) - 1.0 - x)
-    p = common_layers.inverse_lin_decay(hparams.discrete_warmup_steps)
-    d = tf.where(tf.less(tf.random_uniform([batch_size]), p), d, x)
-    return add_d(layer, d), pred_loss
+      _, pred_loss = discretization.predict_bits_with_lstm(
+          layer, hparams.latent_predictor_state_size, hparams.bottleneck_bits,
+          target_bits=bits_clean)
+
+    return add_bits(layer, bits), pred_loss
 
 
 @registry.register_hparams
@@ -224,7 +180,7 @@ def next_frame_basic_stochastic_discrete():
   hparams.learning_rate_schedule = "linear_warmup * constant"
   hparams.add_hparam("bottleneck_bits", 64)
   hparams.add_hparam("bottleneck_noise", 0.02)
-  hparams.add_hparam("discrete_warmup_steps", 40000)
+  hparams.add_hparam("discretize_warmup_steps", 40000)
   hparams.add_hparam("full_latent_tower", False)
   hparams.add_hparam("latent_predictor_state_size", 128)
   hparams.add_hparam("latent_predictor_temperature", 0.5)

tensor2tensor/rl/trainer_model_based_params.py

Lines changed: 1 addition & 1 deletion
@@ -203,7 +203,7 @@ def rlmb_base_stochastic_discrete():
   """Base setting with stochastic discrete model."""
   hparams = rlmb_base()
   hparams.learning_rate_bump = 1.0
-  hparams.grayscale = True
+  hparams.grayscale = False
   hparams.generative_model = "next_frame_basic_stochastic_discrete"
   hparams.generative_model_params = "next_frame_basic_stochastic_discrete"
   return hparams
