internal merge of PR tensorflow#1411

lgeiger · kpe · commit 50a358847b8e · 2019-03-02T23:17:27.000+01:00
PiperOrigin-RevId: 231608988
diff --git a/tensor2tensor/data_generators/speech_recognition.py b/tensor2tensor/data_generators/speech_recognition.py
@@ -122,7 +122,7 @@ def preprocess_example(self, example, mode, hparams):
       # This replaces CMVN estimation on data
       var_epsilon = 1e-09
       mean = tf.reduce_mean(mel_fbanks, keepdims=True, axis=1)
-      variance = tf.reduce_mean(tf.squared_difference(mel_fbanks, mean),
+      variance = tf.reduce_mean(tf.square(mel_fbanks - mean),
                                 keepdims=True, axis=1)
       mel_fbanks = (mel_fbanks - mean) * tf.rsqrt(variance + var_epsilon)
 
diff --git a/tensor2tensor/layers/common_hparams.py b/tensor2tensor/layers/common_hparams.py
@@ -75,8 +75,6 @@ def basic_params1():
       # Mixed precision training only supports exponential scaling currently
       # To disable the scaler, see to 0/False
       mixed_precision_optimizer_loss_scaler="exponential",
-      # Determines the initial loss scaling value for mixed precision
-      mixed_precision_optimizer_init_loss_scale=2**15,
       # Whether to zero gradients that were not computed, so that the
       # appropriate slots are created. Useful for sharing checkpoints between
       # models with different sets of heads.
diff --git a/tensor2tensor/layers/common_layers.py b/tensor2tensor/layers/common_layers.py
@@ -256,7 +256,7 @@ def standardize_images(x):
     x = tf.to_float(tf.reshape(x, [-1] + x_shape[-3:]))
     x_mean = tf.reduce_mean(x, axis=[1, 2], keepdims=True)
     x_variance = tf.reduce_mean(
-        tf.squared_difference(x, x_mean), axis=[1, 2], keepdims=True)
+        tf.square(x - x_mean), axis=[1, 2], keepdims=True)
     num_pixels = tf.to_float(x_shape[-2] * x_shape[-3])
     x = (x - x_mean) / tf.maximum(tf.sqrt(x_variance), tf.rsqrt(num_pixels))
     return tf.reshape(x, x_shape)
@@ -634,8 +634,7 @@ def layer_norm_compute(x, epsilon, scale, bias):
   """Layer norm raw computation."""
   epsilon, scale, bias = [cast_like(t, x) for t in [epsilon, scale, bias]]
   mean = tf.reduce_mean(x, axis=[-1], keepdims=True)
-  variance = tf.reduce_mean(
-      tf.squared_difference(x, mean), axis=[-1], keepdims=True)
+  variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keepdims=True)
   norm_x = (x - mean) * tf.rsqrt(variance + epsilon)
   return norm_x * scale + bias
 
@@ -691,8 +690,7 @@ def l2_norm(x, filters=None, epsilon=1e-6, name=None, reuse=None):
         "l2_norm_bias", [filters], initializer=tf.zeros_initializer())
     epsilon, scale, bias = [cast_like(t, x) for t in [epsilon, scale, bias]]
     mean = tf.reduce_mean(x, axis=[-1], keepdims=True)
-    l2norm = tf.reduce_sum(
-        tf.squared_difference(x, mean), axis=[-1], keepdims=True)
+    l2norm = tf.reduce_sum(tf.square(x - mean), axis=[-1], keepdims=True)
     norm_x = (x - mean) * tf.rsqrt(l2norm + epsilon)
     return norm_x * scale + bias
 
@@ -3348,7 +3346,7 @@ def get_sorted_projections(x):
 
     proj1 = get_sorted_projections(logits1)
     proj2 = get_sorted_projections(logits2)
-    dist = tf.reduce_mean(tf.squared_difference(proj1, proj2))
+    dist = tf.reduce_mean(tf.square(proj1 - proj2))
     if return_logits:
       return dist, logits1, logits2
     return dist
diff --git a/tensor2tensor/layers/discretization.py b/tensor2tensor/layers/discretization.py
@@ -217,8 +217,8 @@ def embedding_lookup(x,
 
   # Currently, we use the mean scaling for the commitment loss, as opposed to
   # summing across all non-batch dimensions.
-  q_loss = tf.reduce_mean(tf.squared_difference(tf.stop_gradient(x), x_means))
-  e_loss = tf.reduce_mean(tf.squared_difference(x, tf.stop_gradient(x_means)))
+  q_loss = tf.reduce_mean(tf.square((tf.stop_gradient(x) - x_means)))
+  e_loss = tf.reduce_mean(tf.square(x - tf.stop_gradient(x_means)))
   return x_means_hot, x_means, q_loss, e_loss, neg_q_entropy
 
 
@@ -469,8 +469,7 @@ def gumbel_softmax(x,
     # Add losses that prevent too few being used.
     distrib = tf.reshape(logsm, [-1, 2**z_size]) * maxvhot
     d_mean = tf.reduce_mean(distrib, axis=[0], keep_dims=True)
-    d_variance = tf.reduce_mean(
-        tf.squared_difference(distrib, d_mean), axis=[0])
+    d_variance = tf.reduce_mean(tf.square(distrib - d_mean), axis=[0])
     d_dev = -tf.reduce_mean(d_variance)
     ret = s
 
@@ -925,7 +924,7 @@ def vq_nearest_neighbor(x, means,
     x_means_hot = tf.one_hot(x_means_idx, bottleneck_size)
   x_means_hot_flat = tf.reshape(x_means_hot, [-1, bottleneck_size])
   x_means = tf.matmul(x_means_hot_flat, means)
-  e_loss = tf.reduce_mean(tf.squared_difference(x, tf.stop_gradient(x_means)))
+  e_loss = tf.reduce_mean(tf.square(x - tf.stop_gradient(x_means)))
   return x_means_hot, e_loss, dist
 
 
@@ -1334,8 +1333,7 @@ def gumbel_softmax_discrete_bottleneck(x,
   x_means_assignments_flat = tf.reshape(x_means_assignments,
                                         [-1, bottleneck_size])
   x_means = tf.matmul(x_means_assignments_flat, means)
-  commitment_loss = tf.reduce_mean(
-      tf.squared_difference(x, tf.stop_gradient(x_means)))
+  commitment_loss = tf.reduce_mean(tf.square(x - tf.stop_gradient(x_means)))
 
   # Update the ema variables.
   updated_ema_count = moving_averages.assign_moving_average(
diff --git a/tensor2tensor/layers/modalities.py b/tensor2tensor/layers/modalities.py
@@ -736,8 +736,7 @@ class VideoModalityL2(VideoModalityL1):
   """Modality for videos with L2 loss."""
 
   def internal_loss(self, logits, targets):
-    return tf.nn.relu(
-        tf.squared_difference(logits, targets) - self.cutoff * self.cutoff)
+    return tf.nn.relu((logits - targets)**2 - self.cutoff * self.cutoff)
 
 
 class VideoModalityL2Raw(VideoModalityL2):
@@ -917,7 +916,7 @@ def targets_bottom(self, x):
     return tf.to_float(x)
 
   def loss(self, body_output, targets):
-    loss = tf.squared_difference(body_output, tf.to_float(targets))
+    loss = tf.square(body_output - tf.to_float(targets))
     return tf.reduce_mean(loss), tf.constant(1.0)
 
 
diff --git a/tensor2tensor/layers/vq_discrete.py b/tensor2tensor/layers/vq_discrete.py
@@ -138,10 +138,8 @@ def embedding_lookup(self, x, means):
         x_means_hot, [-1, self.hparams.num_blocks, self.hparams.block_v_size])
     x_means = tf.matmul(tf.transpose(x_means_hot_flat, perm=[1, 0, 2]), means)
     x_means = tf.transpose(x_means, [1, 0, 2])
-    q_loss = tf.reduce_mean(
-        tf.squared_difference(tf.stop_gradient(x), x_means))
-    e_loss = tf.reduce_mean(
-        tf.squared_difference(x, tf.stop_gradient(x_means)))
+    q_loss = tf.reduce_mean(tf.square((tf.stop_gradient(x) - x_means)))
+    e_loss = tf.reduce_mean((x - tf.stop_gradient(x_means))**2)
     return x_means_hot, x_means, q_loss, e_loss
 
   def bit_to_int(self, x_bit, num_bits, base=2):
diff --git a/tensor2tensor/models/__init__.py b/tensor2tensor/models/__init__.py
@@ -38,7 +38,6 @@
 from tensor2tensor.models import revnet
 from tensor2tensor.models import shake_shake
 from tensor2tensor.models import slicenet
-from tensor2tensor.models import text_cnn
 from tensor2tensor.models import transformer
 from tensor2tensor.models import vanilla_gan
 from tensor2tensor.models import xception
diff --git a/tensor2tensor/models/research/autoencoders.py b/tensor2tensor/models/research/autoencoders.py
@@ -219,8 +219,7 @@ def body(self, features):
         # minimized by just setting x=0 and b=0 -- so we don't want too much
         # of the influence of this, and we stop-gradient to not zero-out x.
         x_stop = tf.stop_gradient(x)
-        xb_loss = tf.reduce_mean(tf.reduce_sum(
-            tf.squared_difference(x_stop, b), axis=-1))
+        xb_loss = tf.reduce_mean(tf.reduce_sum(tf.square(x_stop - b), axis=-1))
         # To prevent this loss from exploding we clip at 1, but anneal clipping.
         clip_max = 1.0 / common_layers.inverse_exp_decay(
             warm_step, min_value=0.001)
diff --git a/tensor2tensor/models/research/transformer_nat.py b/tensor2tensor/models/research/transformer_nat.py
@@ -65,7 +65,7 @@ def vq_nearest_neighbor(x, hparams):
     x_means_idx = tf.argmax(-dist, axis=-1)
     x_means_hot = tf.one_hot(x_means_idx, depth=bottleneck_size)
   x_means = tf.matmul(x_means_hot, means)
-  e_loss = tf.reduce_mean(tf.squared_difference(x, tf.stop_gradient(x_means)))
+  e_loss = tf.reduce_mean(tf.square(x - tf.stop_gradient(x_means)))
   return x_means_hot, e_loss
 
 
diff --git a/tensor2tensor/models/research/transformer_vae.py b/tensor2tensor/models/research/transformer_vae.py
@@ -436,8 +436,7 @@ def ae_transformer_internal(inputs,
         losses["neg_q_entropy"] = neg_q_entropy * hparams.entropy_scale
       else:
         inputs_c = decode_transformer(inputs, ed, targets_c, hparams, "dec_c")
-        losses["latent_pred"] = tf.reduce_mean(
-            tf.squared_difference(inputs_c, targets_c)) * 20
+        losses["latent_pred"] = tf.reduce_mean((inputs_c - targets_c)**2) * 20
         def bn_inputs():
           with tf.variable_scope(tf.get_variable_scope(), reuse=True):
             bn, _, _, _, _ = hparams.bottleneck(
diff --git a/tensor2tensor/models/research/universal_transformer.py b/tensor2tensor/models/research/universal_transformer.py
@@ -459,6 +459,7 @@ def universal_transformer_base_tpu():
   hparams.add_step_timing_signal = False
   return hparams
 
+
 @registry.register_hparams
 def universal_transformer_big():
   hparams = universal_transformer_base()
@@ -468,13 +469,6 @@ def universal_transformer_big():
   return hparams
 
 
-@registry.register_hparams
-def universal_transformer_base_fp16():
-  hparams = transformer.transformer_base()
-  hparams = update_hparams_for_universal_transformer(hparams)
-  hparams.activation_dtype = 'float16'
-  return hparams
-
 @registry.register_hparams
 def universal_transformer_small():
   hparams = transformer.transformer_base()
diff --git a/tensor2tensor/models/research/universal_transformer_util.py b/tensor2tensor/models/research/universal_transformer_util.py
@@ -1442,7 +1442,7 @@ def add_position_timing_signal(x, step, hparams):
       length, channels, start_index=index)
 
   if hparams.add_or_concat_timing_signal == "add":
-    x_with_timing = x + common_layers.cast_like(signal, x)
+    x_with_timing = x + signal
 
   elif hparams.add_or_concat_timing_signal == "concat":
     batch_size = common_layers.shape_list(x)[0]
@@ -1479,7 +1479,7 @@ def add_step_timing_signal(x, step, hparams):
         channels, step, num_steps)
 
   if hparams.add_or_concat_timing_signal == "add":
-    x_with_timing = x + common_layers.cast_like(signal, x)
+    x_with_timing = x + signal
 
   elif hparams.add_or_concat_timing_signal == "concat":
     batch_size = common_layers.shape_list(x)[0]
diff --git a/tensor2tensor/models/revnet_test.py b/tensor2tensor/models/revnet_test.py
@@ -24,15 +24,13 @@ class RevnetTest(tf.test.TestCase):
   def testH(self):
     rev_block_input = tf.random_uniform([1, 299, 299, 3])
     rev_block_output = revnet.downsample_bottleneck(rev_block_input, 256)
-    self.assertEqual(rev_block_output.get_shape().as_list(),
-                      [1, 299, 299, 256])
+    self.assertEqual(rev_block_output.get_shape().as_list(), [1, 299, 299, 256])
 
   def testHStride(self):
     rev_block_input = tf.random_uniform([2, 299, 299, 256])
     rev_block_output = revnet.downsample_bottleneck(
         rev_block_input, 512, stride=2, scope='HStride')
-    self.assertEqual(rev_block_output.get_shape().as_list(),
-                      [2, 150, 150, 512])
+    self.assertEqual(rev_block_output.get_shape().as_list(), [2, 150, 150, 512])
 
   def testInit(self):
     images = tf.random_uniform([1, 299, 299, 3])
diff --git a/tensor2tensor/models/video/epva.py b/tensor2tensor/models/video/epva.py
@@ -590,8 +590,7 @@ def mean_squared_error(true, pred):
   Returns:
     mean squared error between ground truth and predicted image.
   """
-  result = tf.reduce_sum(
-      tf.squared_difference(true, pred)) / tf.to_float(tf.size(pred))
+  result = tf.reduce_sum(tf.square(true - pred)) / tf.to_float(tf.size(pred))
   return result
 
 
diff --git a/tensor2tensor/utils/expert_utils.py b/tensor2tensor/utils/expert_utils.py
@@ -364,7 +364,7 @@ def cv_squared(x):
   epsilon = 1e-10
   float_size = tf.to_float(tf.size(x)) + epsilon
   mean = tf.reduce_sum(x) / float_size
-  variance = tf.reduce_sum(tf.squared_difference(x, mean)) / float_size
+  variance = tf.reduce_sum(tf.square(x - mean)) / float_size
   return variance / (tf.square(mean) + epsilon)
 
 
diff --git a/tensor2tensor/utils/optimize.py b/tensor2tensor/utils/optimize.py
@@ -204,11 +204,9 @@ def __init__(self, optimizer_name, lr, hparams, use_tpu=False):  # pylint: disab
         raise ValueError("Mixed precision training only supports the "
                          "exponential loss scaler")
       else:
-        tf.logging.info(("Using Exponential Update Loss Scaler with",
-                         "init loss scale of {}".format(
-                           hparams.mixed_precision_optimizer_init_loss_scale)))
+        tf.logging.info("Using Exponential Update Loss Scaler")
         manager = tf.contrib.mixed_precision.ExponentialUpdateLossScaleManager(
-            init_loss_scale=hparams.mixed_precision_optimizer_init_loss_scale,
+            init_loss_scale=2**15,
             incr_every_n_steps=2000,
             decr_every_n_nan_or_inf=2,
             incr_ratio=2,
diff --git a/tensor2tensor/utils/yellowfin.py b/tensor2tensor/utils/yellowfin.py
@@ -392,7 +392,7 @@ def _get_lr_tensor(self):
     Returns:
       The lr_t.
     """
-    lr = tf.squared_difference(1.0, tf.sqrt(self._mu)) / self._h_min
+    lr = (1.0 - tf.sqrt(self._mu))**2 / self._h_min
     return lr
 
   def _get_mu_tensor(self):