This repository was archived by the owner on Jul 7, 2023. It is now read-only.

Commit 9729521

Lukasz Kaiser authored and Copybara-Service committed
Remove local sharding to CPU (it was only for debugging anyway) and enable DistributionStrategy by default.
PiperOrigin-RevId: 219397234
1 parent 03d9876 commit 9729521

4 files changed: +3 additions, -12 deletions

tensor2tensor/bin/t2t_trainer.py

Lines changed: 1 addition & 4 deletions
@@ -65,10 +65,8 @@
 flags.DEFINE_integer("intra_op_parallelism_threads", 0,
                      "Number of intra_op_parallelism_threads to use for CPU. "
                      "See TensorFlow config.proto for details.")
-# TODO(hinsu): Enable DistributionStrategy by default once performance gap
-# between DistributionStrategy and Parallelism is resolved.
 flags.DEFINE_bool(
-    "optionally_use_dist_strat", False,
+    "optionally_use_dist_strat", True,
     "Whether to use TensorFlow DistributionStrategy instead of explicitly "
     "replicating the model. DistributionStrategy is used only if the "
     "model replication configuration is supported by the DistributionStrategy.")

@@ -239,7 +237,6 @@ def create_run_config(hp, output_dir=None):
       keep_checkpoint_every_n_hours=FLAGS.keep_checkpoint_every_n_hours,
       num_gpus=FLAGS.worker_gpu,
       gpu_order=FLAGS.gpu_order,
-      shard_to_cpu=FLAGS.locally_shard_to_cpu,
       num_async_replicas=FLAGS.worker_replicas,
       gpu_mem_fraction=FLAGS.worker_gpu_memory_fraction,
       enable_graph_rewriter=FLAGS.enable_graph_rewriter,
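
The only behavioral change here is the default of optionally_use_dist_strat flipping from False to True, now that the TODO about the DistributionStrategy performance gap is resolved. A minimal sketch (assuming absl.flags, which the tf.flags module used by t2t_trainer.py wraps; not code from this commit) of how the new default interacts with command-line overrides:

from absl import app, flags

FLAGS = flags.FLAGS

# Same shape of definition as t2t_trainer.py after this commit: the second
# argument is the default, so DistributionStrategy is now opted into unless
# the user overrides it.
flags.DEFINE_bool(
    "optionally_use_dist_strat", True,
    "Whether to use TensorFlow DistributionStrategy instead of explicitly "
    "replicating the model.")

def main(_):
  # Passing --optionally_use_dist_strat=False on the command line restores
  # the pre-commit behavior of explicit model replication.
  print("optionally_use_dist_strat =", FLAGS.optionally_use_dist_strat)

if __name__ == "__main__":
  app.run(main)

Users who relied on the old default can keep the previous behavior by passing --optionally_use_dist_strat=False to the trainer.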

tensor2tensor/utils/devices.py

Lines changed: 1 addition & 2 deletions
@@ -70,7 +70,6 @@ def data_parallelism(daisy_chain_variables=True,
                      worker_replicas=1,
                      worker_id=0,
                      gpu_order="",
-                     locally_shard_to_cpu=False,
                      worker_job="/job:localhost",
                      no_data_parallelism=False):
   """See data_parallelism_from_flags."""

@@ -141,7 +140,7 @@ def _replica_device_setter(worker_device):
         "Schedule=%s. Assuming that training is running on a single machine.",
         schedule)
     datashard_devices = ["gpu:%d" % d for d in _gpu_order(worker_gpu)]
-    if locally_shard_to_cpu or worker_gpu < 1:
+    if worker_gpu < 1:
       datashard_devices += ["cpu:0"]
     caching_devices = None
   elif sync and ps_replicas > 0:
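
With locally_shard_to_cpu gone, data_parallelism() adds cpu:0 to the shard devices only when no GPUs are requested. A simplified standalone sketch of that selection logic (hypothetical helper, not part of devices.py; the real code orders GPUs via _gpu_order and the gpu_order flag rather than a plain range):

def pick_datashard_devices(worker_gpu):
  # GPUs first; simplified to sequential order instead of _gpu_order().
  devices = ["gpu:%d" % d for d in range(worker_gpu)]
  if worker_gpu < 1:
    # CPU is now only a fallback for GPU-less runs, no longer a debugging
    # option toggled by locally_shard_to_cpu.
    devices.append("cpu:0")
  return devices

assert pick_datashard_devices(0) == ["cpu:0"]
assert pick_datashard_devices(2) == ["gpu:0", "gpu:1"]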

tensor2tensor/utils/flags.py

Lines changed: 0 additions & 3 deletions
@@ -92,9 +92,6 @@
 flags.DEFINE_integer("eval_throttle_seconds", 600,
                      "Do not re-evaluate unless the last evaluation was started"
                      " at least this many seconds ago.")
-flags.DEFINE_bool("locally_shard_to_cpu", False,
-                  "Use CPU as a sharding device running locally. This allows "
-                  "to test sharded model construction on a machine with 1 GPU.")
 flags.DEFINE_bool("sync", False, "Sync compute on PS.")
 flags.DEFINE_string("worker_job", "/job:localhost", "name of worker job")
 flags.DEFINE_integer("worker_gpu", 1, "How many GPUs to use.")

tensor2tensor/utils/trainer_lib.py

Lines changed: 1 addition & 3 deletions
@@ -145,7 +145,6 @@ def create_run_config(model_name,
                       keep_checkpoint_every_n_hours=10000,
                       num_gpus=1,
                       gpu_order="",
-                      shard_to_cpu=False,
                       num_async_replicas=1,
                       enable_graph_rewriter=False,
                       gpu_mem_fraction=0.95,

@@ -239,7 +238,7 @@ def create_run_config(model_name,
       optionally_use_dist_strat and
       t2t_model.T2TModel.has_symmetric_shards(model_name) and
       not no_data_parallelism and ps_replicas == 0 and ps_gpu == 0 and
-      num_async_replicas == 1 and not shard_to_cpu)
+      num_async_replicas == 1)

   if use_distribution_strategy:
     tf.logging.info(

@@ -262,7 +261,6 @@ def create_run_config(model_name,
         worker_replicas=num_async_replicas,
         worker_id=worker_id,
         gpu_order=gpu_order,
-        locally_shard_to_cpu=shard_to_cpu,
         worker_job=worker_job,
         no_data_parallelism=no_data_parallelism)

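
The shard_to_cpu term is dropped from the condition that decides between DistributionStrategy and explicit replication. A simplified sketch of that decision (hypothetical standalone function mirroring the boolean in create_run_config after this commit, not the actual implementation):

def should_use_distribution_strategy(optionally_use_dist_strat,
                                     has_symmetric_shards,
                                     no_data_parallelism,
                                     ps_replicas, ps_gpu,
                                     num_async_replicas):
  # DistributionStrategy is used only for single-machine, non-async runs of
  # models whose replication configuration it supports; the shard_to_cpu
  # escape hatch no longer factors in.
  return (optionally_use_dist_strat and
          has_symmetric_shards and
          not no_data_parallelism and
          ps_replicas == 0 and ps_gpu == 0 and
          num_async_replicas == 1)

# With the flag's new default (True), a plain single-worker run of a
# symmetric-shard model now picks DistributionStrategy automatically.
assert should_use_distribution_strategy(True, True, False, 0, 0, 1)
assert not should_use_distribution_strategy(True, True, False, 2, 0, 1)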
