Commit c99d752

Add support for multiple GPUs (google#42)

1 parent a414d1a · commit c99d752

10 files changed, +238 -126 lines

README.md (+40)

````diff
@@ -21,12 +21,14 @@ or on your own machine. This README provides instructions for both.
 * [Accessing Files on Google Cloud](#accessing-files-on-google-cloud)
 * [Using Frame-Level Features](#using-frame-level-features)
 * [Using Audio Features](#using-audio-features)
+* [Using Larger Machine Types](#using-larger-machine-types)
 * [Running on Your Own Machine](#running-on-your-own-machine)
 * [Requirements](#requirements-1)
 * [Training on Video-Level Features](#training-on-video-level-features-1)
 * [Evaluation and Inference](#evaluation-and-inference-1)
 * [Using Frame-Level Features](#using-frame-level-features-1)
 * [Using Audio Features](#using-audio-features-1)
+* [Using GPUs](#using-gpus)
 * [Ground-Truth Label Files](#ground-truth-label-files)
 * [Overview of Models](#overview-of-models)
 * [Video-Level Models](#video-level-models)
@@ -317,6 +319,14 @@ Similarly, to use audio-visual Frame-Level features use:
 lists provided to the two flags above match. Also, the order must match when
 running training, evaluation, or inference.
 
+### Using Larger Machine Types
+
+Some complex frame-level models can take as long as a week to converge when
+using only one GPU. You can train these models more quickly by using more
+powerful machine types which have additional GPUs. To use a configuration with
+4 GPUs, replace the argument to `--config` with `youtube-8m/cloudml-4gpu.yaml`.
+Be careful with this argument, as it will also increase the rate you are
+charged by a factor of 4.
 
 ## Running on Your Own Machine
 
@@ -425,6 +435,36 @@ logistic model trained over the video-level features. Please look at the
 
 See [Using Audio Features](#using-audio-features) section above.
 
+### Using GPUs
+
+If your TensorFlow installation has GPU support, this code will make use of
+all of your compatible GPUs. You can verify your installation by running
+
+```
+python -c 'import tensorflow as tf; tf.Session()'
+```
+
+This will print out something like the following for each of your compatible
+GPUs.
+
+```
+I tensorflow/core/common_runtime/gpu/gpu_init.cc:102] Found device 0 with properties:
+name: Tesla M40
+major: 5 minor: 2 memoryClockRate (GHz) 1.112
+pciBusID 0000:04:00.0
+Total memory: 11.25GiB
+Free memory: 11.09GiB
+...
+```
+
+If at least one GPU was found, the forward and backward passes will be
+computed on the GPUs, whereas the CPU will be used primarily for the input
+and output pipelines. If you have multiple GPUs, each of them will be given
+a full batch of examples, and the resulting gradients will be summed
+together before being applied. This will increase your effective batch
+size. For example, if you set `batch_size=128` and you have 4 GPUs, 512
+examples will be evaluated every training step.
+
 ### Ground-Truth Label Files
 
 We also provide CSV files containing the ground-truth label information of the
````
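The gradient-summing behavior described in the new "Using GPUs" section is the standard TensorFlow 1.x multi-tower pattern. The sketch below is illustrative only: `build_model` and the shapes are hypothetical stand-ins, and the commit's actual multi-GPU loop lives in train.py, which is not shown on this page. Note the `tf.variable_scope("tower")`, matching the scope introduced in eval.py and export_model.py below.

```python
import tensorflow as tf

NUM_GPUS = 4
BATCH_SIZE = 128  # per-GPU batch; effective batch size is NUM_GPUS * 128 = 512

def build_model(inputs, labels):
  """Hypothetical stand-in for a model's create_model(); returns a scalar loss."""
  logits = tf.layers.dense(inputs, 10)
  return tf.reduce_mean(
      tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits))

inputs = tf.placeholder(tf.float32, [NUM_GPUS * BATCH_SIZE, 1024])
labels = tf.placeholder(tf.float32, [NUM_GPUS * BATCH_SIZE, 10])

# Each tower receives a full batch of BATCH_SIZE examples.
input_shards = tf.split(inputs, NUM_GPUS)
label_shards = tf.split(labels, NUM_GPUS)

optimizer = tf.train.GradientDescentOptimizer(0.01)
tower_gradients = []
for i in range(NUM_GPUS):
  with tf.device("/gpu:%d" % i):
    # One shared set of variables across all towers; reuse after the first.
    with tf.variable_scope("tower", reuse=(i > 0)):
      loss = build_model(input_shards[i], label_shards[i])
      tower_gradients.append(optimizer.compute_gradients(loss))

# Sum the per-tower gradients variable-by-variable, then apply them once.
summed_grads = []
for grads_and_vars in zip(*tower_gradients):
  grads = [g for g, _ in grads_and_vars if g is not None]
  if grads:
    summed_grads.append((tf.add_n(grads), grads_and_vars[0][1]))
train_op = optimizer.apply_gradients(summed_grads)
```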

cloudml-4gpu.yaml (new file, +4)

```diff
@@ -0,0 +1,4 @@
+trainingInput:
+  scaleTier: CUSTOM
+  masterType: complex_model_m_gpu
+  runtimeVersion: "1.0"
```
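For reference, a hedged example of submitting a training job against this new config. The command shape and flags mirror the single-GPU submit command used elsewhere in the README; `$BUCKET_NAME`, the region, and the data pattern are placeholders you would set yourself.

```sh
# Illustrative only: the README's usual submit command, with --config
# pointed at the new 4-GPU file. Bucket, region, and paths are placeholders.
JOB_NAME=yt8m_train_$(date +%Y%m%d_%H%M%S)
gcloud ml-engine jobs submit training $JOB_NAME \
  --package-path=youtube-8m --module-name=youtube-8m.train \
  --staging-bucket=$BUCKET_NAME --region=us-east1 \
  --config=youtube-8m/cloudml-4gpu.yaml \
  -- --train_data_pattern='gs://youtube8m-ml-us-east1/1/frame_level/train/train*.tfrecord' \
  --frame_features=True --model=LstmModel --feature_names="rgb" \
  --feature_sizes="1024" --batch_size=128 \
  --train_dir=$BUCKET_NAME/yt8m_train_frame_level_lstm_model
```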

cloudml-gpu.yaml (-1)

```diff
@@ -1,5 +1,4 @@
 trainingInput:
   scaleTier: CUSTOM
-  # standard_gpu provides 1 GPU. Change to complex_model_m_gpu for 4 GPUs
   masterType: standard_gpu
   runtimeVersion: "1.0"
```

eval.py (+1 -1)

```diff
@@ -145,7 +145,7 @@ def build_graph(reader,
   # Normalize input features.
   model_input = tf.nn.l2_normalize(model_input_raw, feature_dim)
 
-  with tf.name_scope("model"):
+  with tf.variable_scope("tower"):
     result = model.create_model(model_input,
                                 num_frames=num_frames,
                                 vocab_size=reader.num_classes,
```

export_model.py (+6 -10)

(Several hunks below only strip trailing whitespace, so the removed and added lines look identical.)

```diff
@@ -49,7 +49,7 @@ def export_model(self, model_dir, global_step_val, last_checkpoint):
         outputs=self.outputs,
         method_name=signature_constants.PREDICT_METHOD_NAME)
 
-    signature_map = {signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
+    signature_map = {signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
         signature}
 
     model_builder = saved_model_builder.SavedModelBuilder(model_dir)
@@ -60,24 +60,21 @@ def export_model(self, model_dir, global_step_val, last_checkpoint):
     model_builder.save()
 
   def build_inputs_and_outputs(self):
-
     if self.frame_features:
-
       serialized_examples = tf.placeholder(tf.string, shape=(None,))
 
       fn = lambda x: self.build_prediction_graph(x)
       video_id_output, top_indices_output, top_predictions_output = (
-          tf.map_fn(fn, serialized_examples,
+          tf.map_fn(fn, serialized_examples,
                     dtype=(tf.string, tf.int32, tf.float32)))
 
     else:
-
       serialized_examples = tf.placeholder(tf.string, shape=(None,))
 
       video_id_output, top_indices_output, top_predictions_output = (
           self.build_prediction_graph(serialized_examples))
 
-    inputs = {"example_bytes":
+    inputs = {"example_bytes":
               saved_model_utils.build_tensor_info(serialized_examples)}
 
     outputs = {
@@ -87,15 +84,14 @@ def build_inputs_and_outputs(self):
 
     return inputs, outputs
 
-  def build_prediction_graph(self, serialized_examples):
-
+  def build_prediction_graph(self, serialized_examples):
     video_id, model_input_raw, labels_batch, num_frames = (
         self.reader.prepare_serialized_examples(serialized_examples))
 
     feature_dim = len(model_input_raw.get_shape()) - 1
     model_input = tf.nn.l2_normalize(model_input_raw, feature_dim)
 
-    with tf.name_scope("model"):
+    with tf.variable_scope("tower"):
       result = self.model.create_model(
           model_input,
           num_frames=num_frames,
@@ -108,6 +104,6 @@ def build_prediction_graph(self, serialized_examples):
 
     predictions = result["predictions"]
 
-    top_predictions, top_indices = tf.nn.top_k(predictions,
+    top_predictions, top_indices = tf.nn.top_k(predictions,
         _TOP_PREDICTIONS_IN_OUTPUT)
     return video_id, top_indices, top_predictions
```
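The switch from `tf.name_scope("model")` to `tf.variable_scope("tower")` in both eval.py and export_model.py matters because `tf.name_scope` prefixes op names but is ignored by `tf.get_variable`, whereas `tf.variable_scope` prefixes variable names too. Presumably the multi-GPU training towers now create their variables under a "tower" scope, so the eval and export graphs must build variables under the same prefix for checkpoint restores to line up. A minimal demonstration of the general TF 1.x behavior (not code from this repo):

```python
import tensorflow as tf

with tf.name_scope("model"):
  a = tf.get_variable("weights_a", shape=[])
with tf.variable_scope("tower"):
  b = tf.get_variable("weights_b", shape=[])

# name_scope is ignored by get_variable; variable_scope is not.
print(a.name)  # -> weights_a:0
print(b.name)  # -> tower/weights_b:0
```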

frame_level_models.py (+5 -5)

```diff
@@ -214,7 +214,6 @@ def create_model(self, model_input, vocab_size, num_frames, **unused_params):
     lstm_size = FLAGS.lstm_cells
     number_of_layers = FLAGS.lstm_layers
 
-    ## Batch normalize the input
     stacked_lstm = tf.contrib.rnn.MultiRNNCell(
         [
             tf.contrib.rnn.BasicLSTMCell(
@@ -223,13 +222,14 @@ def create_model(self, model_input, vocab_size, num_frames, **unused_params):
         ], state_is_tuple=False)
 
     loss = 0.0
-    with tf.variable_scope("RNN"):
-      outputs, state = tf.nn.dynamic_rnn(stacked_lstm, model_input,
-                                         sequence_length=num_frames,
-                                         dtype=tf.float32)
+
+    outputs, state = tf.nn.dynamic_rnn(stacked_lstm, model_input,
+                                       sequence_length=num_frames,
+                                       dtype=tf.float32)
 
     aggregated_model = getattr(video_level_models,
                                FLAGS.video_level_classifier_model)
+
     return aggregated_model().create_model(
         model_input=state,
         vocab_size=vocab_size,
```

inference.py (+2 -2)

```diff
@@ -106,12 +106,12 @@ def get_input_data_tensors(reader, data_pattern, batch_size, num_readers=1):
   video_id_batch, video_batch, unused_labels, num_frames_batch = (
       tf.train.batch_join(examples_and_labels,
                           batch_size=batch_size,
-                          allow_smaller_final_batch = True,
+                          allow_smaller_final_batch=True,
                           enqueue_many=True))
   return video_id_batch, video_batch, num_frames_batch
 
 def inference(reader, train_dir, data_pattern, out_file_location, batch_size, top_k):
-  with tf.Session() as sess, gfile.Open(out_file_location, "w+") as out_file:
+  with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess, gfile.Open(out_file_location, "w+") as out_file:
     video_id_batch, video_batch, num_frames_batch = get_input_data_tensors(reader, data_pattern, batch_size)
     latest_checkpoint = tf.train.latest_checkpoint(train_dir)
     if latest_checkpoint is None:
```
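The `allow_soft_placement=True` added to the inference session lets a graph containing explicit GPU device assignments still run on machines with fewer (or zero) GPUs: TensorFlow falls back to a supported device instead of raising an error. A small generic illustration of the option (not code from this repo):

```python
import tensorflow as tf

with tf.device("/gpu:1"):  # this device may not exist on the current machine
  x = tf.constant(1.0) + 2.0

# Without allow_soft_placement, running this graph on a machine without
# /gpu:1 fails; with it, the op falls back to an available device.
config = tf.ConfigProto(allow_soft_placement=True)
with tf.Session(config=config) as sess:
  print(sess.run(x))
```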

readers.py (-1)

```diff
@@ -121,7 +121,6 @@ def prepare_serialized_examples(self, serialized_examples):
         [self.feature_sizes[feature_index]], tf.float32)
 
     features = tf.parse_example(serialized_examples, features=feature_map)
-
     labels = tf.sparse_to_indicator(features["labels"], self.num_classes)
     labels.set_shape([None, self.num_classes])
     concatenated_features = tf.concat([
```
