Commit c99d752

Add support for multiple GPUs (google#42)

1 parent a414d1a · commit c99d752

10 files changed, +238 -126 lines

README.md (+40)

````diff
@@ -21,12 +21,14 @@ or on your own machine. This README provides instructions for both.
 * [Accessing Files on Google Cloud](#accessing-files-on-google-cloud)
 * [Using Frame-Level Features](#using-frame-level-features)
 * [Using Audio Features](#using-audio-features)
+* [Using Larger Machine Types](#using-larger-machine-types)
 * [Running on Your Own Machine](#running-on-your-own-machine)
 * [Requirements](#requirements-1)
 * [Training on Video-Level Features](#training-on-video-level-features-1)
 * [Evaluation and Inference](#evaluation-and-inference-1)
 * [Using Frame-Level Features](#using-frame-level-features-1)
 * [Using Audio Features](#using-audio-features-1)
+* [Using GPUs](#using-gpus)
 * [Ground-Truth Label Files](#ground-truth-label-files)
 * [Overview of Models](#overview-of-models)
 * [Video-Level Models](#video-level-models)
@@ -317,6 +319,14 @@ Similarly, to use audio-visual Frame-Level features use:
 lists provided to the two flags above match. Also, the order must match when
 running training, evaluation, or inference.
 
+### Using Larger Machine Types
+
+Some complex frame-level models can take as long as a week to converge when
+using only one GPU. You can train these models more quickly by using more
+powerful machine types which have additional GPUs. To use a configuration with
+4 GPUs, replace the argument to `--config` with `youtube-8m/cloudml-4gpu.yaml`.
+Be careful with this argument, as it will also increase the rate you are
+charged by a factor of 4.
 
 ## Running on Your Own Machine
 
@@ -425,6 +435,36 @@ logistic model trained over the video-level features. Please look at the
 
 See [Using Audio Features](#using-audio-features) section above.
 
+### Using GPUs
+
+If your TensorFlow installation has GPU support, this code will make use of
+all of your compatible GPUs. You can verify your installation by running
+
+```
+python -c 'import tensorflow as tf; tf.Session()'
+```
+
+This will print out something like the following for each of your compatible
+GPUs.
+
+```
+I tensorflow/core/common_runtime/gpu/gpu_init.cc:102] Found device 0 with properties:
+name: Tesla M40
+major: 5 minor: 2 memoryClockRate (GHz) 1.112
+pciBusID 0000:04:00.0
+Total memory: 11.25GiB
+Free memory: 11.09GiB
+...
+```
+
+If at least one GPU was found, the forward and backward passes will be
+computed on the GPUs, whereas the CPU will be used primarily for the input
+and output pipelines. If you have multiple GPUs, each of them will be given
+a full batch of examples, and the resulting gradients will be summed
+together before being applied. This will increase your effective batch
+size. For example, if you set `batch_size=128` and you have 4 GPUs, 512
+examples will be evaluated every training step.
+
 ### Ground-Truth Label Files
 
 We also provide CSV files containing the ground-truth label information of the
````
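The gradient-summing behavior described in the new "Using GPUs" section is the standard TensorFlow 1.x multi-tower pattern. The sketch below is illustrative only: `build_model` and the shapes are hypothetical stand-ins, and the commit's actual multi-GPU loop lives in train.py, which is not shown on this page. Note the `tf.variable_scope("tower")`, matching the scope introduced in eval.py and export_model.py below.

```python
import tensorflow as tf

NUM_GPUS = 4
BATCH_SIZE = 128  # per-GPU batch; effective batch size is NUM_GPUS * 128 = 512

def build_model(inputs, labels):
  """Hypothetical stand-in for a model's create_model(); returns a scalar loss."""
  logits = tf.layers.dense(inputs, 10)
  return tf.reduce_mean(
      tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits))

inputs = tf.placeholder(tf.float32, [NUM_GPUS * BATCH_SIZE, 1024])
labels = tf.placeholder(tf.float32, [NUM_GPUS * BATCH_SIZE, 10])

# Each tower receives a full batch of BATCH_SIZE examples.
input_shards = tf.split(inputs, NUM_GPUS)
label_shards = tf.split(labels, NUM_GPUS)

optimizer = tf.train.GradientDescentOptimizer(0.01)
tower_gradients = []
for i in range(NUM_GPUS):
  with tf.device("/gpu:%d" % i):
    # One shared set of variables across all towers; reuse after the first.
    with tf.variable_scope("tower", reuse=(i > 0)):
      loss = build_model(input_shards[i], label_shards[i])
      tower_gradients.append(optimizer.compute_gradients(loss))

# Sum the per-tower gradients variable-by-variable, then apply them once.
summed_grads = []
for grads_and_vars in zip(*tower_gradients):
  grads = [g for g, _ in grads_and_vars if g is not None]
  if grads:
    summed_grads.append((tf.add_n(grads), grads_and_vars[0][1]))
train_op = optimizer.apply_gradients(summed_grads)
```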

cloudml-4gpu.yaml (new file, +4)

```diff
@@ -0,0 +1,4 @@
+trainingInput:
+  scaleTier: CUSTOM
+  masterType: complex_model_m_gpu
+  runtimeVersion: "1.0"
```
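For reference, a hedged example of submitting a training job against this new config. The command shape and flags mirror the single-GPU submit command used elsewhere in the README; `$BUCKET_NAME`, the region, and the data pattern are placeholders you would set yourself.

```sh
# Illustrative only: the README's usual submit command, with --config
# pointed at the new 4-GPU file. Bucket, region, and paths are placeholders.
JOB_NAME=yt8m_train_$(date +%Y%m%d_%H%M%S)
gcloud ml-engine jobs submit training $JOB_NAME \
  --package-path=youtube-8m --module-name=youtube-8m.train \
  --staging-bucket=$BUCKET_NAME --region=us-east1 \
  --config=youtube-8m/cloudml-4gpu.yaml \
  -- --train_data_pattern='gs://youtube8m-ml-us-east1/1/frame_level/train/train*.tfrecord' \
  --frame_features=True --model=LstmModel --feature_names="rgb" \
  --feature_sizes="1024" --batch_size=128 \
  --train_dir=$BUCKET_NAME/yt8m_train_frame_level_lstm_model
```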

cloudml-gpu.yaml (-1)

```diff
@@ -1,5 +1,4 @@
 trainingInput:
   scaleTier: CUSTOM
-  # standard_gpu provides 1 GPU. Change to complex_model_m_gpu for 4 GPUs
   masterType: standard_gpu
   runtimeVersion: "1.0"
```

eval.py (+1 -1)

```diff
@@ -145,7 +145,7 @@ def build_graph(reader,
   # Normalize input features.
   model_input = tf.nn.l2_normalize(model_input_raw, feature_dim)
 
-  with tf.name_scope("model"):
+  with tf.variable_scope("tower"):
     result = model.create_model(model_input,
                                 num_frames=num_frames,
                                 vocab_size=reader.num_classes,
```

export_model.py (+6 -10)

(Several hunks below only strip trailing whitespace, so the removed and added lines look identical.)

```diff
@@ -49,7 +49,7 @@ def export_model(self, model_dir, global_step_val, last_checkpoint):
         outputs=self.outputs,
         method_name=signature_constants.PREDICT_METHOD_NAME)
 
-    signature_map = {signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
+    signature_map = {signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
         signature}
 
     model_builder = saved_model_builder.SavedModelBuilder(model_dir)
@@ -60,24 +60,21 @@ def export_model(self, model_dir, global_step_val, last_checkpoint):
     model_builder.save()
 
   def build_inputs_and_outputs(self):
-
     if self.frame_features:
-
       serialized_examples = tf.placeholder(tf.string, shape=(None,))
 
       fn = lambda x: self.build_prediction_graph(x)
       video_id_output, top_indices_output, top_predictions_output = (
-          tf.map_fn(fn, serialized_examples,
+          tf.map_fn(fn, serialized_examples,
                     dtype=(tf.string, tf.int32, tf.float32)))
 
     else:
-
       serialized_examples = tf.placeholder(tf.string, shape=(None,))
 
       video_id_output, top_indices_output, top_predictions_output = (
           self.build_prediction_graph(serialized_examples))
 
-    inputs = {"example_bytes":
+    inputs = {"example_bytes":
               saved_model_utils.build_tensor_info(serialized_examples)}
 
     outputs = {
@@ -87,15 +84,14 @@ def build_inputs_and_outputs(self):
 
     return inputs, outputs
 
-  def build_prediction_graph(self, serialized_examples):
-
+  def build_prediction_graph(self, serialized_examples):
     video_id, model_input_raw, labels_batch, num_frames = (
         self.reader.prepare_serialized_examples(serialized_examples))
 
     feature_dim = len(model_input_raw.get_shape()) - 1
     model_input = tf.nn.l2_normalize(model_input_raw, feature_dim)
 
-    with tf.name_scope("model"):
+    with tf.variable_scope("tower"):
       result = self.model.create_model(
           model_input,
           num_frames=num_frames,
@@ -108,6 +104,6 @@ def build_prediction_graph(self, serialized_examples):
 
     predictions = result["predictions"]
 
-    top_predictions, top_indices = tf.nn.top_k(predictions,
+    top_predictions, top_indices = tf.nn.top_k(predictions,
         _TOP_PREDICTIONS_IN_OUTPUT)
     return video_id, top_indices, top_predictions
```
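The switch from `tf.name_scope("model")` to `tf.variable_scope("tower")` in both eval.py and export_model.py matters because `tf.name_scope` prefixes op names but is ignored by `tf.get_variable`, whereas `tf.variable_scope` prefixes variable names too. Presumably the multi-GPU training towers now create their variables under a "tower" scope, so the eval and export graphs must build variables under the same prefix for checkpoint restores to line up. A minimal demonstration of the general TF 1.x behavior (not code from this repo):

```python
import tensorflow as tf

with tf.name_scope("model"):
  a = tf.get_variable("weights_a", shape=[])
with tf.variable_scope("tower"):
  b = tf.get_variable("weights_b", shape=[])

# name_scope is ignored by get_variable; variable_scope is not.
print(a.name)  # -> weights_a:0
print(b.name)  # -> tower/weights_b:0
```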

frame_level_models.py (+5 -5)

```diff
@@ -214,7 +214,6 @@ def create_model(self, model_input, vocab_size, num_frames, **unused_params):
     lstm_size = FLAGS.lstm_cells
     number_of_layers = FLAGS.lstm_layers
 
-    ## Batch normalize the input
     stacked_lstm = tf.contrib.rnn.MultiRNNCell(
         [
             tf.contrib.rnn.BasicLSTMCell(
@@ -223,13 +222,14 @@ def create_model(self, model_input, vocab_size, num_frames, **unused_params):
         ], state_is_tuple=False)
 
     loss = 0.0
-    with tf.variable_scope("RNN"):
-      outputs, state = tf.nn.dynamic_rnn(stacked_lstm, model_input,
-                                         sequence_length=num_frames,
-                                         dtype=tf.float32)
+
+    outputs, state = tf.nn.dynamic_rnn(stacked_lstm, model_input,
+                                       sequence_length=num_frames,
+                                       dtype=tf.float32)
 
     aggregated_model = getattr(video_level_models,
                                FLAGS.video_level_classifier_model)
+
     return aggregated_model().create_model(
         model_input=state,
         vocab_size=vocab_size,
```

inference.py (+2 -2)

```diff
@@ -106,12 +106,12 @@ def get_input_data_tensors(reader, data_pattern, batch_size, num_readers=1):
   video_id_batch, video_batch, unused_labels, num_frames_batch = (
       tf.train.batch_join(examples_and_labels,
                           batch_size=batch_size,
-                          allow_smaller_final_batch = True,
+                          allow_smaller_final_batch=True,
                           enqueue_many=True))
   return video_id_batch, video_batch, num_frames_batch
 
 def inference(reader, train_dir, data_pattern, out_file_location, batch_size, top_k):
-  with tf.Session() as sess, gfile.Open(out_file_location, "w+") as out_file:
+  with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess, gfile.Open(out_file_location, "w+") as out_file:
     video_id_batch, video_batch, num_frames_batch = get_input_data_tensors(reader, data_pattern, batch_size)
     latest_checkpoint = tf.train.latest_checkpoint(train_dir)
     if latest_checkpoint is None:
```
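The `allow_soft_placement=True` added to the inference session lets a graph containing explicit GPU device assignments still run on machines with fewer (or zero) GPUs: TensorFlow falls back to a supported device instead of raising an error. A small generic illustration of the option (not code from this repo):

```python
import tensorflow as tf

with tf.device("/gpu:1"):  # this device may not exist on the current machine
  x = tf.constant(1.0) + 2.0

# Without allow_soft_placement, running this graph on a machine without
# /gpu:1 fails; with it, the op falls back to an available device.
config = tf.ConfigProto(allow_soft_placement=True)
with tf.Session(config=config) as sess:
  print(sess.run(x))
```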

readers.py (-1)

```diff
@@ -121,7 +121,6 @@ def prepare_serialized_examples(self, serialized_examples):
         [self.feature_sizes[feature_index]], tf.float32)
 
     features = tf.parse_example(serialized_examples, features=feature_map)
-
     labels = tf.sparse_to_indicator(features["labels"], self.num_classes)
     labels.set_shape([None, self.num_classes])
     concatenated_features = tf.concat([
```
