Add support for batch prediction. (google#37)

vicaire · LeegleechN · commit 3160dadadb4e · 2017-03-07T13:37:10.000-08:00
diff --git a/README.md b/README.md
@@ -17,6 +17,7 @@ or on your own machine. This README provides instructions for both.
    * [Testing Locally](#testing-locally)
    * [Training on the Cloud over Video-Level Features](#training-on-video-level-features)
    * [Evaluation and Inference](#evaluation-and-inference)
+   * [Inference Using Batch Prediction](#inference-using-batch-prediction)
    * [Accessing Files on Google Cloud](#accessing-files-on-google-cloud)
    * [Using Frame-Level Features](#using-frame-level-features)
    * [Using Audio Features](#using-audio-features)
@@ -187,6 +188,62 @@ and the following for the inference code:
 num examples processed: 8192 elapsed seconds: 14.85
 ```
 
+### Inference Using Batch Prediction
+To perform inference faster, you can also use the Cloud ML batch prediction
+service.
+
+First, find the directory where the training job exported the model: 
+
+```
+gsutil list ${BUCKET_NAME}/yt8m_train_video_level_logistic_model/export
+```
+
+You should see an output similar to this one: 
+
+```
+${BUCKET_NAME}/yt8m_train_video_level_logistic_model/export/
+${BUCKET_NAME}/yt8m_train_video_level_logistic_model/export/step_1/
+${BUCKET_NAME}/yt8m_train_video_level_logistic_model/export/step_1001/
+${BUCKET_NAME}/yt8m_train_video_level_logistic_model/export/step_2001/
+${BUCKET_NAME}/yt8m_train_video_level_logistic_model/export/step_3001/
+```
+
+Select the latest version of the model that was saved. For instance, in our
+case, we select the version of the model that was saved at step 3001:
+
+```
+EXPORTED_MODEL_DIR=${BUCKET_NAME}/yt8m_train_video_level_logistic_model/export/step_3001/
+```
+
+Start the batch prediction job using the following command:
+
+```
+JOB_NAME=yt8m_batch_predict_$(date +%Y%m%d_%H%M%S); \
+gcloud beta ml jobs submit prediction ${JOB_NAME} --verbosity=debug \
+--model-dir=${EXPORTED_MODEL_DIR} --data-format=TF_RECORD \
+--input-paths=gs://youtube8m-ml/1/video_level/test/test* \
+--output-path=${BUCKET_NAME}/batch_predict/${JOB_NAME} --region=us-east1 \
+--runtime-version=1.0 --max-worker-count=10
+```
+
+You can check the progress of the job on the
+[Google Cloud ML Jobs console](https://console.cloud.google.com/ml/jobs). To
+have the job complete faster, you can increase 'max-worker-count' to a
+higher value.
+
+Once the batch prediction job has completed, turn its output into a submission
+in the CSV format by running the following commands:
+
+```
+# Copy the output of the batch prediction job to a local directory
+mkdir -p /tmp/batch_predict/${JOB_NAME}
+gsutil -m cp -r ${BUCKET_NAME}/batch_predict/${JOB_NAME}/* /tmp/batch_predict/${JOB_NAME}/
+
+# Convert the output of the batch prediction job into a CSV file ready for submission
+python youtube-8m/convert_prediction_from_json_to_csv.py \
+--json_prediction_files_pattern="/tmp/batch_predict/${JOB_NAME}/prediction.results-*" \
+--csv_output_file="/tmp/batch_predict/${JOB_NAME}/output.csv"
+```
 
 ### Accessing Files on Google Cloud
 
@@ -428,6 +485,8 @@ This sample code contains implementations of the models given in the
                              level features as input.
 *   `model_util.py`: Contains functions that are of general utility for
                      implementing models.
+*   `export_model.py`: Provides a class to export a model during training
+                       for later use in batch prediction.
 *   `readers.py`: Contains definitions for the Video dataset and Frame
                   dataset readers.
 
@@ -446,6 +505,8 @@ This sample code contains implementations of the models given in the
 ### Misc
 *   `README.md`: This documentation.
 *   `utils.py`: Common functions.
+*   `convert_prediction_from_json_to_csv.py`: Converts the JSON output of
+        batch prediction into a CSV file for submission.
 
 ## About This Project
 This project is meant help people quickly get started working with the
diff --git a/convert_prediction_from_json_to_csv.py b/convert_prediction_from_json_to_csv.py
@@ -0,0 +1,103 @@
+# Copyright 2016 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS-IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Utility to convert the output of batch prediction into a CSV submission.
+
+It converts the JSON files created by the command
+'gcloud beta ml jobs submit prediction' into a CSV file ready for submission.
+"""
+
+import json
+import tensorflow as tf
+
+from builtins import range
+from tensorflow import app
+from tensorflow import flags
+from tensorflow import gfile
+from tensorflow import logging
+
+
+FLAGS = flags.FLAGS
+
+# The command-line flags are registered only when this file is executed as a
+# script, so merely importing the module does not define them.
+if __name__ == '__main__':
+
+  # Both flags default to None; main() raises a ValueError when either one is
+  # left unset.
+  flags.DEFINE_string(
+      "json_prediction_files_pattern", None,
+      "Pattern specifying the list of JSON files that the command "
+      "'gcloud beta ml jobs submit prediction' outputs. These files are "
+      "located in the output path of the prediction command and are prefixed "
+      "with 'prediction.results'.")
+  flags.DEFINE_string(
+      "csv_output_file", None,
+      "The file to save the predictions converted to the CSV format.")
+
+
+def get_csv_header():
+  """Returns the header line of the CSV submission file, newline included."""
+  header = "VideoId,LabelConfidencePairs\n"
+  return header
+
+def to_csv_row(json_data):
+  """Formats one batch prediction record as a line of the CSV submission.
+
+  Args:
+    json_data: dict decoded from one line of a prediction file. It must hold
+      the keys "video_id", "class_indexes" and "predictions". When "video_id"
+      is a list, all three values are treated as single-element batches and
+      their first element is used.
+
+  Returns:
+    A string of the form "<video_id>,<index> <score> <index> <score> ...\n".
+
+  Raises:
+    ValueError: if the number of class indexes and predictions differ.
+  """
+  video_id = json_data["video_id"]
+  class_indexes = json_data["class_indexes"]
+  predictions = json_data["predictions"]
+
+  if isinstance(video_id, list):
+    video_id = video_id[0]
+    class_indexes = class_indexes[0]
+    predictions = predictions[0]
+
+  if len(class_indexes) != len(predictions):
+    raise ValueError(
+        "The number of indexes (%s) and predictions (%s) must be equal."
+        % (len(class_indexes), len(predictions)))
+
+  # json.loads() yields text strings, and Python 3 text strings have no
+  # decode() method, so only decode when the id really is a byte string.
+  if isinstance(video_id, bytes):
+    video_id = video_id.decode("utf-8")
+
+  pairs = " ".join(
+      "%i %f" % (class_index, prediction)
+      for class_index, prediction in zip(class_indexes, predictions))
+  return video_id + "," + pairs + "\n"
+
+def main(unused_argv):
+  """Converts the batch prediction JSON files into one CSV submission file.
+
+  Every file matching --json_prediction_files_pattern is read line by line;
+  each line is parsed as one JSON record and written as a CSV row, after the
+  header, to --csv_output_file.
+
+  Args:
+    unused_argv: ignored.
+
+  Raises:
+    ValueError: if either of the two flags is unset or empty.
+  """
+  logging.set_verbosity(tf.logging.INFO)
+
+  pattern = FLAGS.json_prediction_files_pattern
+  if not pattern:
+    raise ValueError(
+        "The flag --json_prediction_files_pattern must be specified.")
+
+  csv_path = FLAGS.csv_output_file
+  if not csv_path:
+    raise ValueError("The flag --csv_output_file must be specified.")
+
+  logging.info("Looking for prediction files with pattern: %s", pattern)
+  matched_paths = gfile.Glob(pattern)
+  logging.info("Found files: %s", matched_paths)
+
+  logging.info("Writing submission file to: %s", csv_path)
+  with gfile.Open(csv_path, "w+") as csv_file:
+    csv_file.write(get_csv_header())
+
+    for json_path in matched_paths:
+      logging.info("processing file: %s", json_path)
+
+      # Each line of a prediction file holds one JSON record.
+      with gfile.Open(json_path) as json_file:
+        for record in json_file:
+          csv_file.write(to_csv_row(json.loads(record)))
+
+    csv_file.flush()
+  logging.info("done")
+
+# Script entry point: hand control to TensorFlow's app runner.
+# NOTE(review): app.run() is expected to parse the command-line flags and then
+# call main() -- confirm against the tf.app.run documentation.
+if __name__ == "__main__":
+  app.run()
diff --git a/export_model.py b/export_model.py
@@ -0,0 +1,111 @@
+# Copyright 2016 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS-IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Utilities to export a model for batch prediction."""
+
+import tensorflow as tf
+import tensorflow.contrib.slim as slim
+
+from tensorflow.python.saved_model import builder as saved_model_builder
+from tensorflow.python.saved_model import signature_constants
+from tensorflow.python.saved_model import signature_def_utils
+from tensorflow.python.saved_model import tag_constants
+from tensorflow.python.saved_model import utils as saved_model_utils
+
+_TOP_PREDICTIONS_IN_OUTPUT = 20
+
+class ModelExporter(object):
+  """Builds a prediction graph and exports it as a SavedModel.
+
+  The graph is fed serialized examples through a string placeholder and
+  outputs the video ids together with the indexes and values that
+  tf.nn.top_k selects (_TOP_PREDICTIONS_IN_OUTPUT of them) from the model
+  predictions.
+  """
+
+  def __init__(self, frame_features, model, reader):
+    """Stores the configuration and builds the export graph.
+
+    Args:
+      frame_features: if true, the prediction graph is applied to the
+        serialized examples one element at a time with tf.map_fn; otherwise
+        the whole batch is passed to it at once.
+      model: object whose create_model() builds the predictions.
+      reader: object providing prepare_serialized_examples() and num_classes.
+    """
+    self.frame_features = frame_features
+    self.model = model
+    self.reader = reader
+
+    # The export ops are created in a graph of their own.
+    with tf.Graph().as_default() as export_graph:
+      self.inputs, self.outputs = self.build_inputs_and_outputs()
+      self.graph = export_graph
+      self.saver = tf.train.Saver(tf.trainable_variables(), sharded=True)
+
+  def export_model(self, model_dir, global_step_val, last_checkpoint):
+    """Exports the model so that it can be used for batch prediction.
+
+    Args:
+      model_dir: directory handed to the SavedModelBuilder.
+      global_step_val: not used by this method.
+      last_checkpoint: checkpoint the variables are restored from.
+    """
+    with self.graph.as_default(), tf.Session() as session:
+      self.saver.restore(session, last_checkpoint)
+
+      predict_signature = signature_def_utils.build_signature_def(
+          inputs=self.inputs,
+          outputs=self.outputs,
+          method_name=signature_constants.PREDICT_METHOD_NAME)
+
+      builder = saved_model_builder.SavedModelBuilder(model_dir)
+      builder.add_meta_graph_and_variables(
+          session,
+          tags=[tag_constants.SERVING],
+          signature_def_map={
+              signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
+                  predict_signature},
+          clear_devices=True)
+      builder.save()
+
+  def build_inputs_and_outputs(self):
+    """Builds the prediction graph and describes its inputs and outputs.
+
+    Returns:
+      A pair (inputs, outputs) of dicts. `inputs` maps "example_bytes" to the
+      tensor info of the serialized examples placeholder; `outputs` maps
+      "video_id", "class_indexes" and "predictions" to the tensor infos of
+      the corresponding prediction tensors.
+    """
+    serialized_examples = tf.placeholder(tf.string, shape=(None,))
+
+    if self.frame_features:
+      # Apply the prediction graph to each element of the placeholder.
+      prediction_tensors = tf.map_fn(
+          self.build_prediction_graph, serialized_examples,
+          dtype=(tf.string, tf.int32, tf.float32))
+    else:
+      prediction_tensors = self.build_prediction_graph(serialized_examples)
+
+    video_id_output, top_indices_output, top_predictions_output = (
+        prediction_tensors)
+
+    inputs = {
+        "example_bytes":
+            saved_model_utils.build_tensor_info(serialized_examples)}
+
+    outputs = {
+        "video_id":
+            saved_model_utils.build_tensor_info(video_id_output),
+        "class_indexes":
+            saved_model_utils.build_tensor_info(top_indices_output),
+        "predictions":
+            saved_model_utils.build_tensor_info(top_predictions_output)}
+
+    return inputs, outputs
+
+  def build_prediction_graph(self, serialized_examples):
+    """Builds the ops that turn serialized examples into top predictions.
+
+    Args:
+      serialized_examples: tensor passed to the reader's
+        prepare_serialized_examples().
+
+    Returns:
+      A tuple (video_id, top_indices, top_predictions), where the last two
+      come from tf.nn.top_k applied to the model predictions.
+    """
+    video_id, raw_features, labels_batch, num_frames = (
+        self.reader.prepare_serialized_examples(serialized_examples))
+
+    # L2-normalize the features along their last dimension.
+    last_axis = len(raw_features.get_shape()) - 1
+    model_input = tf.nn.l2_normalize(raw_features, last_axis)
+
+    with tf.name_scope("model"):
+      model_output = self.model.create_model(
+          model_input,
+          num_frames=num_frames,
+          vocab_size=self.reader.num_classes,
+          labels=labels_batch)
+
+      for model_variable in slim.get_model_variables():
+        tf.summary.histogram(model_variable.op.name, model_variable)
+
+      top_predictions, top_indices = tf.nn.top_k(
+          model_output["predictions"], _TOP_PREDICTIONS_IN_OUTPUT)
+    return video_id, top_indices, top_predictions
diff --git a/readers.py b/readers.py
@@ -103,6 +103,10 @@ def prepare_reader(self, filename_queue, batch_size=1024):
     reader = tf.TFRecordReader()
     _, serialized_examples = reader.read_up_to(filename_queue, batch_size)
 
+    tf.add_to_collection("serialized_examples", serialized_examples)
+    return self.prepare_serialized_examples(serialized_examples)
+
+  def prepare_serialized_examples(self, serialized_examples):
     # set the mapping from the fields to data types in the proto
     num_features = len(self.feature_names)
     assert num_features > 0, "self.feature_names is empty!"
@@ -117,6 +121,7 @@ def prepare_reader(self, filename_queue, batch_size=1024):
           [self.feature_sizes[feature_index]], tf.float32)
 
     features = tf.parse_example(serialized_examples, features=feature_map)
+
     labels = tf.sparse_to_indicator(features["labels"], self.num_classes)
     labels.set_shape([None, self.num_classes])
     concatenated_features = tf.concat([
@@ -203,6 +208,12 @@ def prepare_reader(self,
     reader = tf.TFRecordReader()
     _, serialized_example = reader.read(filename_queue)
 
+    return self.prepare_serialized_examples(serialized_example,
+        max_quantized_value, min_quantized_value)
+
+  def prepare_serialized_examples(self, serialized_example,
+      max_quantized_value=2, min_quantized_value=-2):
+
     contexts, features = tf.parse_single_sequence_example(
         serialized_example,
         context_features={"video_id": tf.FixedLenFeature(
diff --git a/train.py b/train.py