|
| 1 | +# Copyright 2016 Google Inc. All Rights Reserved. |
| 2 | +# |
| 3 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +# you may not use this file except in compliance with the License. |
| 5 | +# You may obtain a copy of the License at |
| 6 | +# |
| 7 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +# |
| 9 | +# Unless required by applicable law or agreed to in writing, software |
| 10 | +# distributed under the License is distributed on an "AS-IS" BASIS, |
| 11 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +# See the License for the specific language governing permissions and |
| 13 | +# limitations under the License. |
| 14 | + |
| 15 | +"""Contains a collection of models which operate on variable-length sequences. |
| 16 | +""" |
| 17 | +import math |
| 18 | + |
| 19 | +import models |
| 20 | +import video_level_models |
| 21 | +import tensorflow as tf |
| 22 | +import model_utils as utils |
| 23 | + |
| 24 | +import tensorflow.contrib.slim as slim |
| 25 | +from tensorflow import flags |
| 26 | + |
# Command-line flags consumed by the frame-level models in this file.
# They act as defaults; DBoFModel.create_model also accepts explicit
# keyword overrides for most of these.
FLAGS = flags.FLAGS
flags.DEFINE_integer("iterations", 30,
                     "Number of frames per batch for DBoF.")
flags.DEFINE_bool("dbof_add_batch_norm", True,
                  "Adds batch normalization to the DBoF model.")
flags.DEFINE_bool(
    "sample_random_frames", True,
    "If true samples random frames (for frame level models). If false, a random"
    "sequence of frames is sampled instead.")
flags.DEFINE_integer("dbof_cluster_size", 8192,
                     "Number of units in the DBoF cluster layer.")
flags.DEFINE_integer("dbof_hidden_size", 1024,
                     "Number of units in the DBoF hidden layer.")
flags.DEFINE_string("dbof_pooling_method", "max",
                    "The pooling method used in the DBoF cluster layer. "
                    "Choices are 'average' and 'max'.")
# Name of a class in video_level_models; DBoF looks it up with getattr and
# delegates classification of the pooled features to it.
flags.DEFINE_string("video_level_classifier_model", "MoeModel",
                    "Some Frame-Level models can be decomposed into a "
                    "generalized pooling operation followed by a "
                    "classifier layer")
| 47 | + |
class FrameLevelLogisticModel(models.BaseModel):
  """Logistic classifier applied to mean-pooled frame-level features."""

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    """Builds a logistic model over the average of the frame features.

    This class is intended to be an example for implementors of frame level
    models. If you want to train a model over averaged features it is more
    efficient to average them beforehand rather than on the fly.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
                   input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
                  frames for each video (before padding).

    Returns:
      A dictionary with a tensor containing the probability predictions of
      the model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'.
    """
    frames_per_video = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    feature_size = model_input.get_shape().as_list()[2]

    # Divide each video's feature sum by its (pre-padding) frame count so
    # padded frames do not dilute the average.
    denominators = tf.reshape(
        tf.tile(frames_per_video, [1, feature_size]), [-1, feature_size])
    avg_pooled = tf.reduce_sum(model_input, axis=[1]) / denominators

    output = slim.fully_connected(
        avg_pooled,
        vocab_size,
        activation_fn=tf.nn.sigmoid,
        weights_regularizer=slim.l2_regularizer(0.01))
    return {"predictions": output}
| 82 | + |
class DBoFModel(models.BaseModel):
  """Creates a Deep Bag of Frames model.

  The model projects the features for each frame into a higher dimensional
  'clustering' space, pools across frames in that space, and then
  uses a configurable video-level model to classify the now aggregated features.

  The model will randomly sample either frames or sequences of frames during
  training to speed up convergence.

  Args:
    model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
                 input features.
    vocab_size: The number of classes in the dataset.
    num_frames: A vector of length 'batch' which indicates the number of
                frames for each video (before padding).

  Returns:
    A dictionary with a tensor containing the probability predictions of the
    model in the 'predictions' key. The dimensions of the tensor are
    'batch_size' x 'num_classes'.
  """

  def create_model(self,
                   model_input,
                   vocab_size,
                   num_frames,
                   iterations=None,
                   add_batch_norm=None,
                   sample_random_frames=None,
                   cluster_size=None,
                   hidden_size=None,
                   is_training=True,
                   **unused_params):
    """Builds the DBoF graph. See the class docstring for args/returns.

    Optional keyword arguments default to the corresponding FLAGS values
    when left as None.
    """
    # BUGFIX: fall back to FLAGS only when the caller did not supply a
    # value. The previous `x = x or FLAGS.y` pattern silently replaced an
    # explicit falsy argument (e.g. add_batch_norm=False or
    # sample_random_frames=False) with the flag value, making it impossible
    # to disable these options per call.
    if iterations is None:
      iterations = FLAGS.iterations
    if add_batch_norm is None:
      add_batch_norm = FLAGS.dbof_add_batch_norm
    if sample_random_frames is None:
      random_frames = FLAGS.sample_random_frames
    else:
      random_frames = sample_random_frames
    if cluster_size is None:
      cluster_size = FLAGS.dbof_cluster_size
    hidden1_size = hidden_size if hidden_size is not None else FLAGS.dbof_hidden_size

    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    # Subsample `iterations` frames per video, either independently at
    # random or as a random contiguous sequence.
    if random_frames:
      model_input = utils.SampleRandomFrames(model_input, num_frames,
                                             iterations)
    else:
      model_input = utils.SampleRandomSequence(model_input, num_frames,
                                               iterations)
    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    # Flatten frames into the batch dimension so the cluster projection is
    # applied per-frame: shape (batch * max_frames, feature_size).
    reshaped_input = tf.reshape(model_input, [-1, feature_size])
    tf.summary.histogram("input_hist", reshaped_input)

    if add_batch_norm:
      reshaped_input = slim.batch_norm(
          reshaped_input,
          center=True,
          scale=True,
          is_training=is_training,
          scope="input_bn")

    # Per-frame projection into the 'clustering' space. The 1/sqrt(fan_in)
    # stddev keeps initial activations roughly unit-scale.
    cluster_weights = tf.Variable(tf.random_normal(
        [feature_size, cluster_size],
        stddev=1 / math.sqrt(feature_size)))
    tf.summary.histogram("cluster_weights", cluster_weights)
    activation = tf.matmul(reshaped_input, cluster_weights)
    if add_batch_norm:
      activation = slim.batch_norm(
          activation,
          center=True,
          scale=True,
          is_training=is_training,
          scope="cluster_bn")
    else:
      # Biases are only needed without batch norm (BN's `center` already
      # provides a learned shift).
      cluster_biases = tf.Variable(
          tf.random_normal(
              [cluster_size], stddev=1 / math.sqrt(feature_size)))
      tf.summary.histogram("cluster_biases", cluster_biases)
      activation += cluster_biases
    activation = tf.nn.relu6(activation)
    tf.summary.histogram("cluster_output", activation)

    # Restore the frame axis and pool across frames ('max' or 'average').
    activation = tf.reshape(activation, [-1, max_frames, cluster_size])
    activation = utils.FramePooling(activation, FLAGS.dbof_pooling_method)

    hidden1_weights = tf.Variable(tf.random_normal(
        [cluster_size, hidden1_size],
        stddev=1 / math.sqrt(cluster_size)))
    tf.summary.histogram("hidden1_weights", hidden1_weights)
    activation = tf.matmul(activation, hidden1_weights)
    if add_batch_norm:
      activation = slim.batch_norm(
          activation,
          center=True,
          scale=True,
          is_training=is_training,
          scope="hidden1_bn")
    else:
      hidden1_biases = tf.Variable(
          tf.random_normal(
              [hidden1_size], stddev=0.01))
      tf.summary.histogram("hidden1_biases", hidden1_biases)
      activation += hidden1_biases
    activation = tf.nn.relu6(activation)
    tf.summary.histogram("hidden1_output", activation)

    # Delegate classification of the pooled representation to the
    # configured video-level model (e.g. MoeModel).
    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    return aggregated_model().create_model(
        model_input=activation,
        vocab_size=vocab_size,
        **unused_params)
0 commit comments