6 changes: 1 addition & 5 deletions python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py
@@ -49,7 +49,6 @@ def __init__(self, observation_space, action_space, config):
[-1])
self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
tf.get_variable_scope().name)
is_training = tf.placeholder_with_default(True, ())

# Setup the policy loss
if isinstance(action_space, gym.spaces.Box):
@@ -74,16 +73,13 @@ def __init__(self, observation_space, action_space, config):
("advantages", advantages),
("value_targets", v_target),
]
for i, ph in enumerate(self.model.state_in):
loss_in.append(("state_in_{}".format(i), ph))
self.state_in = self.model.state_in
self.state_out = self.model.state_out
TFPolicyGraph.__init__(
self, observation_space, action_space, self.sess,
obs_input=self.observations, action_sampler=action_dist.sample(),
loss=self.loss.total_loss, loss_inputs=loss_in,
is_training=is_training, state_inputs=self.state_in,
state_outputs=self.state_out,
state_inputs=self.state_in, state_outputs=self.state_out,
seq_lens=self.model.seq_lens,
max_seq_len=self.config["model"]["max_seq_len"])

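The per-algorithm is_training placeholder and the manual state_in_{i} loss-input registration can be dropped here because TFPolicyGraph now sets both up itself (see the tf_policy_graph.py hunk further down). As a minimal standalone sketch of the placeholder_with_default pattern being centralized, using plain TensorFlow 1.x and made-up values:

import tensorflow as tf

# placeholder_with_default evaluates to its default unless explicitly fed,
# so a single placeholder owned by the base class can serve every algorithm.
is_training = tf.placeholder_with_default(True, shape=())

with tf.Session() as sess:
    print(sess.run(is_training))                                   # True (default)
    print(sess.run(is_training, feed_dict={is_training: False}))   # False when fed
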
2 changes: 2 additions & 0 deletions python/ray/rllib/agents/agent.py
@@ -46,6 +46,8 @@
"gpu_options": {
"allow_growth": True,
},
"log_device_placement": False,
"device_count": {"CPU": 1},
"allow_soft_placement": True, # required by PPO multi-gpu
},
# Whether to LZ4 compress observations
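Both new keys are standard tf.ConfigProto fields, alongside the existing allow_soft_placement and gpu_options entries. Below is a sketch of how a dict shaped like the config above could be mapped onto a session config; the make_session helper is hypothetical, not RLlib's code:

import tensorflow as tf

def make_session(tf_session_args):
    # Hypothetical helper: pop the nested gpu_options dict, then forward the
    # remaining keys straight to tf.ConfigProto, whose fields they mirror.
    args = dict(tf_session_args)
    gpu_options = tf.GPUOptions(**args.pop("gpu_options", {}))
    config = tf.ConfigProto(gpu_options=gpu_options, **args)
    return tf.Session(config=config)

sess = make_session({
    "gpu_options": {"allow_growth": True},
    "log_device_placement": False,
    "device_count": {"CPU": 1},
    "allow_soft_placement": True,
})
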
3 changes: 1 addition & 2 deletions python/ray/rllib/agents/ddpg/ddpg_policy_graph.py
@@ -262,12 +262,11 @@ def _build_action_network(p_values, stochastic, eps):
("dones", self.done_mask),
("weights", self.importance_weights),
]
self.is_training = tf.placeholder_with_default(True, ())
TFPolicyGraph.__init__(
self, observation_space, action_space, self.sess,
obs_input=self.cur_observations,
action_sampler=self.output_actions, loss=self.loss.total_loss,
loss_inputs=self.loss_inputs, is_training=self.is_training)
loss_inputs=self.loss_inputs)
self.sess.run(tf.global_variables_initializer())

# Note that this encompasses both the policy and Q-value networks and
3 changes: 1 addition & 2 deletions python/ray/rllib/agents/dqn/dqn_policy_graph.py
@@ -171,12 +171,11 @@ def _build_q_network(obs):
("dones", self.done_mask),
("weights", self.importance_weights),
]
self.is_training = tf.placeholder_with_default(True, ())
TFPolicyGraph.__init__(
self, observation_space, action_space, self.sess,
obs_input=self.cur_observations,
action_sampler=self.output_actions, loss=self.loss.loss,
loss_inputs=self.loss_inputs, is_training=self.is_training)
loss_inputs=self.loss_inputs)
self.sess.run(tf.global_variables_initializer())

def optimizer(self):
8 changes: 1 addition & 7 deletions python/ray/rllib/agents/pg/pg_policy_graph.py
@@ -41,16 +41,10 @@ def __init__(self, obs_space, action_space, config):
("advantages", advantages),
]

# LSTM support
for i, ph in enumerate(self.model.state_in):
loss_in.append(("state_in_{}".format(i), ph))

is_training = tf.placeholder_with_default(True, ())
TFPolicyGraph.__init__(
self, obs_space, action_space, sess, obs_input=obs,
action_sampler=action_dist.sample(), loss=loss,
loss_inputs=loss_in, is_training=is_training,
state_inputs=self.model.state_in,
loss_inputs=loss_in, state_inputs=self.model.state_in,
state_outputs=self.model.state_out,
seq_lens=self.model.seq_lens,
max_seq_len=config["model"]["max_seq_len"])
2 changes: 1 addition & 1 deletion python/ray/rllib/agents/ppo/ppo.py
@@ -50,7 +50,7 @@
"simple_optimizer": False,
# Override model config
"model": {
# Use LSTM model (note: requires simple optimizer for now).
# Whether to use LSTM model
"use_lstm": False,
# Max seq length for LSTM training.
"max_seq_len": 20,
41 changes: 19 additions & 22 deletions python/ray/rllib/agents/ppo/ppo_policy_graph.py
@@ -92,9 +92,10 @@ def __init__(self, observation_space, action_space,
dist_cls, logit_dim = ModelCatalog.get_action_dist(action_space)

if existing_inputs:
self.loss_in = existing_inputs
obs_ph, value_targets_ph, adv_ph, act_ph, \
logits_ph, vf_preds_ph = [ph for _, ph in existing_inputs]
logits_ph, vf_preds_ph = existing_inputs[:6]
existing_state_in = existing_inputs[6:-1]
existing_seq_lens = existing_inputs[-1]
else:
obs_ph = tf.placeholder(
tf.float32, name="obs", shape=(None,)+observation_space.shape)
@@ -107,23 +108,20 @@ def __init__(self, observation_space, action_space,
tf.float32, name="vf_preds", shape=(None,))
value_targets_ph = tf.placeholder(
tf.float32, name="value_targets", shape=(None,))

self.loss_in = [
("obs", obs_ph),
("value_targets", value_targets_ph),
("advantages", adv_ph),
("actions", act_ph),
("logits", logits_ph),
("vf_preds", vf_preds_ph),
]

existing_state_in = None
existing_seq_lens = None

self.loss_in = [
("obs", obs_ph),
("value_targets", value_targets_ph),
("advantages", adv_ph),
("actions", act_ph),
("logits", logits_ph),
("vf_preds", vf_preds_ph),
]
self.model = ModelCatalog.get_model(
obs_ph, logit_dim, self.config["model"])

# LSTM support
if not existing_inputs:
for i, ph in enumerate(self.model.state_in):
self.loss_in.append(("state_in_{}".format(i), ph))
obs_ph, logit_dim, self.config["model"],
state_in=existing_state_in, seq_lens=existing_seq_lens)

# KL Coefficient
self.kl_coeff = tf.get_variable(
@@ -155,15 +153,14 @@ def __init__(self, observation_space, action_space,
clip_param=self.config["clip_param"],
vf_loss_coeff=self.config["kl_target"],
use_gae=self.config["use_gae"])
self.is_training = tf.placeholder_with_default(True, ())

TFPolicyGraph.__init__(
self, observation_space, action_space,
self.sess, obs_input=obs_ph,
action_sampler=self.sampler, loss=self.loss_obj.loss,
loss_inputs=self.loss_in, is_training=self.is_training,
state_inputs=self.model.state_in,
state_outputs=self.model.state_out, seq_lens=self.model.seq_lens)
loss_inputs=self.loss_in, state_inputs=self.model.state_in,
state_outputs=self.model.state_out, seq_lens=self.model.seq_lens,
max_seq_len=config["model"]["max_seq_len"])

self.sess.run(tf.global_variables_initializer())

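The new slicing (existing_inputs[:6], [6:-1], [-1]) assumes existing_inputs is a flat, ordered list of placeholders rather than (name, placeholder) pairs: the six loss placeholders first, then any RNN state inputs, then seq_lens last. A runnable sketch of that ordering with stand-in placeholders; the shapes and names are illustrative, not RLlib code:

import tensorflow as tf

obs_ph = tf.placeholder(tf.float32, [None, 4], name="obs")
value_targets_ph = tf.placeholder(tf.float32, [None], name="value_targets")
adv_ph = tf.placeholder(tf.float32, [None], name="advantages")
act_ph = tf.placeholder(tf.int64, [None], name="actions")
logits_ph = tf.placeholder(tf.float32, [None, 2], name="logits")
vf_preds_ph = tf.placeholder(tf.float32, [None], name="vf_preds")
state_in = [tf.placeholder(tf.float32, [None, 256], name="c"),
            tf.placeholder(tf.float32, [None, 256], name="h")]
seq_lens_ph = tf.placeholder(tf.int32, [None], name="seq_lens")

# Six loss placeholders, then the RNN state tensors, then seq_lens last:
# the ordering the [:6] / [6:-1] / [-1] slicing relies on.
existing_inputs = ([obs_ph, value_targets_ph, adv_ph, act_ph,
                    logits_ph, vf_preds_ph] + state_in + [seq_lens_ph])

assert existing_inputs[6:-1] == state_in    # recovered RNN state inputs
assert existing_inputs[-1] is seq_lens_ph   # recovered seq_lens placeholder
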
23 changes: 12 additions & 11 deletions python/ray/rllib/evaluation/tf_policy_graph.py
@@ -3,6 +3,7 @@
from __future__ import print_function

import tensorflow as tf
import numpy as np

import ray
from ray.rllib.evaluation.policy_graph import PolicyGraph
@@ -36,9 +37,8 @@ class TFPolicyGraph(PolicyGraph):

def __init__(
self, observation_space, action_space, sess, obs_input,
action_sampler, loss, loss_inputs, is_training,
state_inputs=None, state_outputs=None, seq_lens=None,
max_seq_len=20):
action_sampler, loss, loss_inputs, state_inputs=None,
state_outputs=None, seq_lens=None, max_seq_len=20):
"""Initialize the policy graph.

Arguments:
@@ -54,10 +54,8 @@ def __init__(
input argument. Each placeholder name must correspond to a
SampleBatch column key returned by postprocess_trajectory(),
and has shape [BATCH_SIZE, data...].
is_training (Tensor): input placeholder for whether we are
currently training the policy.
state_inputs (list): list of RNN state output Tensors.
state_outputs (list): list of initial state values.
state_inputs (list): list of RNN state input Tensors.
state_outputs (list): list of RNN state output Tensors.
seq_lens (Tensor): placeholder for RNN sequence lengths, of shape
[NUM_SEQUENCES]. Note that NUM_SEQUENCES << BATCH_SIZE. See
models/lstm.py for more information.
@@ -72,9 +70,11 @@ def __init__(
self._loss = loss
self._loss_inputs = loss_inputs
self._loss_input_dict = dict(self._loss_inputs)
self._is_training = is_training
self._is_training = tf.placeholder_with_default(True, ())
self._state_inputs = state_inputs or []
self._state_outputs = state_outputs or []
for i, ph in enumerate(self._state_inputs):
self._loss_input_dict["state_in_{}".format(i)] = ph
self._seq_lens = seq_lens
self._max_seq_len = max_seq_len
self._optimizer = self.optimizer()
@@ -99,6 +99,8 @@ def build_compute_actions(
(self._state_inputs, state_batches)
builder.add_feed_dict(self.extra_compute_action_feed_dict())
builder.add_feed_dict({self._obs_input: obs_batch})
if state_batches:
builder.add_feed_dict({self._seq_lens: np.ones(len(obs_batch))})
builder.add_feed_dict({self._is_training: is_training})
builder.add_feed_dict(dict(zip(self._state_inputs, state_batches)))
fetches = builder.add_fetches(
@@ -123,10 +125,9 @@ def _get_loss_inputs_dict(self, batch):
return feed_dict

# RNN case
feature_keys = [
k for k, v in self._loss_inputs if not k.startswith("state_in_")]
feature_keys = [k for k, v in self._loss_inputs]
state_keys = [
k for k, v in self._loss_inputs if k.startswith("state_in_")]
"state_in_{}".format(i) for i in range(len(self._state_inputs))]
feature_sequences, initial_states, seq_lens = chop_into_sequences(
batch["t"],
[batch[k] for k in feature_keys],
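The np.ones(len(obs_batch)) feed in build_compute_actions encodes the rollout-time convention that each observation in the batch is its own length-1 sequence; padded multi-step sequences only appear at training time via _get_loss_inputs_dict. A tiny numeric illustration with made-up shapes:

import numpy as np

obs_batch = np.zeros((4, 8))         # 4 observations, 8 features each (made up)
seq_lens = np.ones(len(obs_batch))   # array([1., 1., 1., 1.])

# With 4 sequences of length 1, batch_size == len(seq_lens) * max_seq_len == 4,
# matching the contract documented in models/lstm.py.
assert len(obs_batch) == len(seq_lens) * int(seq_lens.max())
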
20 changes: 13 additions & 7 deletions python/ray/rllib/models/catalog.py
@@ -138,41 +138,47 @@ def get_action_placeholder(action_space):
" not supported".format(action_space))

@staticmethod
def get_model(inputs, num_outputs, options=None):
def get_model(
inputs, num_outputs, options=None, state_in=None, seq_lens=None):
"""Returns a suitable model conforming to given input and output specs.

Args:
inputs (Tensor): The input tensor to the model.
num_outputs (int): The size of the output vector of the model.
options (dict): Optional args to pass to the model constructor.
state_in (list): Optional RNN state in tensors.
seq_lens (Tensor): Optional RNN sequence length tensor.

Returns:
model (Model): Neural network model.
"""

options = options or {}
model = ModelCatalog._get_model(inputs, num_outputs, options)
model = ModelCatalog._get_model(
inputs, num_outputs, options, state_in, seq_lens)

if options.get("use_lstm"):
model = LSTM(model.last_layer, num_outputs, options)
model = LSTM(
model.last_layer, num_outputs, options, state_in, seq_lens)

return model

@staticmethod
def _get_model(inputs, num_outputs, options):
def _get_model(inputs, num_outputs, options, state_in, seq_lens):
if "custom_model" in options:
model = options["custom_model"]
print("Using custom model {}".format(model))
return _global_registry.get(RLLIB_MODEL, model)(
inputs, num_outputs, options)
inputs, num_outputs, options,
state_in=state_in, seq_lens=seq_lens)

obs_rank = len(inputs.shape) - 1

# num_outputs > 1 used to avoid hitting this with the value function
if isinstance(options.get("custom_options", {}).get(
"multiagent_fcnet_hiddens", 1), list) and num_outputs > 1:
return MultiAgentFullyConnectedNetwork(inputs,
num_outputs, options)
return MultiAgentFullyConnectedNetwork(
inputs, num_outputs, options)

if obs_rank > 1:
return VisionNetwork(inputs, num_outputs, options)
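A usage sketch of the extended signature, mirroring how the PPO graph above rebuilds its model against an existing copy of the graph; the variable scopes, shapes, and option values are illustrative assumptions, not code from this PR:

import tensorflow as tf
from ray.rllib.models import ModelCatalog

obs_ph = tf.placeholder(tf.float32, [None, 4], name="obs")

# Fresh build: the catalog and the Model base class create the state_in and
# seq_lens placeholders themselves when none are supplied.
with tf.variable_scope("policy"):
    model = ModelCatalog.get_model(obs_ph, 2, {"use_lstm": True})

# Rebuild (e.g. for a second tower) against the same placeholders, so both
# graph copies share one set of RNN state inputs and one seq_lens tensor.
with tf.variable_scope("tower"):
    clone = ModelCatalog.get_model(
        obs_ph, 2, {"use_lstm": True},
        state_in=model.state_in, seq_lens=model.seq_lens)
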
13 changes: 9 additions & 4 deletions python/ray/rllib/models/lstm.py
@@ -41,8 +41,8 @@ def add_time_dimension(padded_inputs, seq_lens):
# Sequence lengths have to be specified for LSTM batch inputs. The
# input batch must be padded to the max seq length given here. That is,
# batch_size == len(seq_lens) * max(seq_lens)
max_seq_len = tf.reduce_max(seq_lens)
padded_batch_size = tf.shape(padded_inputs)[0]
max_seq_len = padded_batch_size // tf.shape(seq_lens)[0]

# Dynamically reshape the padded batch to introduce a time dimension.
new_batch_size = padded_batch_size // max_seq_len
@@ -155,9 +155,14 @@ def _build_layers(self, inputs, num_outputs, options):
np.zeros(lstm.state_size.h, np.float32)]

# Setup LSTM inputs
c_in = tf.placeholder(tf.float32, [None, lstm.state_size.c], name="c")
h_in = tf.placeholder(tf.float32, [None, lstm.state_size.h], name="h")
self.state_in = [c_in, h_in]
if self.state_in:
c_in, h_in = self.state_in
else:
c_in = tf.placeholder(
tf.float32, [None, lstm.state_size.c], name="c")
h_in = tf.placeholder(
tf.float32, [None, lstm.state_size.h], name="h")
self.state_in = [c_in, h_in]

# Setup LSTM outputs
if use_tf100_api:
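With this change max_seq_len is derived from the padding itself (padded_batch_size // num_sequences) rather than tf.reduce_max(seq_lens), so a batch padded out to a fixed max_seq_len still reshapes cleanly even when no sequence actually reaches that length. A worked numeric example of the reshape contract, with made-up sizes:

import numpy as np

# 3 sequences with true lengths [3, 1, 2], padded to 3 steps each, giving a
# flat batch of 3 * 3 = 9 rows.
padded_inputs = np.arange(9 * 5, dtype=np.float32).reshape(9, 5)  # [batch, features]
seq_lens = np.array([3, 1, 2])

max_seq_len = padded_inputs.shape[0] // len(seq_lens)    # 9 // 3 == 3
new_batch_size = padded_inputs.shape[0] // max_seq_len   # 3 sequences
with_time = padded_inputs.reshape(new_batch_size, max_seq_len, 5)
print(with_time.shape)   # (3, 3, 5): [num_sequences, max_seq_len, features]
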
14 changes: 8 additions & 6 deletions python/ray/rllib/models/model.py
@@ -37,17 +37,19 @@ class Model(object):
a scale parameter (like a standard deviation).
"""

def __init__(self, inputs, num_outputs, options):
def __init__(
self, inputs, num_outputs, options, state_in=None, seq_lens=None):
self.inputs = inputs

# Default attribute values for the non-RNN case
self.state_init = []
self.state_in = []
self.state_in = state_in or []
self.state_out = []
self.seq_lens = tf.placeholder_with_default(
tf.ones( # reshape needed for older tf versions
tf.reshape(tf.shape(inputs)[0], [1]), dtype=tf.int32),
[None], name="seq_lens")
if seq_lens is not None:
self.seq_lens = seq_lens
else:
self.seq_lens = tf.placeholder(
dtype=tf.int32, shape=[None], name="seq_lens")

if options.get("free_log_std", False):
assert num_outputs % 2 == 0
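Because the catalog now forwards state_in and seq_lens into Model.__init__, custom models see them as self.state_in and self.seq_lens without any extra wiring. A hedged sketch of a minimal feed-forward custom model under that contract; the class name and layer sizes are made up:

import tensorflow as tf
from ray.rllib.models.model import Model

class MyCustomModel(Model):
    # Model.__init__ stores any supplied state_in / seq_lens on self (or
    # creates defaults) before the layers are built, so a feed-forward model
    # can simply ignore them.
    def _build_layers(self, inputs, num_outputs, options):
        hidden = tf.layers.dense(inputs, 64, activation=tf.nn.relu,
                                 name="fc_hidden")
        output = tf.layers.dense(hidden, num_outputs, activation=None,
                                 name="fc_out")
        return output, hidden
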