6 changes: 1 addition & 5 deletions python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py
@@ -49,7 +49,6 @@ def __init__(self, observation_space, action_space, config):
[-1])
self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
tf.get_variable_scope().name)
is_training = tf.placeholder_with_default(True, ())

# Setup the policy loss
if isinstance(action_space, gym.spaces.Box):
@@ -74,16 +73,13 @@ def __init__(self, observation_space, action_space, config):
("advantages", advantages),
("value_targets", v_target),
]
for i, ph in enumerate(self.model.state_in):
loss_in.append(("state_in_{}".format(i), ph))
self.state_in = self.model.state_in
self.state_out = self.model.state_out
TFPolicyGraph.__init__(
self, observation_space, action_space, self.sess,
obs_input=self.observations, action_sampler=action_dist.sample(),
loss=self.loss.total_loss, loss_inputs=loss_in,
is_training=is_training, state_inputs=self.state_in,
state_outputs=self.state_out,
state_inputs=self.state_in, state_outputs=self.state_out,
seq_lens=self.model.seq_lens,
max_seq_len=self.config["model"]["max_seq_len"])

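The per-algorithm is_training placeholder and the manual state_in_{i} loss-input registration can be dropped here because TFPolicyGraph now sets both up itself (see the tf_policy_graph.py hunk further down). As a minimal standalone sketch of the placeholder_with_default pattern being centralized, using plain TensorFlow 1.x and made-up values:

import tensorflow as tf

# placeholder_with_default evaluates to its default unless explicitly fed,
# so a single placeholder owned by the base class can serve every algorithm.
is_training = tf.placeholder_with_default(True, shape=())

with tf.Session() as sess:
    print(sess.run(is_training))                                   # True (default)
    print(sess.run(is_training, feed_dict={is_training: False}))   # False when fed
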
2 changes: 2 additions & 0 deletions python/ray/rllib/agents/agent.py
@@ -46,6 +46,8 @@
"gpu_options": {
"allow_growth": True,
},
"log_device_placement": False,
"device_count": {"CPU": 1},
"allow_soft_placement": True, # required by PPO multi-gpu
},
# Whether to LZ4 compress observations
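Both new keys are standard tf.ConfigProto fields, alongside the existing allow_soft_placement and gpu_options entries. Below is a sketch of how a dict shaped like the config above could be mapped onto a session config; the make_session helper is hypothetical, not RLlib's code:

import tensorflow as tf

def make_session(tf_session_args):
    # Hypothetical helper: pop the nested gpu_options dict, then forward the
    # remaining keys straight to tf.ConfigProto, whose fields they mirror.
    args = dict(tf_session_args)
    gpu_options = tf.GPUOptions(**args.pop("gpu_options", {}))
    config = tf.ConfigProto(gpu_options=gpu_options, **args)
    return tf.Session(config=config)

sess = make_session({
    "gpu_options": {"allow_growth": True},
    "log_device_placement": False,
    "device_count": {"CPU": 1},
    "allow_soft_placement": True,
})
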
3 changes: 1 addition & 2 deletions python/ray/rllib/agents/ddpg/ddpg_policy_graph.py
@@ -262,12 +262,11 @@ def _build_action_network(p_values, stochastic, eps):
("dones", self.done_mask),
("weights", self.importance_weights),
]
self.is_training = tf.placeholder_with_default(True, ())
TFPolicyGraph.__init__(
self, observation_space, action_space, self.sess,
obs_input=self.cur_observations,
action_sampler=self.output_actions, loss=self.loss.total_loss,
loss_inputs=self.loss_inputs, is_training=self.is_training)
loss_inputs=self.loss_inputs)
self.sess.run(tf.global_variables_initializer())

# Note that this encompasses both the policy and Q-value networks and
3 changes: 1 addition & 2 deletions python/ray/rllib/agents/dqn/dqn_policy_graph.py
@@ -171,12 +171,11 @@ def _build_q_network(obs):
("dones", self.done_mask),
("weights", self.importance_weights),
]
self.is_training = tf.placeholder_with_default(True, ())
TFPolicyGraph.__init__(
self, observation_space, action_space, self.sess,
obs_input=self.cur_observations,
action_sampler=self.output_actions, loss=self.loss.loss,
loss_inputs=self.loss_inputs, is_training=self.is_training)
loss_inputs=self.loss_inputs)
self.sess.run(tf.global_variables_initializer())

def optimizer(self):
8 changes: 1 addition & 7 deletions python/ray/rllib/agents/pg/pg_policy_graph.py
@@ -41,16 +41,10 @@ def __init__(self, obs_space, action_space, config):
("advantages", advantages),
]

# LSTM support
for i, ph in enumerate(self.model.state_in):
loss_in.append(("state_in_{}".format(i), ph))

is_training = tf.placeholder_with_default(True, ())
TFPolicyGraph.__init__(
self, obs_space, action_space, sess, obs_input=obs,
action_sampler=action_dist.sample(), loss=loss,
loss_inputs=loss_in, is_training=is_training,
state_inputs=self.model.state_in,
loss_inputs=loss_in, state_inputs=self.model.state_in,
state_outputs=self.model.state_out,
seq_lens=self.model.seq_lens,
max_seq_len=config["model"]["max_seq_len"])
2 changes: 1 addition & 1 deletion python/ray/rllib/agents/ppo/ppo.py
@@ -50,7 +50,7 @@
"simple_optimizer": False,
# Override model config
"model": {
# Use LSTM model (note: requires simple optimizer for now).
# Whether to use LSTM model
"use_lstm": False,
# Max seq length for LSTM training.
"max_seq_len": 20,
41 changes: 19 additions & 22 deletions python/ray/rllib/agents/ppo/ppo_policy_graph.py
@@ -92,9 +92,10 @@ def __init__(self, observation_space, action_space,
dist_cls, logit_dim = ModelCatalog.get_action_dist(action_space)

if existing_inputs:
self.loss_in = existing_inputs
obs_ph, value_targets_ph, adv_ph, act_ph, \
logits_ph, vf_preds_ph = [ph for _, ph in existing_inputs]
logits_ph, vf_preds_ph = existing_inputs[:6]
existing_state_in = existing_inputs[6:-1]
existing_seq_lens = existing_inputs[-1]
else:
obs_ph = tf.placeholder(
tf.float32, name="obs", shape=(None,)+observation_space.shape)
@@ -107,23 +108,20 @@ def __init__(self, observation_space, action_space,
tf.float32, name="vf_preds", shape=(None,))
value_targets_ph = tf.placeholder(
tf.float32, name="value_targets", shape=(None,))

self.loss_in = [
("obs", obs_ph),
("value_targets", value_targets_ph),
("advantages", adv_ph),
("actions", act_ph),
("logits", logits_ph),
("vf_preds", vf_preds_ph),
]

existing_state_in = None
existing_seq_lens = None

self.loss_in = [
("obs", obs_ph),
("value_targets", value_targets_ph),
("advantages", adv_ph),
("actions", act_ph),
("logits", logits_ph),
("vf_preds", vf_preds_ph),
]
self.model = ModelCatalog.get_model(
obs_ph, logit_dim, self.config["model"])

# LSTM support
if not existing_inputs:
for i, ph in enumerate(self.model.state_in):
self.loss_in.append(("state_in_{}".format(i), ph))
obs_ph, logit_dim, self.config["model"],
state_in=existing_state_in, seq_lens=existing_seq_lens)

# KL Coefficient
self.kl_coeff = tf.get_variable(
@@ -155,15 +153,14 @@ def __init__(self, observation_space, action_space,
clip_param=self.config["clip_param"],
vf_loss_coeff=self.config["kl_target"],
use_gae=self.config["use_gae"])
self.is_training = tf.placeholder_with_default(True, ())

TFPolicyGraph.__init__(
self, observation_space, action_space,
self.sess, obs_input=obs_ph,
action_sampler=self.sampler, loss=self.loss_obj.loss,
loss_inputs=self.loss_in, is_training=self.is_training,
state_inputs=self.model.state_in,
state_outputs=self.model.state_out, seq_lens=self.model.seq_lens)
loss_inputs=self.loss_in, state_inputs=self.model.state_in,
state_outputs=self.model.state_out, seq_lens=self.model.seq_lens,
max_seq_len=config["model"]["max_seq_len"])

self.sess.run(tf.global_variables_initializer())

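The new slicing (existing_inputs[:6], [6:-1], [-1]) assumes existing_inputs is a flat, ordered list of placeholders rather than (name, placeholder) pairs: the six loss placeholders first, then any RNN state inputs, then seq_lens last. A runnable sketch of that ordering with stand-in placeholders; the shapes and names are illustrative, not RLlib code:

import tensorflow as tf

obs_ph = tf.placeholder(tf.float32, [None, 4], name="obs")
value_targets_ph = tf.placeholder(tf.float32, [None], name="value_targets")
adv_ph = tf.placeholder(tf.float32, [None], name="advantages")
act_ph = tf.placeholder(tf.int64, [None], name="actions")
logits_ph = tf.placeholder(tf.float32, [None, 2], name="logits")
vf_preds_ph = tf.placeholder(tf.float32, [None], name="vf_preds")
state_in = [tf.placeholder(tf.float32, [None, 256], name="c"),
            tf.placeholder(tf.float32, [None, 256], name="h")]
seq_lens_ph = tf.placeholder(tf.int32, [None], name="seq_lens")

# Six loss placeholders, then the RNN state tensors, then seq_lens last:
# the ordering the [:6] / [6:-1] / [-1] slicing relies on.
existing_inputs = ([obs_ph, value_targets_ph, adv_ph, act_ph,
                    logits_ph, vf_preds_ph] + state_in + [seq_lens_ph])

assert existing_inputs[6:-1] == state_in    # recovered RNN state inputs
assert existing_inputs[-1] is seq_lens_ph   # recovered seq_lens placeholder
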
23 changes: 12 additions & 11 deletions python/ray/rllib/evaluation/tf_policy_graph.py
@@ -3,6 +3,7 @@
from __future__ import print_function

import tensorflow as tf
import numpy as np

import ray
from ray.rllib.evaluation.policy_graph import PolicyGraph
@@ -36,9 +37,8 @@ class TFPolicyGraph(PolicyGraph):

def __init__(
self, observation_space, action_space, sess, obs_input,
action_sampler, loss, loss_inputs, is_training,
state_inputs=None, state_outputs=None, seq_lens=None,
max_seq_len=20):
action_sampler, loss, loss_inputs, state_inputs=None,
state_outputs=None, seq_lens=None, max_seq_len=20):
"""Initialize the policy graph.

Arguments:
@@ -54,10 +54,8 @@ def __init__(
input argument. Each placeholder name must correspond to a
SampleBatch column key returned by postprocess_trajectory(),
and has shape [BATCH_SIZE, data...].
is_training (Tensor): input placeholder for whether we are
currently training the policy.
state_inputs (list): list of RNN state output Tensors.
state_outputs (list): list of initial state values.
state_inputs (list): list of RNN state input Tensors.
state_outputs (list): list of RNN state output Tensors.
seq_lens (Tensor): placeholder for RNN sequence lengths, of shape
[NUM_SEQUENCES]. Note that NUM_SEQUENCES << BATCH_SIZE. See
models/lstm.py for more information.
@@ -72,9 +70,11 @@ def __init__(
self._loss = loss
self._loss_inputs = loss_inputs
self._loss_input_dict = dict(self._loss_inputs)
self._is_training = is_training
self._is_training = tf.placeholder_with_default(True, ())
self._state_inputs = state_inputs or []
self._state_outputs = state_outputs or []
for i, ph in enumerate(self._state_inputs):
self._loss_input_dict["state_in_{}".format(i)] = ph
self._seq_lens = seq_lens
self._max_seq_len = max_seq_len
self._optimizer = self.optimizer()
@@ -99,6 +99,8 @@ def build_compute_actions(
(self._state_inputs, state_batches)
builder.add_feed_dict(self.extra_compute_action_feed_dict())
builder.add_feed_dict({self._obs_input: obs_batch})
if state_batches:
builder.add_feed_dict({self._seq_lens: np.ones(len(obs_batch))})
builder.add_feed_dict({self._is_training: is_training})
builder.add_feed_dict(dict(zip(self._state_inputs, state_batches)))
fetches = builder.add_fetches(
@@ -123,10 +125,9 @@ def _get_loss_inputs_dict(self, batch):
return feed_dict

# RNN case
feature_keys = [
k for k, v in self._loss_inputs if not k.startswith("state_in_")]
feature_keys = [k for k, v in self._loss_inputs]
state_keys = [
k for k, v in self._loss_inputs if k.startswith("state_in_")]
"state_in_{}".format(i) for i in range(len(self._state_inputs))]
feature_sequences, initial_states, seq_lens = chop_into_sequences(
batch["t"],
[batch[k] for k in feature_keys],
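The np.ones(len(obs_batch)) feed in build_compute_actions encodes the rollout-time convention that each observation in the batch is its own length-1 sequence; padded multi-step sequences only appear at training time via _get_loss_inputs_dict. A tiny numeric illustration with made-up shapes:

import numpy as np

obs_batch = np.zeros((4, 8))         # 4 observations, 8 features each (made up)
seq_lens = np.ones(len(obs_batch))   # array([1., 1., 1., 1.])

# With 4 sequences of length 1, batch_size == len(seq_lens) * max_seq_len == 4,
# matching the contract documented in models/lstm.py.
assert len(obs_batch) == len(seq_lens) * int(seq_lens.max())
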
20 changes: 13 additions & 7 deletions python/ray/rllib/models/catalog.py
@@ -138,41 +138,47 @@ def get_action_placeholder(action_space):
" not supported".format(action_space))

@staticmethod
def get_model(inputs, num_outputs, options=None):
def get_model(
inputs, num_outputs, options=None, state_in=None, seq_lens=None):
"""Returns a suitable model conforming to given input and output specs.

Args:
inputs (Tensor): The input tensor to the model.
num_outputs (int): The size of the output vector of the model.
options (dict): Optional args to pass to the model constructor.
state_in (list): Optional RNN state in tensors.
seq_lens (Tensor): Optional RNN sequence length tensor.

Returns:
model (Model): Neural network model.
"""

options = options or {}
model = ModelCatalog._get_model(inputs, num_outputs, options)
model = ModelCatalog._get_model(
inputs, num_outputs, options, state_in, seq_lens)

if options.get("use_lstm"):
model = LSTM(model.last_layer, num_outputs, options)
model = LSTM(
model.last_layer, num_outputs, options, state_in, seq_lens)

return model

@staticmethod
def _get_model(inputs, num_outputs, options):
def _get_model(inputs, num_outputs, options, state_in, seq_lens):
if "custom_model" in options:
model = options["custom_model"]
print("Using custom model {}".format(model))
return _global_registry.get(RLLIB_MODEL, model)(
inputs, num_outputs, options)
inputs, num_outputs, options,
state_in=state_in, seq_lens=seq_lens)

obs_rank = len(inputs.shape) - 1

# num_outputs > 1 used to avoid hitting this with the value function
if isinstance(options.get("custom_options", {}).get(
"multiagent_fcnet_hiddens", 1), list) and num_outputs > 1:
return MultiAgentFullyConnectedNetwork(inputs,
num_outputs, options)
return MultiAgentFullyConnectedNetwork(
inputs, num_outputs, options)

if obs_rank > 1:
return VisionNetwork(inputs, num_outputs, options)
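A usage sketch of the extended signature, mirroring how the PPO graph above rebuilds its model against an existing copy of the graph; the variable scopes, shapes, and option values are illustrative assumptions, not code from this PR:

import tensorflow as tf
from ray.rllib.models import ModelCatalog

obs_ph = tf.placeholder(tf.float32, [None, 4], name="obs")

# Fresh build: the catalog and the Model base class create the state_in and
# seq_lens placeholders themselves when none are supplied.
with tf.variable_scope("policy"):
    model = ModelCatalog.get_model(obs_ph, 2, {"use_lstm": True})

# Rebuild (e.g. for a second tower) against the same placeholders, so both
# graph copies share one set of RNN state inputs and one seq_lens tensor.
with tf.variable_scope("tower"):
    clone = ModelCatalog.get_model(
        obs_ph, 2, {"use_lstm": True},
        state_in=model.state_in, seq_lens=model.seq_lens)
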
13 changes: 9 additions & 4 deletions python/ray/rllib/models/lstm.py
@@ -41,8 +41,8 @@ def add_time_dimension(padded_inputs, seq_lens):
# Sequence lengths have to be specified for LSTM batch inputs. The
# input batch must be padded to the max seq length given here. That is,
# batch_size == len(seq_lens) * max(seq_lens)
max_seq_len = tf.reduce_max(seq_lens)
padded_batch_size = tf.shape(padded_inputs)[0]
max_seq_len = padded_batch_size // tf.shape(seq_lens)[0]

# Dynamically reshape the padded batch to introduce a time dimension.
new_batch_size = padded_batch_size // max_seq_len
@@ -155,9 +155,14 @@ def _build_layers(self, inputs, num_outputs, options):
np.zeros(lstm.state_size.h, np.float32)]

# Setup LSTM inputs
c_in = tf.placeholder(tf.float32, [None, lstm.state_size.c], name="c")
h_in = tf.placeholder(tf.float32, [None, lstm.state_size.h], name="h")
self.state_in = [c_in, h_in]
if self.state_in:
c_in, h_in = self.state_in
else:
c_in = tf.placeholder(
tf.float32, [None, lstm.state_size.c], name="c")
h_in = tf.placeholder(
tf.float32, [None, lstm.state_size.h], name="h")
self.state_in = [c_in, h_in]

# Setup LSTM outputs
if use_tf100_api:
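With this change max_seq_len is derived from the padding itself (padded_batch_size // num_sequences) rather than tf.reduce_max(seq_lens), so a batch padded out to a fixed max_seq_len still reshapes cleanly even when no sequence actually reaches that length. A worked numeric example of the reshape contract, with made-up sizes:

import numpy as np

# 3 sequences with true lengths [3, 1, 2], padded to 3 steps each, giving a
# flat batch of 3 * 3 = 9 rows.
padded_inputs = np.arange(9 * 5, dtype=np.float32).reshape(9, 5)  # [batch, features]
seq_lens = np.array([3, 1, 2])

max_seq_len = padded_inputs.shape[0] // len(seq_lens)    # 9 // 3 == 3
new_batch_size = padded_inputs.shape[0] // max_seq_len   # 3 sequences
with_time = padded_inputs.reshape(new_batch_size, max_seq_len, 5)
print(with_time.shape)   # (3, 3, 5): [num_sequences, max_seq_len, features]
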
14 changes: 8 additions & 6 deletions python/ray/rllib/models/model.py
@@ -37,17 +37,19 @@ class Model(object):
a scale parameter (like a standard deviation).
"""

def __init__(self, inputs, num_outputs, options):
def __init__(
self, inputs, num_outputs, options, state_in=None, seq_lens=None):
self.inputs = inputs

# Default attribute values for the non-RNN case
self.state_init = []
self.state_in = []
self.state_in = state_in or []
self.state_out = []
self.seq_lens = tf.placeholder_with_default(
tf.ones( # reshape needed for older tf versions
tf.reshape(tf.shape(inputs)[0], [1]), dtype=tf.int32),
[None], name="seq_lens")
if seq_lens is not None:
self.seq_lens = seq_lens
else:
self.seq_lens = tf.placeholder(
dtype=tf.int32, shape=[None], name="seq_lens")

if options.get("free_log_std", False):
assert num_outputs % 2 == 0
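Because the catalog now forwards state_in and seq_lens into Model.__init__, custom models see them as self.state_in and self.seq_lens without any extra wiring. A hedged sketch of a minimal feed-forward custom model under that contract; the class name and layer sizes are made up:

import tensorflow as tf
from ray.rllib.models.model import Model

class MyCustomModel(Model):
    # Model.__init__ stores any supplied state_in / seq_lens on self (or
    # creates defaults) before the layers are built, so a feed-forward model
    # can simply ignore them.
    def _build_layers(self, inputs, num_outputs, options):
        hidden = tf.layers.dense(inputs, 64, activation=tf.nn.relu,
                                 name="fc_hidden")
        output = tf.layers.dense(hidden, num_outputs, activation=None,
                                 name="fc_out")
        return output, hidden
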