2 changes: 1 addition & 1 deletion doc/source/rllib-algorithms.rst
@@ -95,7 +95,7 @@ Asynchronous Proximal Policy Optimization (APPO)
`[implementation] <https://github.com/ray-project/ray/blob/master/python/ray/rllib/agents/ppo/appo.py>`__
We include an asynchronous variant of Proximal Policy Optimization (PPO) based on the IMPALA architecture. This is similar to IMPALA but using a surrogate policy loss with clipping. Compared to synchronous PPO, APPO is more efficient in wall-clock time due to its use of asynchronous sampling. Using a clipped loss also allows for multiple SGD passes, and therefore the potential for better sample efficiency compared to IMPALA. V-trace can also be enabled to correct for off-policy samples.

This implementation is currently *experimental*. Consider also using `PPO <rllib-algorithms.html#proximal-policy-optimization-ppo>`__ or `IMPALA <rllib-algorithms.html#importance-weighted-actor-learner-architecture-impala>`__.
APPO is not always more efficient; it is often better to simply use `PPO <rllib-algorithms.html#proximal-policy-optimization-ppo>`__ or `IMPALA <rllib-algorithms.html#importance-weighted-actor-learner-architecture-impala>`__.

Tuned examples: `PongNoFrameskip-v4 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/tuned_examples/pong-appo.yaml>`__

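To make the doc snippet above concrete, here is a minimal sketch of launching APPO through Tune. It is not part of this PR; the environment, stop criterion, and resource settings are illustrative assumptions.

```python
# Illustrative only: running APPO via Tune; the values below are placeholders.
import ray
from ray import tune

ray.init()
tune.run(
    "APPO",
    stop={"timesteps_total": 500000},
    config={
        "env": "PongNoFrameskip-v4",
        "num_workers": 8,
        "num_gpus": 1,
    },
)
```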
2 changes: 1 addition & 1 deletion doc/source/rllib-env.rst
@@ -13,7 +13,7 @@ Algorithm Discrete Actions Continuous Actions Multi-Agent Recurre
A2C, A3C **Yes** `+parametric`_ **Yes** **Yes** **Yes**
PPO, APPO **Yes** `+parametric`_ **Yes** **Yes** **Yes**
PG **Yes** `+parametric`_ **Yes** **Yes** **Yes**
IMPALA **Yes** `+parametric`_ No **Yes** **Yes**
IMPALA **Yes** `+parametric`_ **Yes** **Yes** **Yes**
DQN, Rainbow **Yes** `+parametric`_ No **Yes** No
DDPG, TD3 No **Yes** **Yes** No
APEX-DQN **Yes** `+parametric`_ No **Yes** No
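The table change above records the new continuous-action support for IMPALA introduced by this PR. A hedged sketch of exercising it on a continuous-action environment (hyperparameters are illustrative assumptions):

```python
# Illustrative only: IMPALA on a continuous-action env, as enabled by this PR.
import ray
from ray import tune

ray.init()
tune.run(
    "IMPALA",
    # Loose reward target, borrowed from the APPO tuned example added at the
    # end of this PR.
    stop={"episode_reward_mean": -900},
    config={
        "env": "Pendulum-v0",
        "num_workers": 2,
        "gamma": 0.95,
    },
)
```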
46 changes: 30 additions & 16 deletions python/ray/rllib/agents/impala/vtrace.py
@@ -34,6 +34,7 @@

import collections

from ray.rllib.models.action_dist import Categorical
from ray.rllib.utils import try_import_tf

tf = try_import_tf()
@@ -48,12 +49,15 @@
VTraceReturns = collections.namedtuple("VTraceReturns", "vs pg_advantages")


def log_probs_from_logits_and_actions(policy_logits, actions):
return multi_log_probs_from_logits_and_actions([policy_logits],
[actions])[0]
def log_probs_from_logits_and_actions(policy_logits,
actions,
dist_class=Categorical):
return multi_log_probs_from_logits_and_actions([policy_logits], [actions],
dist_class)[0]


def multi_log_probs_from_logits_and_actions(policy_logits, actions):
def multi_log_probs_from_logits_and_actions(policy_logits, actions,
dist_class):
"""Computes action log-probs from policy logits and actions.

In the notation used throughout documentation and comments, T refers to the
@@ -68,11 +72,11 @@ def multi_log_probs_from_logits_and_actions(policy_logits, actions):
...,
[T, B, ACTION_SPACE[-1]]
with un-normalized log-probabilities parameterizing a softmax policy.
actions: A list with length of ACTION_SPACE of int32
actions: A list with length of ACTION_SPACE of
tensors of shapes
[T, B],
[T, B, ...],
...,
[T, B]
[T, B, ...]
with actions.

Returns:
@@ -87,8 +91,16 @@ def multi_log_probs_from_logits_and_actions(policy_logits, actions):

log_probs = []
for i in range(len(policy_logits)):
log_probs.append(-tf.nn.sparse_softmax_cross_entropy_with_logits(
logits=policy_logits[i], labels=actions[i]))
p_shape = tf.shape(policy_logits[i])
a_shape = tf.shape(actions[i])
policy_logits_flat = tf.reshape(policy_logits[i],
tf.concat([[-1], p_shape[2:]], axis=0))
actions_flat = tf.reshape(actions[i],
tf.concat([[-1], a_shape[2:]], axis=0))
log_probs.append(
tf.reshape(
dist_class(policy_logits_flat).logp(actions_flat),
a_shape[:2]))

return log_probs
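The reshaping above is what lets a generic dist_class replace the hard-coded sparse-softmax call. A standalone sketch of the same flatten -> logp -> reshape pattern with the default Categorical distribution (shapes are illustrative assumptions, not part of the diff):

```python
# Illustrative only: the flatten -> logp -> reshape pattern from
# multi_log_probs_from_logits_and_actions, with fixed example shapes.
import tensorflow as tf
from ray.rllib.models.action_dist import Categorical

T, B, N = 5, 2, 3  # time, batch, number of discrete actions
logits = tf.placeholder(tf.float32, [T, B, N])
actions = tf.placeholder(tf.int32, [T, B])

# Fold the [T, B] leading dims into one batch dim, evaluate log-probs with
# the action distribution, then restore the [T, B] layout.
logits_flat = tf.reshape(logits, [-1, N])   # [T * B, N]
actions_flat = tf.reshape(actions, [-1])    # [T * B]
log_probs = tf.reshape(Categorical(logits_flat).logp(actions_flat), [T, B])
```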

@@ -100,6 +112,7 @@ def from_logits(behaviour_policy_logits,
rewards,
values,
bootstrap_value,
dist_class=Categorical,
clip_rho_threshold=1.0,
clip_pg_rho_threshold=1.0,
name="vtrace_from_logits"):
@@ -111,6 +124,7 @@ def from_logits(behaviour_policy_logits,
rewards,
values,
bootstrap_value,
dist_class,
clip_rho_threshold=clip_rho_threshold,
clip_pg_rho_threshold=clip_pg_rho_threshold,
name=name)
@@ -133,6 +147,7 @@ def multi_from_logits(behaviour_policy_logits,
rewards,
values,
bootstrap_value,
dist_class,
clip_rho_threshold=1.0,
clip_pg_rho_threshold=1.0,
name="vtrace_from_logits"):
@@ -168,11 +183,11 @@ def multi_from_logits(behaviour_policy_logits,
[T, B, ACTION_SPACE[-1]]
with un-normalized log-probabilities parameterizing the softmax target
policy.
actions: A list with length of ACTION_SPACE of int32
actions: A list with length of ACTION_SPACE of
tensors of shapes
[T, B],
[T, B, ...],
...,
[T, B]
[T, B, ...]
with actions sampled from the behaviour policy.
discounts: A float32 tensor of shape [T, B] with the discount encountered
when following the behaviour policy.
@@ -182,6 +197,7 @@ def multi_from_logits(behaviour_policy_logits,
wrt. the target policy.
bootstrap_value: A float32 of shape [B] with the value function estimate at
time T.
dist_class: action distribution class for the logits.
clip_rho_threshold: A scalar float32 tensor with the clipping threshold for
importance weights (rho) when calculating the baseline targets (vs).
rho^bar in the paper.
@@ -208,13 +224,11 @@ def multi_from_logits(behaviour_policy_logits,
behaviour_policy_logits[i], dtype=tf.float32)
target_policy_logits[i] = tf.convert_to_tensor(
target_policy_logits[i], dtype=tf.float32)
actions[i] = tf.convert_to_tensor(actions[i], dtype=tf.int32)

# Make sure tensor ranks are as expected.
# The rest will be checked by from_action_log_probs.
behaviour_policy_logits[i].shape.assert_has_rank(3)
target_policy_logits[i].shape.assert_has_rank(3)
actions[i].shape.assert_has_rank(2)

with tf.name_scope(
name,
Expand All @@ -223,9 +237,9 @@ def multi_from_logits(behaviour_policy_logits,
discounts, rewards, values, bootstrap_value
]):
target_action_log_probs = multi_log_probs_from_logits_and_actions(
target_policy_logits, actions)
target_policy_logits, actions, dist_class)
behaviour_action_log_probs = multi_log_probs_from_logits_and_actions(
behaviour_policy_logits, actions)
behaviour_policy_logits, actions, dist_class)

log_rhos = get_log_rhos(target_action_log_probs,
behaviour_action_log_probs)
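For reference, a hedged sketch of calling the updated multi_from_logits with an explicit dist_class; the single Discrete(3) sub-space and all shapes are illustrative assumptions, not part of the diff:

```python
# Illustrative only: vtrace.multi_from_logits with the new dist_class argument.
import tensorflow as tf
from ray.rllib.agents.impala import vtrace
from ray.rllib.models.action_dist import Categorical

T, B = 5, 2  # time, batch (illustrative)
vtrace_returns = vtrace.multi_from_logits(
    behaviour_policy_logits=[tf.placeholder(tf.float32, [T, B, 3])],
    target_policy_logits=[tf.placeholder(tf.float32, [T, B, 3])],
    actions=[tf.placeholder(tf.int32, [T, B])],
    discounts=tf.placeholder(tf.float32, [T, B]),
    rewards=tf.placeholder(tf.float32, [T, B]),
    values=tf.placeholder(tf.float32, [T, B]),
    bootstrap_value=tf.placeholder(tf.float32, [B]),
    dist_class=Categorical)
# vtrace_returns.vs and vtrace_returns.pg_advantages are [T, B] tensors.
```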
25 changes: 12 additions & 13 deletions python/ray/rllib/agents/impala/vtrace_policy_graph.py
@@ -18,7 +18,6 @@
from ray.rllib.models.action_dist import MultiCategorical
from ray.rllib.models.catalog import ModelCatalog
from ray.rllib.utils.annotations import override
from ray.rllib.utils.error import UnsupportedSpaceException
from ray.rllib.utils.explained_variance import explained_variance
from ray.rllib.utils import try_import_tf

@@ -40,6 +39,7 @@ def __init__(self,
rewards,
values,
bootstrap_value,
dist_class,
valid_mask,
vf_loss_coeff=0.5,
entropy_coeff=0.01,
@@ -52,7 +52,7 @@ def __init__(self,
handle episode cut boundaries.

Args:
actions: An int32 tensor of shape [T, B, ACTION_SPACE].
actions: An int|float32 tensor of shape [T, B, ACTION_SPACE].
actions_logp: A float32 tensor of shape [T, B].
actions_entropy: A float32 tensor of shape [T, B].
dones: A bool tensor of shape [T, B].
@@ -70,6 +70,7 @@ def __init__(self,
rewards: A float32 tensor of shape [T, B].
values: A float32 tensor of shape [T, B].
bootstrap_value: A float32 tensor of shape [B].
dist_class: action distribution class for logits.
valid_mask: A bool tensor of valid RNN input elements (#2992).
"""

@@ -78,11 +79,12 @@ def __init__(self,
self.vtrace_returns = vtrace.multi_from_logits(
behaviour_policy_logits=behaviour_logits,
target_policy_logits=target_logits,
actions=tf.unstack(tf.cast(actions, tf.int32), axis=2),
actions=tf.unstack(actions, axis=2),
discounts=tf.to_float(~dones) * discount,
rewards=rewards,
values=values,
bootstrap_value=bootstrap_value,
dist_class=dist_class,
clip_rho_threshold=tf.cast(clip_rho_threshold, tf.float32),
clip_pg_rho_threshold=tf.cast(clip_pg_rho_threshold,
tf.float32))
@@ -140,30 +142,28 @@ def __init__(self,

if isinstance(action_space, gym.spaces.Discrete):
is_multidiscrete = False
actions_shape = [None]
output_hidden_shape = [action_space.n]
elif isinstance(action_space, gym.spaces.multi_discrete.MultiDiscrete):
is_multidiscrete = True
actions_shape = [None, len(action_space.nvec)]
output_hidden_shape = action_space.nvec.astype(np.int32)
else:
raise UnsupportedSpaceException(
"Action space {} is not supported for IMPALA.".format(
action_space))
is_multidiscrete = False
output_hidden_shape = 1
Review comment (Contributor): Why is output_hidden_shape 1 in this branch?

Reply (Contributor, author): I believe this is the equivalent of [n] in the normal Discrete case above, which doesn't actually cause any splitting to happen below (splitting only happens for multi-discrete).
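A minimal sketch of the equivalence described in the reply (illustrative shapes, not part of the diff): with a single split, tf.split leaves the logits untouched either way.

```python
# Illustrative only: output_hidden_shape == 1 and [action_space.n] both
# produce a single split, i.e. no actual splitting of the logits.
import tensorflow as tf

behaviour_logits = tf.placeholder(tf.float32, [None, 6])
tf.split(behaviour_logits, 1, axis=1)    # -> [one tensor of shape [None, 6]]
tf.split(behaviour_logits, [6], axis=1)  # -> [one tensor of shape [None, 6]]
```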


# Create input placeholders
dist_class, logit_dim = ModelCatalog.get_action_dist(
action_space, self.config["model"])
if existing_inputs:
actions, dones, behaviour_logits, rewards, observations, \
prev_actions, prev_rewards = existing_inputs[:7]
existing_state_in = existing_inputs[7:-1]
existing_seq_lens = existing_inputs[-1]
else:
actions = tf.placeholder(tf.int64, actions_shape, name="ac")
actions = ModelCatalog.get_action_placeholder(action_space)
dones = tf.placeholder(tf.bool, [None], name="dones")
rewards = tf.placeholder(tf.float32, [None], name="rewards")
behaviour_logits = tf.placeholder(
tf.float32, [None, sum(output_hidden_shape)],
name="behaviour_logits")
tf.float32, [None, logit_dim], name="behaviour_logits")
observations = tf.placeholder(
tf.float32, [None] + list(observation_space.shape))
existing_state_in = None
Expand All @@ -174,8 +174,6 @@ def __init__(self,
behaviour_logits, output_hidden_shape, axis=1)

# Setup the policy
dist_class, logit_dim = ModelCatalog.get_action_dist(
action_space, self.config["model"])
prev_actions = ModelCatalog.get_action_placeholder(action_space)
prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward")
self.model = ModelCatalog.get_model(
Expand Down Expand Up @@ -261,6 +259,7 @@ def make_time_major(tensor, drop_last=False):
rewards=make_time_major(rewards, drop_last=True),
values=make_time_major(values, drop_last=True),
bootstrap_value=make_time_major(values)[-1],
dist_class=dist_class,
valid_mask=make_time_major(mask, drop_last=True),
vf_loss_coeff=self.config["vf_loss_coeff"],
entropy_coeff=self.config["entropy_coeff"],
17 changes: 8 additions & 9 deletions python/ray/rllib/agents/ppo/appo_policy_graph.py
@@ -18,7 +18,6 @@
LearningRateSchedule
from ray.rllib.models.catalog import ModelCatalog
from ray.rllib.utils.annotations import override
from ray.rllib.utils.error import UnsupportedSpaceException
from ray.rllib.utils.explained_variance import explained_variance
from ray.rllib.models.action_dist import MultiCategorical
from ray.rllib.evaluation.postprocessing import compute_advantages
@@ -94,6 +93,7 @@ def __init__(self,
rewards,
values,
bootstrap_value,
dist_class,
valid_mask,
vf_loss_coeff=0.5,
entropy_coeff=0.01,
@@ -107,18 +107,19 @@ def __init__(self,
handle episode cut boundaries.

Arguments:
actions: An int32 tensor of shape [T, B, NUM_ACTIONS].
actions: An int|float32 tensor of shape [T, B, logit_dim].
prev_actions_logp: A float32 tensor of shape [T, B].
actions_logp: A float32 tensor of shape [T, B].
action_kl: A float32 tensor of shape [T, B].
actions_entropy: A float32 tensor of shape [T, B].
dones: A bool tensor of shape [T, B].
behaviour_logits: A float32 tensor of shape [T, B, NUM_ACTIONS].
target_logits: A float32 tensor of shape [T, B, NUM_ACTIONS].
behaviour_logits: A float32 tensor of shape [T, B, logit_dim].
target_logits: A float32 tensor of shape [T, B, logit_dim].
discount: A float32 scalar.
rewards: A float32 tensor of shape [T, B].
values: A float32 tensor of shape [T, B].
bootstrap_value: A float32 tensor of shape [B].
dist_class: action distribution class for logits.
valid_mask: A bool tensor of valid RNN input elements (#2992).
"""

@@ -127,11 +128,12 @@ def __init__(self,
self.vtrace_returns = vtrace.multi_from_logits(
behaviour_policy_logits=behaviour_logits,
target_policy_logits=target_logits,
actions=tf.unstack(tf.cast(actions, tf.int32), axis=2),
actions=tf.unstack(actions, axis=2),
discounts=tf.to_float(~dones) * discount,
rewards=rewards,
values=values,
bootstrap_value=bootstrap_value,
dist_class=dist_class,
clip_rho_threshold=tf.cast(clip_rho_threshold, tf.float32),
clip_pg_rho_threshold=tf.cast(clip_pg_rho_threshold,
tf.float32))
@@ -218,10 +220,6 @@ def __init__(self,
elif isinstance(action_space, gym.spaces.multi_discrete.MultiDiscrete):
is_multidiscrete = True
output_hidden_shape = action_space.nvec.astype(np.int32)
elif self.config["vtrace"]:
raise UnsupportedSpaceException(
"Action space {} is not supported for APPO + VTrace.",
format(action_space))
else:
is_multidiscrete = False
output_hidden_shape = 1
@@ -365,6 +363,7 @@ def make_time_major(tensor, drop_last=False):
rewards=make_time_major(rewards, drop_last=True),
values=make_time_major(values, drop_last=True),
bootstrap_value=make_time_major(values)[-1],
dist_class=dist_class,
valid_mask=make_time_major(mask, drop_last=True),
vf_loss_coeff=self.config["vf_loss_coeff"],
entropy_coeff=self.config["entropy_coeff"],
New tuned example (APPO with V-trace on the continuous-action Pendulum-v0 environment):
@@ -0,0 +1,12 @@
pendulum-appo-vt:
env: Pendulum-v0
run: APPO
stop:
episode_reward_mean: -900 # just check it learns a bit
timesteps_total: 500000
config:
num_gpus: 0
num_workers: 1
gamma: 0.95
train_batch_size: 50
vtrace: true