From 556985a1206b988a58819cd6a850326ae9b709c1 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sun, 28 Jul 2019 18:24:18 -0700 Subject: [PATCH 01/33] wip --- python/ray/rllib/agents/impala/vtrace.py | 10 +- .../ray/rllib/agents/impala/vtrace_policy.py | 8 +- python/ray/rllib/agents/ppo/ppo_policy.py | 20 +- .../examples/autoregressive_action_dist.py | 202 ++++++++++++++++++ python/ray/rllib/models/action_dist.py | 5 + python/ray/rllib/models/modelv2.py | 3 + python/ray/rllib/models/tf/tf_action_dist.py | 4 +- python/ray/rllib/policy/dynamic_tf_policy.py | 9 + python/ray/rllib/policy/tf_policy.py | 4 +- 9 files changed, 251 insertions(+), 14 deletions(-) create mode 100644 python/ray/rllib/examples/autoregressive_action_dist.py diff --git a/python/ray/rllib/agents/impala/vtrace.py b/python/ray/rllib/agents/impala/vtrace.py index 0064faa16e92a..08d49da241cd6 100644 --- a/python/ray/rllib/agents/impala/vtrace.py +++ b/python/ray/rllib/agents/impala/vtrace.py @@ -146,6 +146,7 @@ def multi_from_logits(behaviour_policy_logits, values, bootstrap_value, dist_class, + behaviour_action_prob=None, clip_rho_threshold=1.0, clip_pg_rho_threshold=1.0, name="vtrace_from_logits"): @@ -196,6 +197,7 @@ def multi_from_logits(behaviour_policy_logits, bootstrap_value: A float32 of shape [B] with the value function estimate at time T. dist_class: action distribution class for the logits. + behaviour_action_prob: precalculated values of the behaviour actions clip_rho_threshold: A scalar float32 tensor with the clipping threshold for importance weights (rho) when calculating the baseline targets (vs). rho^bar in the paper. @@ -236,8 +238,12 @@ def multi_from_logits(behaviour_policy_logits, ]): target_action_log_probs = multi_log_probs_from_logits_and_actions( target_policy_logits, actions, dist_class) - behaviour_action_log_probs = multi_log_probs_from_logits_and_actions( - behaviour_policy_logits, actions, dist_class) + if len(behaviour_policy_logits) > 1 or behaviour_action_prob is None: + behaviour_action_log_probs = ( + multi_log_probs_from_logits_and_actions( + behaviour_policy_logits, actions, dist_class)) + else: + behaviour_action_log_probs = tf.log(behaviour_action_prob) log_rhos = get_log_rhos(target_action_log_probs, behaviour_action_log_probs) diff --git a/python/ray/rllib/agents/impala/vtrace_policy.py b/python/ray/rllib/agents/impala/vtrace_policy.py index ab28e3f49283f..0f51c3f426f2d 100644 --- a/python/ray/rllib/agents/impala/vtrace_policy.py +++ b/python/ray/rllib/agents/impala/vtrace_policy.py @@ -16,7 +16,7 @@ from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.policy.tf_policy_template import build_tf_policy from ray.rllib.policy.tf_policy import LearningRateSchedule, \ - EntropyCoeffSchedule + EntropyCoeffSchedule, ACTION_PROB from ray.rllib.utils.explained_variance import explained_variance from ray.rllib.utils import try_import_tf @@ -33,6 +33,7 @@ def __init__(self, actions_logp, actions_entropy, dones, + behaviour_action_prob, behaviour_logits, target_logits, discount, @@ -56,6 +57,7 @@ def __init__(self, actions_logp: A float32 tensor of shape [T, B]. actions_entropy: A float32 tensor of shape [T, B]. dones: A bool tensor of shape [T, B]. + behaviour_action_prob: Tensor of shape [T, B]. behaviour_logits: A list with length of ACTION_SPACE of float32 tensors of shapes [T, B, ACTION_SPACE[0]], @@ -77,6 +79,7 @@ def __init__(self, # Compute vtrace on the CPU for better perf. 
with tf.device("/cpu:0"): self.vtrace_returns = vtrace.multi_from_logits( + behaviour_action_prob=behaviour_action_prob, behaviour_policy_logits=behaviour_logits, target_policy_logits=target_logits, actions=tf.unstack(actions, axis=2), @@ -161,6 +164,7 @@ def make_time_major(*args, **kw): actions = batch_tensors[SampleBatch.ACTIONS] dones = batch_tensors[SampleBatch.DONES] rewards = batch_tensors[SampleBatch.REWARDS] + behaviour_action_prob = batch_tensors[ACTION_PROB] behaviour_logits = batch_tensors[BEHAVIOUR_LOGITS] unpacked_behaviour_logits = tf.split( behaviour_logits, output_hidden_shape, axis=1) @@ -187,6 +191,8 @@ def make_time_major(*args, **kw): actions_entropy=make_time_major( action_dist.multi_entropy(), drop_last=True), dones=make_time_major(dones, drop_last=True), + behaviour_action_prob=make_time_major( + behaviour_action_prob, drop_last=True), behaviour_logits=make_time_major( unpacked_behaviour_logits, drop_last=True), target_logits=make_time_major(unpacked_outputs, drop_last=True), diff --git a/python/ray/rllib/agents/ppo/ppo_policy.py b/python/ray/rllib/agents/ppo/ppo_policy.py index e87b106fa2c97..c7e0b04a5a89d 100644 --- a/python/ray/rllib/agents/ppo/ppo_policy.py +++ b/python/ray/rllib/agents/ppo/ppo_policy.py @@ -9,9 +9,8 @@ Postprocessing from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.policy.tf_policy import LearningRateSchedule, \ - EntropyCoeffSchedule + EntropyCoeffSchedule, ACTION_PROB from ray.rllib.policy.tf_policy_template import build_tf_policy -from ray.rllib.models.catalog import ModelCatalog from ray.rllib.utils.explained_variance import explained_variance from ray.rllib.utils import try_import_tf @@ -26,10 +25,12 @@ class PPOLoss(object): def __init__(self, action_space, + dist_cls, value_targets, advantages, actions, - logits, + prev_logits, + prev_actions_logp, vf_preds, curr_action_dist, value_fn, @@ -44,13 +45,16 @@ def __init__(self, Arguments: action_space: Environment observation space specification. + dist_cls: action distribution class for logits. value_targets (Placeholder): Placeholder for target values; used for GAE. actions (Placeholder): Placeholder for actions taken from previous model evaluation. advantages (Placeholder): Placeholder for calculated advantages from previous model evaluation. - logits (Placeholder): Placeholder for logits output from + prev_logits (Placeholder): Placeholder for logits output from + previous model evaluation. + prev_actions_logp (Placeholder): Placeholder for logp output from previous model evaluation. vf_preds (Placeholder): Placeholder for value function output from previous model evaluation. @@ -70,11 +74,9 @@ def __init__(self, def reduce_mean_valid(t): return tf.reduce_mean(tf.boolean_mask(t, valid_mask)) - dist_cls, _ = ModelCatalog.get_action_dist(action_space, {}) - prev_dist = dist_cls(logits) + prev_dist = dist_cls(prev_logits) # Make loss functions. 
- logp_ratio = tf.exp( - curr_action_dist.logp(actions) - prev_dist.logp(actions)) + logp_ratio = tf.exp(curr_action_dist.logp(actions) - prev_actions_logp) action_kl = prev_dist.kl(curr_action_dist) self.mean_kl = reduce_mean_valid(action_kl) @@ -116,10 +118,12 @@ def ppo_surrogate_loss(policy, batch_tensors): policy.loss_obj = PPOLoss( policy.action_space, + policy.dist_class, batch_tensors[Postprocessing.VALUE_TARGETS], batch_tensors[Postprocessing.ADVANTAGES], batch_tensors[SampleBatch.ACTIONS], batch_tensors[BEHAVIOUR_LOGITS], + batch_tensors[ACTION_PROB], batch_tensors[SampleBatch.VF_PREDS], policy.action_dist, policy.value_function, diff --git a/python/ray/rllib/examples/autoregressive_action_dist.py b/python/ray/rllib/examples/autoregressive_action_dist.py new file mode 100644 index 0000000000000..4567dd561d474 --- /dev/null +++ b/python/ray/rllib/examples/autoregressive_action_dist.py @@ -0,0 +1,202 @@ +"""Example of specifying an autoregressive action distribution.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import gym +from gym.spaces import Discrete, Tuple +import argparse +import random + +import ray +from ray import tune +from ray.rllib.models import ModelCatalog +from ray.rllib.models.tf.tf_action_dist import Categorical, ActionDistribution +from ray.rllib.models.tf.misc import normc_initializer +from ray.rllib.models.tf.tf_modelv2 import TFModelV2 +from ray.rllib.policy.policy import TupleActions +from ray.rllib.utils import try_import_tf + +tf = try_import_tf() + +parser = argparse.ArgumentParser() +parser.add_argument("--run", type=str, default="PPO") +parser.add_argument("--stop", type=int, default=200) + + +class EmitCorrelatedActionsEnv(gym.Env): + """Simple env in which the policy has to emit a tuple of equal actions. + + However, the policy is penalized for emitting the exact same action each + time, so it has to learn P(a1) ~ random, then a2 = a1. + + The best score would be ~200 reward.""" + + def __init__(self, _): + self.observation_space = Discrete(2) + self.action_space = Tuple([Discrete(2), Discrete(2)]) + + def reset(self): + self.t = 0 + self.last = random.choice([0, 1]) + return self.last + + def step(self, action): + self.t += 1 + a1, a2 = action + reward = 0 + if a1 == self.last: + reward += 5 + # encourage correlation between a1 and a2 + if a1 == a2: + reward += 5 + done = self.t > 20 + self.last = random.choice([0, 1]) + return self.last, reward, done, {} + + +def make_binary_autoregressive_output(action_model): + """Returns an autoregressive ActionDistribution class for two outputs. + + Arguments: + action_model: Keras model that takes [context, a1_sample] and returns + logits for a1 and a2. 
+ """ + + class AutoregressiveOutput(ActionDistribution): + def sample(self): + a1_dist = self._a1_distribution() + a1 = a1_dist.sample() + a2_dist = self._a2_distribution(a1) + a2 = a2_dist.sample() + self._action_prob = a1_dist.logp(a1) + a2_dist.logp(a2) + return TupleActions([a1, a2]) + + def sampled_action_prob(self): + return self._action_prob + + def logp(self, actions): + a1, a2 = actions[:, 0], actions[:, 1] + a1_vec = tf.expand_dims(tf.cast(a1, tf.float32), 1) + a1_logits, a2_logits = action_model([self.inputs, a1_vec]) + return (Categorical(a1_logits).logp(a1) + + Categorical(a2_logits).logp(a2)) + + def entropy(self): + a1_dist = self._a1_distribution() + a2_dist = self._a2_distribution(a1_dist.sample()) + return a1_dist.entropy() + a2_dist.entropy() + + def kl(self, other): + # TODO: implement this properly + return tf.zeros_like(self.entropy()) + + def _a1_distribution(self): + BATCH = tf.shape(self.inputs)[0] + a1_logits, _ = action_model([self.inputs, tf.zeros((BATCH, 1))]) + a1_dist = Categorical(a1_logits) + return a1_dist + + def _a2_distribution(self, a1): + a1_vec = tf.expand_dims(tf.cast(a1, tf.float32), 1) + _, a2_logits = action_model([self.inputs, a1_vec]) + a2_dist = Categorical(a2_logits) + return a2_dist + + return AutoregressiveOutput + + +class AutoregressiveActionsModel(TFModelV2): + """Custom autoregressive model for policy gradient algorithms.""" + + def __init__(self, obs_space, action_space, num_outputs, model_config, + name): + super(AutoregressiveActionsModel, self).__init__( + obs_space, action_space, num_outputs, model_config, name) + # Inputs + obs_input = tf.keras.layers.Input( + shape=obs_space.shape, name="obs_input") + a1_input = tf.keras.layers.Input(shape=(1, ), name="a1_input") + # TODO(ekl) the context should be allowed to have a different size from + # num_outputs. This currently doesn't work since RLlib checks the + # model output size is equal to num_outputs. + ctx_input = tf.keras.layers.Input( + shape=(num_outputs, ), name="ctx_input") + + # Shared hidden layer + context = tf.keras.layers.Dense( + num_outputs, + name="hidden", + activation=tf.nn.tanh, + kernel_initializer=normc_initializer(1.0))(obs_input) + + # V(s) + value_out = tf.keras.layers.Dense( + 1, + name="value_out", + activation=None, + kernel_initializer=normc_initializer(0.01))(context) + + # P(a1) + a1_logits = tf.keras.layers.Dense( + 2, + name="a1_logits", + activation=None, + kernel_initializer=normc_initializer(0.01))(ctx_input) + + # P(a2 | a1) -- note this doesn't include obs for example purposes, + # which forces the model to learn a2 without knowing the obs. In + # practice you'll want to use a Concat layer here so that a2 can be + # conditioned on both the obs and a1. 
+ a2_hidden = tf.keras.layers.Dense( + 16, + name="a2_hidden", + activation=tf.nn.tanh, + kernel_initializer=normc_initializer(1.0))(a1_input) + a2_logits = tf.keras.layers.Dense( + 2, + name="a2_logits", + activation=None, + kernel_initializer=normc_initializer(0.01))(a2_hidden) + + # Base layers + self.base_model = tf.keras.Model(obs_input, [context, value_out]) + self.register_variables(self.base_model.variables) + self.base_model.summary() + + # Autoregressive action sampler + self.action_model = tf.keras.Model([ctx_input, a1_input], + [a1_logits, a2_logits]) + self.action_model.summary() + self.register_variables(self.action_model.variables) + + def forward(self, input_dict, state, seq_lens): + context, self._value_out = self.base_model(input_dict["obs"]) + return context, state + + def value_function(self): + return tf.reshape(self._value_out, [-1]) + + def override_action_distribution(self): + return make_binary_autoregressive_output(self.action_model) + + +if __name__ == "__main__": + ray.init() + args = parser.parse_args() + ModelCatalog.register_custom_model("autoregressive_model", + AutoregressiveActionsModel) + tune.run( + args.run, + stop={"episode_reward_mean": args.stop}, + config={ + "env": EmitCorrelatedActionsEnv, + "train_batch_size": 10, + "sample_batch_size": 10, + "gamma": 0.5, + "num_gpus": 0, + "model": { + "custom_model": "autoregressive_model", + }, + }) diff --git a/python/ray/rllib/models/action_dist.py b/python/ray/rllib/models/action_dist.py index 78ff5bdf925c3..87ec3ed971410 100644 --- a/python/ray/rllib/models/action_dist.py +++ b/python/ray/rllib/models/action_dist.py @@ -22,6 +22,11 @@ def sample(self): """Draw a sample from the action distribution.""" raise NotImplementedError + @DeveloperAPI + def sampled_action_prob(self): + """Returns the log probability of the last sampled action.""" + raise NotImplementedError + @DeveloperAPI def logp(self, x): """The log-likelihood of the action distribution.""" diff --git a/python/ray/rllib/models/modelv2.py b/python/ray/rllib/models/modelv2.py index f04bb457427b6..42691ff42a3e7 100644 --- a/python/ray/rllib/models/modelv2.py +++ b/python/ray/rllib/models/modelv2.py @@ -166,3 +166,6 @@ def __call__(self, input_dict, state=None, seq_lens=None): raise ValueError("State output is not a list: {}".format(state)) return outputs, state + + def override_action_distribution(self): + return None diff --git a/python/ray/rllib/models/tf/tf_action_dist.py b/python/ray/rllib/models/tf/tf_action_dist.py index 530d4bdddb82f..eb48871c218b8 100644 --- a/python/ray/rllib/models/tf/tf_action_dist.py +++ b/python/ray/rllib/models/tf/tf_action_dist.py @@ -30,12 +30,12 @@ def _build_sample_op(self): """ raise NotImplementedError - @DeveloperAPI + @override(ActionDistribution) def sample(self): """Draw a sample from the action distribution.""" return self.sample_op - @DeveloperAPI + @override(ActionDistribution) def sampled_action_prob(self): """Returns the log probability of the sampled action.""" return tf.exp(self.logp(self.sample_op)) diff --git a/python/ray/rllib/policy/dynamic_tf_policy.py b/python/ray/rllib/policy/dynamic_tf_policy.py index d7a68c064ef13..5c91422055055 100644 --- a/python/ray/rllib/policy/dynamic_tf_policy.py +++ b/python/ray/rllib/policy/dynamic_tf_policy.py @@ -142,6 +142,15 @@ def __init__(self, logit_dim, self.config["model"], framework="tf") + + override_dist = self.model.override_action_distribution() + if override_dist is not None: + if action_sampler_fn: + raise ValueError( + "this policy doesn't use action 
dist classes for " + "sampling actions, so you cannot override it") + self.dist_class = override_dist + if existing_inputs: self.state_in = [ v for k, v in existing_inputs.items() diff --git a/python/ray/rllib/policy/tf_policy.py b/python/ray/rllib/policy/tf_policy.py index 160fc9cc74869..72f11da3fe83d 100644 --- a/python/ray/rllib/policy/tf_policy.py +++ b/python/ray/rllib/policy/tf_policy.py @@ -22,6 +22,8 @@ tf = try_import_tf() logger = logging.getLogger(__name__) +ACTION_PROB = "action_prob" + @DeveloperAPI class TFPolicy(Policy): @@ -303,7 +305,7 @@ def extra_compute_action_fetches(self): By default we only return action probability info (if present). """ if self._action_prob is not None: - return {"action_prob": self._action_prob} + return {ACTION_PROB: self._action_prob} else: return {} From 515f7ced8520cf6cfdb24df4def25f0284350ee1 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sun, 28 Jul 2019 21:25:00 -0700 Subject: [PATCH 02/33] wip --- .../ray/rllib/agents/a3c/a3c_torch_policy.py | 2 +- python/ray/rllib/agents/ars/policies.py | 2 +- python/ray/rllib/agents/es/policies.py | 2 +- python/ray/rllib/agents/impala/vtrace.py | 22 +++-- .../ray/rllib/agents/impala/vtrace_policy.py | 3 + .../ray/rllib/agents/marwil/marwil_policy.py | 13 +-- python/ray/rllib/agents/pg/torch_pg_policy.py | 2 +- python/ray/rllib/agents/ppo/appo_policy.py | 6 +- python/ray/rllib/agents/ppo/ppo_policy.py | 15 +-- python/ray/rllib/agents/ppo/test/test.py | 2 +- python/ray/rllib/agents/qmix/qmix_policy.py | 2 +- .../examples/autoregressive_action_dist.py | 98 +++++++++---------- python/ray/rllib/examples/custom_loss.py | 2 +- .../ray/rllib/examples/custom_torch_policy.py | 2 +- python/ray/rllib/models/action_dist.py | 17 +++- python/ray/rllib/models/catalog.py | 4 +- python/ray/rllib/models/modelv2.py | 1 + python/ray/rllib/models/tf/tf_action_dist.py | 21 ++-- .../rllib/models/torch/torch_action_dist.py | 4 +- python/ray/rllib/policy/dynamic_tf_policy.py | 2 +- python/ray/rllib/policy/torch_policy.py | 8 +- 21 files changed, 127 insertions(+), 103 deletions(-) diff --git a/python/ray/rllib/agents/a3c/a3c_torch_policy.py b/python/ray/rllib/agents/a3c/a3c_torch_policy.py index 8045c397f27ea..014c6a44ef809 100644 --- a/python/ray/rllib/agents/a3c/a3c_torch_policy.py +++ b/python/ray/rllib/agents/a3c/a3c_torch_policy.py @@ -18,7 +18,7 @@ def actor_critic_loss(policy, batch_tensors): SampleBatch.CUR_OBS: batch_tensors[SampleBatch.CUR_OBS] }) # TODO(ekl) seq lens shouldn't be None values = policy.model.value_function() - dist = policy.dist_class(logits) + dist = policy.dist_class(logits, policy.model) log_probs = dist.logp(batch_tensors[SampleBatch.ACTIONS]) policy.entropy = dist.entropy().mean() policy.pi_err = -batch_tensors[Postprocessing.ADVANTAGES].dot( diff --git a/python/ray/rllib/agents/ars/policies.py b/python/ray/rllib/agents/ars/policies.py index 7fdb54b99cd81..ce3e837a3fb94 100644 --- a/python/ray/rllib/agents/ars/policies.py +++ b/python/ray/rllib/agents/ars/policies.py @@ -81,7 +81,7 @@ def __init__(self, model = ModelCatalog.get_model({ "obs": self.inputs }, obs_space, action_space, dist_dim, model_config) - dist = dist_class(model.outputs) + dist = dist_class(model.outputs, model) self.sampler = dist.sample() self.variables = ray.experimental.tf_utils.TensorFlowVariables( diff --git a/python/ray/rllib/agents/es/policies.py b/python/ray/rllib/agents/es/policies.py index dfc7e2deec473..3ddb4dbeda9d8 100644 --- a/python/ray/rllib/agents/es/policies.py +++ b/python/ray/rllib/agents/es/policies.py @@ 
-59,7 +59,7 @@ def __init__(self, sess, action_space, obs_space, preprocessor, model = ModelCatalog.get_model({ "obs": self.inputs }, obs_space, action_space, dist_dim, model_options) - dist = dist_class(model.outputs) + dist = dist_class(model.outputs, model) self.sampler = dist.sample() self.variables = ray.experimental.tf_utils.TensorFlowVariables( diff --git a/python/ray/rllib/agents/impala/vtrace.py b/python/ray/rllib/agents/impala/vtrace.py index 08d49da241cd6..eaf984c373f39 100644 --- a/python/ray/rllib/agents/impala/vtrace.py +++ b/python/ray/rllib/agents/impala/vtrace.py @@ -49,13 +49,14 @@ def log_probs_from_logits_and_actions(policy_logits, actions, - dist_class=Categorical): + dist_class=Categorical, + model=None): return multi_log_probs_from_logits_and_actions([policy_logits], [actions], - dist_class)[0] + dist_class, model)[0] -def multi_log_probs_from_logits_and_actions(policy_logits, actions, - dist_class): +def multi_log_probs_from_logits_and_actions(policy_logits, actions, dist_class, + model): """Computes action log-probs from policy logits and actions. In the notation used throughout documentation and comments, T refers to the @@ -97,7 +98,7 @@ def multi_log_probs_from_logits_and_actions(policy_logits, actions, tf.concat([[-1], a_shape[2:]], axis=0)) log_probs.append( tf.reshape( - dist_class(policy_logits_flat).logp(actions_flat), + dist_class(policy_logits_flat, model).logp(actions_flat), a_shape[:2])) return log_probs @@ -111,6 +112,7 @@ def from_logits(behaviour_policy_logits, values, bootstrap_value, dist_class=Categorical, + model=None, clip_rho_threshold=1.0, clip_pg_rho_threshold=1.0, name="vtrace_from_logits"): @@ -123,6 +125,7 @@ def from_logits(behaviour_policy_logits, values, bootstrap_value, dist_class, + model, clip_rho_threshold=clip_rho_threshold, clip_pg_rho_threshold=clip_pg_rho_threshold, name=name) @@ -146,6 +149,7 @@ def multi_from_logits(behaviour_policy_logits, values, bootstrap_value, dist_class, + model, behaviour_action_prob=None, clip_rho_threshold=1.0, clip_pg_rho_threshold=1.0, @@ -197,6 +201,7 @@ def multi_from_logits(behaviour_policy_logits, bootstrap_value: A float32 of shape [B] with the value function estimate at time T. dist_class: action distribution class for the logits. + model: backing ModelV2 instance behaviour_action_prob: precalculated values of the behaviour actions clip_rho_threshold: A scalar float32 tensor with the clipping threshold for importance weights (rho) when calculating the baseline targets (vs). @@ -237,11 +242,14 @@ def multi_from_logits(behaviour_policy_logits, discounts, rewards, values, bootstrap_value ]): target_action_log_probs = multi_log_probs_from_logits_and_actions( - target_policy_logits, actions, dist_class) + target_policy_logits, actions, dist_class, model) if len(behaviour_policy_logits) > 1 or behaviour_action_prob is None: + # can't use precalculated values, recompute them. 
Note that + # recomputing won't work well for autoregressive action dists + # which may have variables not captured by 'logits' behaviour_action_log_probs = ( multi_log_probs_from_logits_and_actions( - behaviour_policy_logits, actions, dist_class)) + behaviour_policy_logits, actions, dist_class, model)) else: behaviour_action_log_probs = tf.log(behaviour_action_prob) diff --git a/python/ray/rllib/agents/impala/vtrace_policy.py b/python/ray/rllib/agents/impala/vtrace_policy.py index 0f51c3f426f2d..514e66add8255 100644 --- a/python/ray/rllib/agents/impala/vtrace_policy.py +++ b/python/ray/rllib/agents/impala/vtrace_policy.py @@ -41,6 +41,7 @@ def __init__(self, values, bootstrap_value, dist_class, + model, valid_mask, vf_loss_coeff=0.5, entropy_coeff=0.01, @@ -88,6 +89,7 @@ def __init__(self, values=values, bootstrap_value=bootstrap_value, dist_class=dist_class, + model=model, clip_rho_threshold=tf.cast(clip_rho_threshold, tf.float32), clip_pg_rho_threshold=tf.cast(clip_pg_rho_threshold, tf.float32)) @@ -201,6 +203,7 @@ def make_time_major(*args, **kw): values=make_time_major(values, drop_last=True), bootstrap_value=make_time_major(values)[-1], dist_class=Categorical if is_multidiscrete else policy.dist_class, + model=policy.model, valid_mask=make_time_major(mask, drop_last=True), vf_loss_coeff=policy.config["vf_loss_coeff"], entropy_coeff=policy.entropy_coeff, diff --git a/python/ray/rllib/agents/marwil/marwil_policy.py b/python/ray/rllib/agents/marwil/marwil_policy.py index 47ff12ebdaca7..51208d24c4b60 100644 --- a/python/ray/rllib/agents/marwil/marwil_policy.py +++ b/python/ray/rllib/agents/marwil/marwil_policy.py @@ -29,7 +29,7 @@ def __init__(self, state_values, cumulative_rewards): class ReweightedImitationLoss(object): def __init__(self, state_values, cumulative_rewards, logits, actions, - action_space, beta): + action_space, beta, model): ma_adv_norm = tf.get_variable( name="moving_average_of_advantage_norm", dtype=tf.float32, @@ -48,8 +48,8 @@ def __init__(self, state_values, cumulative_rewards, logits, actions, beta * tf.divide(adv, 1e-8 + tf.sqrt(ma_adv_norm))) # log\pi_\theta(a|s) - dist_cls, _ = ModelCatalog.get_action_dist(action_space, {}) - action_dist = dist_cls(logits) + dist_class, _ = ModelCatalog.get_action_dist(action_space, {}) + action_dist = dist_class(logits, model) logprobs = action_dist.logp(actions) self.loss = -1.0 * tf.reduce_mean( @@ -84,7 +84,7 @@ def __init__(self, observation_space, action_space, config): config = dict(ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG, **config) self.config = config - dist_cls, logit_dim = ModelCatalog.get_action_dist( + dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) # Action inputs @@ -106,7 +106,7 @@ def __init__(self, observation_space, action_space, config): self.p_func_vars = scope_vars(scope.name) # Action outputs - action_dist = dist_cls(logits) + action_dist = dist_class(logits, self.model) self.output_actions = action_dist.sample() # Training inputs @@ -164,7 +164,8 @@ def _build_value_loss(self, state_values, cum_rwds): def _build_policy_loss(self, state_values, cum_rwds, logits, actions, action_space): return ReweightedImitationLoss(state_values, cum_rwds, logits, actions, - action_space, self.config["beta"]) + action_space, self.config["beta"], + self.model) @override(TFPolicy) def extra_compute_grad_fetches(self): diff --git a/python/ray/rllib/agents/pg/torch_pg_policy.py b/python/ray/rllib/agents/pg/torch_pg_policy.py index 442c57f48bfed..1e1fca7c40579 100644 --- 
a/python/ray/rllib/agents/pg/torch_pg_policy.py +++ b/python/ray/rllib/agents/pg/torch_pg_policy.py @@ -13,7 +13,7 @@ def pg_torch_loss(policy, batch_tensors): logits, _ = policy.model({ SampleBatch.CUR_OBS: batch_tensors[SampleBatch.CUR_OBS] }) - action_dist = policy.dist_class(logits) + action_dist = policy.dist_class(logits, policy.model) log_probs = action_dist.logp(batch_tensors[SampleBatch.ACTIONS]) # save the error in the policy object policy.pi_err = -batch_tensors[Postprocessing.ADVANTAGES].dot( diff --git a/python/ray/rllib/agents/ppo/appo_policy.py b/python/ray/rllib/agents/ppo/appo_policy.py index 98b959b16254a..001f1b15167e6 100644 --- a/python/ray/rllib/agents/ppo/appo_policy.py +++ b/python/ray/rllib/agents/ppo/appo_policy.py @@ -90,6 +90,7 @@ def __init__(self, values, bootstrap_value, dist_class, + model, valid_mask, vf_loss_coeff=0.5, entropy_coeff=0.01, @@ -116,6 +117,7 @@ def __init__(self, values: A float32 tensor of shape [T, B]. bootstrap_value: A float32 tensor of shape [B]. dist_class: action distribution class for logits. + model: backing ModelV2 instance valid_mask: A bool tensor of valid RNN input elements (#2992). """ @@ -130,6 +132,7 @@ def __init__(self, values=values, bootstrap_value=bootstrap_value, dist_class=dist_class, + model=model, clip_rho_threshold=tf.cast(clip_rho_threshold, tf.float32), clip_pg_rho_threshold=tf.cast(clip_pg_rho_threshold, tf.float32)) @@ -182,7 +185,7 @@ def make_time_major(*args, **kw): behaviour_logits, output_hidden_shape, axis=1) unpacked_outputs = tf.split(policy.model_out, output_hidden_shape, axis=1) action_dist = policy.action_dist - prev_action_dist = policy.dist_class(behaviour_logits) + prev_action_dist = policy.dist_class(behaviour_logits, policy.model) values = policy.value_function if policy.state_in: @@ -217,6 +220,7 @@ def make_time_major(*args, **kw): values=make_time_major(values, drop_last=True), bootstrap_value=make_time_major(values)[-1], dist_class=Categorical if is_multidiscrete else policy.dist_class, + model=policy.model, valid_mask=make_time_major(mask, drop_last=True), vf_loss_coeff=policy.config["vf_loss_coeff"], entropy_coeff=policy.entropy_coeff, diff --git a/python/ray/rllib/agents/ppo/ppo_policy.py b/python/ray/rllib/agents/ppo/ppo_policy.py index c7e0b04a5a89d..f72e9dfd321c2 100644 --- a/python/ray/rllib/agents/ppo/ppo_policy.py +++ b/python/ray/rllib/agents/ppo/ppo_policy.py @@ -25,12 +25,13 @@ class PPOLoss(object): def __init__(self, action_space, - dist_cls, + dist_class, + model, value_targets, advantages, actions, prev_logits, - prev_actions_logp, + prev_actions_prob, vf_preds, curr_action_dist, value_fn, @@ -45,7 +46,7 @@ def __init__(self, Arguments: action_space: Environment observation space specification. - dist_cls: action distribution class for logits. + dist_class: action distribution class for logits. value_targets (Placeholder): Placeholder for target values; used for GAE. actions (Placeholder): Placeholder for actions taken @@ -54,7 +55,7 @@ def __init__(self, from previous model evaluation. prev_logits (Placeholder): Placeholder for logits output from previous model evaluation. - prev_actions_logp (Placeholder): Placeholder for logp output from + prev_actions_prob (Placeholder): Placeholder for prob output from previous model evaluation. vf_preds (Placeholder): Placeholder for value function output from previous model evaluation. 
@@ -74,9 +75,10 @@ def __init__(self, def reduce_mean_valid(t): return tf.reduce_mean(tf.boolean_mask(t, valid_mask)) - prev_dist = dist_cls(prev_logits) + prev_dist = dist_class(prev_logits, model) # Make loss functions. - logp_ratio = tf.exp(curr_action_dist.logp(actions) - prev_actions_logp) + logp_ratio = tf.exp( + curr_action_dist.logp(actions) - tf.log(prev_actions_prob)) action_kl = prev_dist.kl(curr_action_dist) self.mean_kl = reduce_mean_valid(action_kl) @@ -119,6 +121,7 @@ def ppo_surrogate_loss(policy, batch_tensors): policy.loss_obj = PPOLoss( policy.action_space, policy.dist_class, + policy.model, batch_tensors[Postprocessing.VALUE_TARGETS], batch_tensors[Postprocessing.ADVANTAGES], batch_tensors[SampleBatch.ACTIONS], diff --git a/python/ray/rllib/agents/ppo/test/test.py b/python/ray/rllib/agents/ppo/test/test.py index 978fe7c696c90..b0f3e0cc769c0 100644 --- a/python/ray/rllib/agents/ppo/test/test.py +++ b/python/ray/rllib/agents/ppo/test/test.py @@ -20,7 +20,7 @@ def testCategorical(self): logits = tf.placeholder(tf.float32, shape=(None, 10)) z = 8 * (np.random.rand(10) - 0.5) data = np.tile(z, (num_samples, 1)) - c = Categorical(logits) + c = Categorical(logits, None) sample_op = c.sample() sess = tf.Session() sess.run(tf.global_variables_initializer()) diff --git a/python/ray/rllib/agents/qmix/qmix_policy.py b/python/ray/rllib/agents/qmix/qmix_policy.py index 1b5c75c5d7989..0bf8347515772 100644 --- a/python/ray/rllib/agents/qmix/qmix_policy.py +++ b/python/ray/rllib/agents/qmix/qmix_policy.py @@ -246,7 +246,7 @@ def compute_actions(self, # epsilon-greedy action selector random_numbers = th.rand_like(q_values[:, :, 0]) pick_random = (random_numbers < self.cur_epsilon).long() - random_actions = Categorical(avail).sample().long() + random_actions = Categorical(avail, None).sample().long() actions = (pick_random * random_actions + (1 - pick_random) * masked_q_values.max(dim=2)[1]) actions = actions.numpy() diff --git a/python/ray/rllib/examples/autoregressive_action_dist.py b/python/ray/rllib/examples/autoregressive_action_dist.py index 4567dd561d474..19655ca0d4500 100644 --- a/python/ray/rllib/examples/autoregressive_action_dist.py +++ b/python/ray/rllib/examples/autoregressive_action_dist.py @@ -25,12 +25,9 @@ parser.add_argument("--stop", type=int, default=200) -class EmitCorrelatedActionsEnv(gym.Env): +class CorrelatedActionsEnv(gym.Env): """Simple env in which the policy has to emit a tuple of equal actions. - However, the policy is penalized for emitting the exact same action each - time, so it has to learn P(a1) ~ random, then a2 = a1. - The best score would be ~200 reward.""" def __init__(self, _): @@ -56,55 +53,48 @@ def step(self, action): return self.last, reward, done, {} -def make_binary_autoregressive_output(action_model): - """Returns an autoregressive ActionDistribution class for two outputs. - - Arguments: - action_model: Keras model that takes [context, a1_sample] and returns - logits for a1 and a2. 
- """ +class BinaryAutoregressiveOutput(ActionDistribution): + """An autoregressive ActionDistribution class for two outputs.""" - class AutoregressiveOutput(ActionDistribution): - def sample(self): - a1_dist = self._a1_distribution() - a1 = a1_dist.sample() - a2_dist = self._a2_distribution(a1) - a2 = a2_dist.sample() - self._action_prob = a1_dist.logp(a1) + a2_dist.logp(a2) - return TupleActions([a1, a2]) + def sample(self): + a1_dist = self._a1_distribution() + a1 = a1_dist.sample() + a2_dist = self._a2_distribution(a1) + a2 = a2_dist.sample() + self._action_prob = a1_dist.logp(a1) + a2_dist.logp(a2) + return TupleActions([a1, a2]) - def sampled_action_prob(self): - return self._action_prob + def sampled_action_prob(self): + return self._action_prob - def logp(self, actions): - a1, a2 = actions[:, 0], actions[:, 1] - a1_vec = tf.expand_dims(tf.cast(a1, tf.float32), 1) - a1_logits, a2_logits = action_model([self.inputs, a1_vec]) - return (Categorical(a1_logits).logp(a1) + - Categorical(a2_logits).logp(a2)) + def logp(self, actions): + a1, a2 = actions[:, 0], actions[:, 1] + a1_vec = tf.expand_dims(tf.cast(a1, tf.float32), 1) + a1_logits, a2_logits = self.model.action_model([self.inputs, a1_vec]) + return (Categorical(a1_logits, None).logp(a1) + Categorical( + a2_logits, None).logp(a2)) - def entropy(self): - a1_dist = self._a1_distribution() - a2_dist = self._a2_distribution(a1_dist.sample()) - return a1_dist.entropy() + a2_dist.entropy() + def entropy(self): + a1_dist = self._a1_distribution() + a2_dist = self._a2_distribution(a1_dist.sample()) + return a1_dist.entropy() + a2_dist.entropy() - def kl(self, other): - # TODO: implement this properly - return tf.zeros_like(self.entropy()) + def kl(self, other): + # TODO: implement this properly + return tf.zeros_like(self.entropy()) - def _a1_distribution(self): - BATCH = tf.shape(self.inputs)[0] - a1_logits, _ = action_model([self.inputs, tf.zeros((BATCH, 1))]) - a1_dist = Categorical(a1_logits) - return a1_dist + def _a1_distribution(self): + BATCH = tf.shape(self.inputs)[0] + a1_logits, _ = self.model.action_model( + [self.inputs, tf.zeros((BATCH, 1))]) + a1_dist = Categorical(a1_logits, None) + return a1_dist - def _a2_distribution(self, a1): - a1_vec = tf.expand_dims(tf.cast(a1, tf.float32), 1) - _, a2_logits = action_model([self.inputs, a1_vec]) - a2_dist = Categorical(a2_logits) - return a2_dist - - return AutoregressiveOutput + def _a2_distribution(self, a1): + a1_vec = tf.expand_dims(tf.cast(a1, tf.float32), 1) + _, a2_logits = self.model.action_model([self.inputs, a1_vec]) + a2_dist = Categorical(a2_logits, None) + return a2_dist class AutoregressiveActionsModel(TFModelV2): @@ -114,17 +104,19 @@ def __init__(self, obs_space, action_space, num_outputs, model_config, name): super(AutoregressiveActionsModel, self).__init__( obs_space, action_space, num_outputs, model_config, name) + if action_space != Tuple([Discrete(2), Discrete(2)]): + raise ValueError( + "This model only supports the [2, 2] action space") + # Inputs obs_input = tf.keras.layers.Input( shape=obs_space.shape, name="obs_input") a1_input = tf.keras.layers.Input(shape=(1, ), name="a1_input") - # TODO(ekl) the context should be allowed to have a different size from - # num_outputs. This currently doesn't work since RLlib checks the - # model output size is equal to num_outputs. 
ctx_input = tf.keras.layers.Input( shape=(num_outputs, ), name="ctx_input") - # Shared hidden layer + # Output of the model (normally 'logits', but for an autoregressive + # dist this is more like a context/feature layer encoding the obs) context = tf.keras.layers.Dense( num_outputs, name="hidden", @@ -179,7 +171,7 @@ def value_function(self): return tf.reshape(self._value_out, [-1]) def override_action_distribution(self): - return make_binary_autoregressive_output(self.action_model) + return BinaryAutoregressiveOutput if __name__ == "__main__": @@ -191,9 +183,7 @@ def override_action_distribution(self): args.run, stop={"episode_reward_mean": args.stop}, config={ - "env": EmitCorrelatedActionsEnv, - "train_batch_size": 10, - "sample_batch_size": 10, + "env": CorrelatedActionsEnv, "gamma": 0.5, "num_gpus": 0, "model": { diff --git a/python/ray/rllib/examples/custom_loss.py b/python/ray/rllib/examples/custom_loss.py index 16cc792724885..23bc900f09e10 100644 --- a/python/ray/rllib/examples/custom_loss.py +++ b/python/ray/rllib/examples/custom_loss.py @@ -67,7 +67,7 @@ def custom_loss(self, policy_loss, loss_inputs): print("FYI: You can also use these tensors: {}, ".format(loss_inputs)) # compute the IL loss - action_dist = Categorical(logits) + action_dist = Categorical(logits, None) self.policy_loss = policy_loss self.imitation_loss = tf.reduce_mean( -action_dist.logp(input_ops["actions"])) diff --git a/python/ray/rllib/examples/custom_torch_policy.py b/python/ray/rllib/examples/custom_torch_policy.py index e9b30876d7b42..4fdb3a064c381 100644 --- a/python/ray/rllib/examples/custom_torch_policy.py +++ b/python/ray/rllib/examples/custom_torch_policy.py @@ -18,7 +18,7 @@ def policy_gradient_loss(policy, batch_tensors): logits, _ = policy.model({ SampleBatch.CUR_OBS: batch_tensors[SampleBatch.CUR_OBS] }) - action_dist = policy.dist_class(logits) + action_dist = policy.dist_class(logits, policy.model) log_probs = action_dist.logp(batch_tensors[SampleBatch.ACTIONS]) return -batch_tensors[SampleBatch.REWARDS].dot(log_probs) diff --git a/python/ray/rllib/models/action_dist.py b/python/ray/rllib/models/action_dist.py index 87ec3ed971410..869e135750db3 100644 --- a/python/ray/rllib/models/action_dist.py +++ b/python/ray/rllib/models/action_dist.py @@ -9,13 +9,24 @@ class ActionDistribution(object): """The policy action distribution of an agent. - Args: - inputs (Tensor): The input vector to compute samples from. + Attributes: + inputs (Tensors): input vector to compute samples from. + model (ModelV2): reference to model producing the inputs. """ @DeveloperAPI - def __init__(self, inputs): + def __init__(self, inputs, model): + """Initialize the action dist. + + Arguments: + inputs (Tensors): input vector to compute samples from. + model (ModelV2): reference to model producing the inputs. This + is mainly useful if you want to use model variables to compute + action outputs (i.e., for auto-regressive action distributions, + see examples/autoregressive_action_dist.py). 
+ """ self.inputs = inputs + self.model = model @DeveloperAPI def sample(self): diff --git a/python/ray/rllib/models/catalog.py b/python/ray/rllib/models/catalog.py index 5c6b1cf67f547..8d7c1b0bbca8e 100644 --- a/python/ray/rllib/models/catalog.py +++ b/python/ray/rllib/models/catalog.py @@ -95,10 +95,10 @@ class ModelCatalog(object): >>> prep = ModelCatalog.get_preprocessor(env) >>> observation = prep.transform(raw_observation) - >>> dist_cls, dist_dim = ModelCatalog.get_action_dist( + >>> dist_class, dist_dim = ModelCatalog.get_action_dist( env.action_space, {}) >>> model = ModelCatalog.get_model(inputs, dist_dim, options) - >>> dist = dist_cls(model.outputs) + >>> dist = dist_class(model.outputs, model) >>> action = dist.sample() """ diff --git a/python/ray/rllib/models/modelv2.py b/python/ray/rllib/models/modelv2.py index 42691ff42a3e7..05d45e74306fa 100644 --- a/python/ray/rllib/models/modelv2.py +++ b/python/ray/rllib/models/modelv2.py @@ -168,4 +168,5 @@ def __call__(self, input_dict, state=None, seq_lens=None): return outputs, state def override_action_distribution(self): + """TODO(ekl) replace this with custom action distributions.""" return None diff --git a/python/ray/rllib/models/tf/tf_action_dist.py b/python/ray/rllib/models/tf/tf_action_dist.py index eb48871c218b8..1f69b448bd18a 100644 --- a/python/ray/rllib/models/tf/tf_action_dist.py +++ b/python/ray/rllib/models/tf/tf_action_dist.py @@ -17,8 +17,8 @@ class TFActionDistribution(ActionDistribution): """TF-specific extensions for building action distributions.""" @DeveloperAPI - def __init__(self, inputs): - super(TFActionDistribution, self).__init__(inputs) + def __init__(self, inputs, model): + super(TFActionDistribution, self).__init__(inputs, model) self.sample_op = self._build_sample_op() @DeveloperAPI @@ -80,9 +80,10 @@ def _build_sample_op(self): class MultiCategorical(TFActionDistribution): """Categorical distribution for discrete action spaces.""" - def __init__(self, inputs, input_lens): + def __init__(self, inputs, input_lens, model): + TFActionDistribution.__init__(self, inputs, model) self.cats = [ - Categorical(input_) + Categorical(input_, model) for input_ in tf.split(inputs, input_lens, axis=1) ] self.sample_op = self._build_sample_op() @@ -124,12 +125,12 @@ class DiagGaussian(TFActionDistribution): second half the gaussian standard deviations. """ - def __init__(self, inputs): + def __init__(self, inputs, model): mean, log_std = tf.split(inputs, 2, axis=1) self.mean = mean self.log_std = log_std self.std = tf.exp(log_std) - TFActionDistribution.__init__(self, inputs) + TFActionDistribution.__init__(self, inputs, model) @override(ActionDistribution) def logp(self, x): @@ -180,7 +181,9 @@ class MultiActionDistribution(TFActionDistribution): inputs (Tensor list): A list of tensors from which to compute samples. """ - def __init__(self, inputs, action_space, child_distributions, input_lens): + def __init__(self, inputs, model, action_space, child_distributions, + input_lens): + TFActionDistribution.__init__(self, inputs, model) self.input_lens = input_lens split_inputs = tf.split(inputs, self.input_lens, axis=1) child_list = [] @@ -241,7 +244,7 @@ class Dirichlet(TFActionDistribution): e.g. actions that represent resource allocation.""" - def __init__(self, inputs): + def __init__(self, inputs, model): """Input is a tensor of logits. The exponential of logits is used to parametrize the Dirichlet distribution as all parameters need to be positive. 
An arbitrary small epsilon is added to the concentration @@ -256,7 +259,7 @@ def __init__(self, inputs): validate_args=True, allow_nan_stats=False, ) - TFActionDistribution.__init__(self, concentration) + TFActionDistribution.__init__(self, concentration, model) @override(ActionDistribution) def logp(self, x): diff --git a/python/ray/rllib/models/torch/torch_action_dist.py b/python/ray/rllib/models/torch/torch_action_dist.py index b8becc9a30636..615fdcc5783f1 100644 --- a/python/ray/rllib/models/torch/torch_action_dist.py +++ b/python/ray/rllib/models/torch/torch_action_dist.py @@ -35,7 +35,7 @@ class TorchCategorical(TorchDistributionWrapper): """Wrapper class for PyTorch Categorical distribution.""" @override(ActionDistribution) - def __init__(self, inputs): + def __init__(self, inputs, model): self.dist = torch.distributions.categorical.Categorical(logits=inputs) @@ -43,7 +43,7 @@ class TorchDiagGaussian(TorchDistributionWrapper): """Wrapper class for PyTorch Normal distribution.""" @override(ActionDistribution) - def __init__(self, inputs): + def __init__(self, inputs, model): mean, log_std = torch.chunk(inputs, 2, dim=1) self.dist = torch.distributions.normal.Normal(mean, torch.exp(log_std)) diff --git a/python/ray/rllib/policy/dynamic_tf_policy.py b/python/ray/rllib/policy/dynamic_tf_policy.py index 5c91422055055..3322907497926 100644 --- a/python/ray/rllib/policy/dynamic_tf_policy.py +++ b/python/ray/rllib/policy/dynamic_tf_policy.py @@ -173,7 +173,7 @@ def __init__(self, self, self.model, self.input_dict, obs_space, action_space, config) else: - self.action_dist = self.dist_class(self.model_out) + self.action_dist = self.dist_class(self.model_out, self.model) action_sampler = self.action_dist.sample() action_prob = self.action_dist.sampled_action_prob() diff --git a/python/ray/rllib/policy/torch_policy.py b/python/ray/rllib/policy/torch_policy.py index 16eeb6a0a5825..aff405e332c5c 100644 --- a/python/ray/rllib/policy/torch_policy.py +++ b/python/ray/rllib/policy/torch_policy.py @@ -30,7 +30,7 @@ class TorchPolicy(Policy): """ def __init__(self, observation_space, action_space, model, loss, - action_distribution_cls): + action_distribution_class): """Build a policy from policy and loss torch modules. Note that model will be placed on GPU device if CUDA_VISIBLE_DEVICES @@ -44,7 +44,7 @@ def __init__(self, observation_space, action_space, model, loss, first item is action logits, and the rest can be any value. loss (func): Function that takes (policy, batch_tensors) and returns a single scalar loss. - action_distribution_cls (ActionDistribution): Class for action + action_distribution_class (ActionDistribution): Class for action distribution. 
""" self.observation_space = observation_space @@ -56,7 +56,7 @@ def __init__(self, observation_space, action_space, model, loss, self._model = model.to(self.device) self._loss = loss self._optimizer = self.optimizer() - self._action_dist_cls = action_distribution_cls + self._action_dist_class = action_distribution_class @override(Policy) def compute_actions(self, @@ -78,7 +78,7 @@ def compute_actions(self, input_dict["prev_rewards"] = prev_reward_batch model_out = self._model(input_dict, state_batches, [1]) logits, state = model_out - action_dist = self._action_dist_cls(logits) + action_dist = self._action_dist_class(logits, self._model) actions = action_dist.sample() return (actions.cpu().numpy(), [h.cpu().numpy() for h in state], From dee364fd1c6575a944240a532439246db9d60076 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sun, 28 Jul 2019 21:45:09 -0700 Subject: [PATCH 03/33] fix --- ci/jenkins_tests/run_rllib_tests.sh | 3 +++ python/ray/rllib/examples/autoregressive_action_dist.py | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/ci/jenkins_tests/run_rllib_tests.sh b/ci/jenkins_tests/run_rllib_tests.sh index 7de03a6417858..9a6aef2461b56 100644 --- a/ci/jenkins_tests/run_rllib_tests.sh +++ b/ci/jenkins_tests/run_rllib_tests.sh @@ -437,6 +437,9 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/ci/suppress_output python /ray/python/ray/rllib/examples/twostep_game.py --stop=2000 --run=APEX_QMIX +docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ + /ray/ci/suppress_output python /ray/python/ray/rllib/examples/autoregressive_action_dist.py --stop=150 + docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/ci/suppress_output /ray/python/ray/rllib/train.py \ --env PongDeterministic-v4 \ diff --git a/python/ray/rllib/examples/autoregressive_action_dist.py b/python/ray/rllib/examples/autoregressive_action_dist.py index 19655ca0d4500..b777a16e37719 100644 --- a/python/ray/rllib/examples/autoregressive_action_dist.py +++ b/python/ray/rllib/examples/autoregressive_action_dist.py @@ -65,7 +65,7 @@ def sample(self): return TupleActions([a1, a2]) def sampled_action_prob(self): - return self._action_prob + return tf.exp(self._action_prob) def logp(self, actions): a1, a2 = actions[:, 0], actions[:, 1] @@ -171,6 +171,7 @@ def value_function(self): return tf.reshape(self._value_out, [-1]) def override_action_distribution(self): + # TODO(ekl) remove this once we have custom action dists return BinaryAutoregressiveOutput From e2d4fcc7976671d3aa2056b496534ba3c209893d Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sun, 28 Jul 2019 21:50:49 -0700 Subject: [PATCH 04/33] doc --- .../ray/rllib/examples/autoregressive_action_dist.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/python/ray/rllib/examples/autoregressive_action_dist.py b/python/ray/rllib/examples/autoregressive_action_dist.py index b777a16e37719..dc908e618c699 100644 --- a/python/ray/rllib/examples/autoregressive_action_dist.py +++ b/python/ray/rllib/examples/autoregressive_action_dist.py @@ -1,4 +1,14 @@ -"""Example of specifying an autoregressive action distribution.""" +"""Example of specifying an autoregressive action distribution. + +In an action space with multiple components (e.g., Tuple(a1, a2)), you might +want a2 to be sampled based on the sampled value of a1, i.e., +a2_sampled ~ P(a2 | a1_sampled, obs). 
Normally, a1 and a2 would be sampled +independently. + +To do this, you need both a custom model that implements the autoregressive +pattern, and a custom action distribution class that leverages that model. +This examples shows both. +""" from __future__ import absolute_import from __future__ import division From 3a51e24311d6077d2f489b9198a743579e09c6e4 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sun, 28 Jul 2019 22:09:11 -0700 Subject: [PATCH 05/33] doc --- doc/source/rllib-components.svg | 5 +- doc/source/rllib-models.rst | 113 +++++++++++++++--- doc/source/rllib.rst | 1 + .../examples/autoregressive_action_dist.py | 17 ++- 4 files changed, 109 insertions(+), 27 deletions(-) diff --git a/doc/source/rllib-components.svg b/doc/source/rllib-components.svg index dac6268736d93..b9f7bbb115498 100644 --- a/doc/source/rllib-components.svg +++ b/doc/source/rllib-components.svg @@ -1,4 +1 @@ - - - - + \ No newline at end of file diff --git a/doc/source/rllib-models.rst b/doc/source/rllib-models.rst index 295136842177b..f2af2e67492c5 100644 --- a/doc/source/rllib-models.rst +++ b/doc/source/rllib-models.rst @@ -233,24 +233,103 @@ Custom models can be used to work with environments where (1) the set of valid a Depending on your use case it may make sense to use just the masking, just action embeddings, or both. For a runnable example of this in code, check out `parametric_action_cartpole.py `__. Note that since masking introduces ``tf.float32.min`` values into the model output, this technique might not work with all algorithm options. For example, algorithms might crash if they incorrectly process the ``tf.float32.min`` values. The cartpole example has working configurations for DQN (must set ``hiddens=[]``), PPO (must disable running mean and set ``vf_share_layers=True``), and several other algorithms. -Model-Based Rollouts -~~~~~~~~~~~~~~~~~~~~ +Autoregressive Action Distributions +----------------------------------- -With a custom policy, you can also perform model-based rollouts and optionally incorporate the results of those rollouts as training data. For example, suppose you wanted to extend PGPolicy for model-based rollouts. This involves overriding the ``compute_actions`` method of that policy: +In an action space with multiple components (e.g., ``Tuple(a1, a2)``), you might want ``a2`` to be conditioned on the sampled value of ``a1``, i.e., ``a2_sampled ~ P(a2 | a1_sampled, obs)``. Normally, ``a1`` and ``a2`` would be sampled independently, reducing the expressivity of the policy. -.. code-block:: python - - class ModelBasedPolicy(PGPolicy): - def compute_actions(self, - obs_batch, - state_batches, - prev_action_batch=None, - prev_reward_batch=None, - episodes=None): - # compute a batch of actions based on the current obs_batch - # and state of each episode (i.e., for multiagent). You can do - # whatever is needed here, e.g., MCTS rollouts. - return action_batch +To do this, you need both a custom model that implements the autoregressive pattern, and a custom action distribution class that leverages that model. The `autoregressive_action_dist.py `__ example shows how this can be implemented for a simple binary action space. For a more complex space, a more efficient architecture such as a `MADE `__ is recommended. Note that sampling a `N-part` action requires `N` forward passes through the model, however computing the log probability of an action can be done in one pass: +.. 
code-block:: python -If you want take this rollouts data and append it to the sample batch, use the ``add_extra_batch()`` method of the `episode objects `__ passed in. For an example of this, see the ``testReturningModelBasedRolloutsData`` `unit test `__. + class BinaryAutoregressiveOutput(ActionDistribution): + """Action distribution P(a1, a2) = P(a1) * P(a2 | a1)""" + + def sample(self): + # first, sample a1 + a1_dist = self._a1_distribution() + a1 = a1_dist.sample() + + # sample a2 conditioned on a1 + a2_dist = self._a2_distribution(a1) + a2 = a2_dist.sample() + self._action_prob = a1_dist.logp(a1) + a2_dist.logp(a2) + + # return the action tuple + return TupleActions([a1, a2]) + + def logp(self, actions): + a1, a2 = actions[:, 0], actions[:, 1] + a1_vec = tf.expand_dims(tf.cast(a1, tf.float32), 1) + a1_logits, a2_logits = self.model.action_model([self.inputs, a1_vec]) + return (Categorical(a1_logits, None).logp(a1) + Categorical( + a2_logits, None).logp(a2)) + + def _a1_distribution(self): + BATCH = tf.shape(self.inputs)[0] + a1_logits, _ = self.model.action_model( + [self.inputs, tf.zeros((BATCH, 1))]) + a1_dist = Categorical(a1_logits, None) + return a1_dist + + def _a2_distribution(self, a1): + a1_vec = tf.expand_dims(tf.cast(a1, tf.float32), 1) + _, a2_logits = self.model.action_model([self.inputs, a1_vec]) + a2_dist = Categorical(a2_logits, None) + return a2_dist + + class AutoregressiveActionsModel(TFModelV2): + """Implements the `.action_model` branch required above.""" + + def __init__(self, obs_space, action_space, num_outputs, model_config, + name): + super(AutoregressiveActionsModel, self).__init__( + obs_space, action_space, num_outputs, model_config, name) + if action_space != Tuple([Discrete(2), Discrete(2)]): + raise ValueError( + "This model only supports the [2, 2] action space") + + # Inputs + obs_input = tf.keras.layers.Input( + shape=obs_space.shape, name="obs_input") + a1_input = tf.keras.layers.Input(shape=(1, ), name="a1_input") + ctx_input = tf.keras.layers.Input( + shape=(num_outputs, ), name="ctx_input") + + # Output of the model (normally 'logits', but for an autoregressive + # dist this is more like a context/feature layer encoding the obs) + context = tf.keras.layers.Dense( + num_outputs, + name="hidden", + activation=tf.nn.tanh, + kernel_initializer=normc_initializer(1.0))(obs_input) + + # P(a1) + a1_logits = tf.keras.layers.Dense( + 2, + name="a1_logits", + activation=None, + kernel_initializer=normc_initializer(0.01))(ctx_input) + + # P(a2 | a1) + a2_hidden = tf.keras.layers.Dense( + 16, + name="a2_hidden", + activation=tf.nn.tanh, + kernel_initializer=normc_initializer(1.0))(a1_input) + a2_logits = tf.keras.layers.Dense( + 2, + name="a2_logits", + activation=None, + kernel_initializer=normc_initializer(0.01))(a2_hidden) + + # Base layers + self.base_model = tf.keras.Model(obs_input, context) + self.register_variables(self.base_model.variables) + self.base_model.summary() + + # Autoregressive action sampler + self.action_model = tf.keras.Model([ctx_input, a1_input], + [a1_logits, a2_logits]) + self.action_model.summary() + self.register_variables(self.action_model.variables) diff --git a/doc/source/rllib.rst b/doc/source/rllib.rst index df2d06dff94d9..cd04b226f4edb 100644 --- a/doc/source/rllib.rst +++ b/doc/source/rllib.rst @@ -49,6 +49,7 @@ Models and Preprocessors * `Custom Preprocessors `__ * `Supervised Model Losses `__ * `Variable-length / Parametric Action Spaces `__ +* `Autoregressive Action Distributions `__ Algorithms ---------- diff --git 
a/python/ray/rllib/examples/autoregressive_action_dist.py b/python/ray/rllib/examples/autoregressive_action_dist.py index dc908e618c699..283f0de9f8d48 100644 --- a/python/ray/rllib/examples/autoregressive_action_dist.py +++ b/python/ray/rllib/examples/autoregressive_action_dist.py @@ -31,7 +31,7 @@ tf = try_import_tf() parser = argparse.ArgumentParser() -parser.add_argument("--run", type=str, default="PPO") +parser.add_argument("--run", type=str, default="PPO") # try PG, PPO, IMPALA parser.add_argument("--stop", type=int, default=200) @@ -64,18 +64,20 @@ def step(self, action): class BinaryAutoregressiveOutput(ActionDistribution): - """An autoregressive ActionDistribution class for two outputs.""" + """Action distribution P(a1, a2) = P(a1) * P(a2 | a1)""" def sample(self): + # first, sample a1 a1_dist = self._a1_distribution() a1 = a1_dist.sample() + + # sample a2 conditioned on a1 a2_dist = self._a2_distribution(a1) a2 = a2_dist.sample() self._action_prob = a1_dist.logp(a1) + a2_dist.logp(a2) - return TupleActions([a1, a2]) - def sampled_action_prob(self): - return tf.exp(self._action_prob) + # return the action tuple + return TupleActions([a1, a2]) def logp(self, actions): a1, a2 = actions[:, 0], actions[:, 1] @@ -84,6 +86,9 @@ def logp(self, actions): return (Categorical(a1_logits, None).logp(a1) + Categorical( a2_logits, None).logp(a2)) + def sampled_action_prob(self): + return tf.exp(self._action_prob) + def entropy(self): a1_dist = self._a1_distribution() a2_dist = self._a2_distribution(a1_dist.sample()) @@ -108,7 +113,7 @@ def _a2_distribution(self, a1): class AutoregressiveActionsModel(TFModelV2): - """Custom autoregressive model for policy gradient algorithms.""" + """Implements the `.action_model` branch required above.""" def __init__(self, obs_space, action_space, num_outputs, model_config, name): From 81c731fa6cd27e5c8973bf3002fe1b50c3de9f32 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sun, 28 Jul 2019 23:41:59 -0700 Subject: [PATCH 06/33] Update dqn_policy.py --- python/ray/rllib/agents/dqn/dqn_policy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/rllib/agents/dqn/dqn_policy.py b/python/ray/rllib/agents/dqn/dqn_policy.py index 700c9085ac1ec..a017797cec154 100644 --- a/python/ray/rllib/agents/dqn/dqn_policy.py +++ b/python/ray/rllib/agents/dqn/dqn_policy.py @@ -109,7 +109,7 @@ class QValuePolicy(object): def __init__(self, q_values, observations, num_actions, stochastic, eps, softmax, softmax_temp): if softmax: - action_dist = Categorical(q_values / softmax_temp) + action_dist = Categorical(q_values / softmax_temp, None) self.action = action_dist.sample() self.action_prob = action_dist.sampled_action_prob() return From a9e5e146bfdbaf1c4b4e9a9aa63a41d9909d165d Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sun, 28 Jul 2019 23:45:14 -0700 Subject: [PATCH 07/33] none --- python/ray/rllib/agents/ppo/test/test.py | 2 +- python/ray/rllib/agents/qmix/qmix_policy.py | 2 +- python/ray/rllib/examples/autoregressive_action_dist.py | 8 ++++---- python/ray/rllib/examples/custom_loss.py | 2 +- python/ray/rllib/models/tf/tf_action_dist.py | 4 ++++ 5 files changed, 11 insertions(+), 7 deletions(-) diff --git a/python/ray/rllib/agents/ppo/test/test.py b/python/ray/rllib/agents/ppo/test/test.py index b0f3e0cc769c0..978fe7c696c90 100644 --- a/python/ray/rllib/agents/ppo/test/test.py +++ b/python/ray/rllib/agents/ppo/test/test.py @@ -20,7 +20,7 @@ def testCategorical(self): logits = tf.placeholder(tf.float32, shape=(None, 10)) z = 8 * (np.random.rand(10) - 
0.5) data = np.tile(z, (num_samples, 1)) - c = Categorical(logits, None) + c = Categorical(logits) sample_op = c.sample() sess = tf.Session() sess.run(tf.global_variables_initializer()) diff --git a/python/ray/rllib/agents/qmix/qmix_policy.py b/python/ray/rllib/agents/qmix/qmix_policy.py index 0bf8347515772..1b5c75c5d7989 100644 --- a/python/ray/rllib/agents/qmix/qmix_policy.py +++ b/python/ray/rllib/agents/qmix/qmix_policy.py @@ -246,7 +246,7 @@ def compute_actions(self, # epsilon-greedy action selector random_numbers = th.rand_like(q_values[:, :, 0]) pick_random = (random_numbers < self.cur_epsilon).long() - random_actions = Categorical(avail, None).sample().long() + random_actions = Categorical(avail).sample().long() actions = (pick_random * random_actions + (1 - pick_random) * masked_q_values.max(dim=2)[1]) actions = actions.numpy() diff --git a/python/ray/rllib/examples/autoregressive_action_dist.py b/python/ray/rllib/examples/autoregressive_action_dist.py index 283f0de9f8d48..939e986211c84 100644 --- a/python/ray/rllib/examples/autoregressive_action_dist.py +++ b/python/ray/rllib/examples/autoregressive_action_dist.py @@ -83,8 +83,8 @@ def logp(self, actions): a1, a2 = actions[:, 0], actions[:, 1] a1_vec = tf.expand_dims(tf.cast(a1, tf.float32), 1) a1_logits, a2_logits = self.model.action_model([self.inputs, a1_vec]) - return (Categorical(a1_logits, None).logp(a1) + Categorical( - a2_logits, None).logp(a2)) + return (Categorical(a1_logits).logp(a1) + Categorical( + a2_logits).logp(a2)) def sampled_action_prob(self): return tf.exp(self._action_prob) @@ -102,13 +102,13 @@ def _a1_distribution(self): BATCH = tf.shape(self.inputs)[0] a1_logits, _ = self.model.action_model( [self.inputs, tf.zeros((BATCH, 1))]) - a1_dist = Categorical(a1_logits, None) + a1_dist = Categorical(a1_logits) return a1_dist def _a2_distribution(self, a1): a1_vec = tf.expand_dims(tf.cast(a1, tf.float32), 1) _, a2_logits = self.model.action_model([self.inputs, a1_vec]) - a2_dist = Categorical(a2_logits, None) + a2_dist = Categorical(a2_logits) return a2_dist diff --git a/python/ray/rllib/examples/custom_loss.py b/python/ray/rllib/examples/custom_loss.py index 23bc900f09e10..16cc792724885 100644 --- a/python/ray/rllib/examples/custom_loss.py +++ b/python/ray/rllib/examples/custom_loss.py @@ -67,7 +67,7 @@ def custom_loss(self, policy_loss, loss_inputs): print("FYI: You can also use these tensors: {}, ".format(loss_inputs)) # compute the IL loss - action_dist = Categorical(logits, None) + action_dist = Categorical(logits) self.policy_loss = policy_loss self.imitation_loss = tf.reduce_mean( -action_dist.logp(input_ops["actions"])) diff --git a/python/ray/rllib/models/tf/tf_action_dist.py b/python/ray/rllib/models/tf/tf_action_dist.py index 1f69b448bd18a..48d54ab8ab549 100644 --- a/python/ray/rllib/models/tf/tf_action_dist.py +++ b/python/ray/rllib/models/tf/tf_action_dist.py @@ -44,6 +44,10 @@ def sampled_action_prob(self): class Categorical(TFActionDistribution): """Categorical distribution for discrete action spaces.""" + @DeveloperAPI + def __init__(self, inputs, model=None): + super(Categorical, self).__init__(inputs, model) + @override(ActionDistribution) def logp(self, x): return -tf.nn.sparse_softmax_cross_entropy_with_logits( From 292d1ba795ad5cba1294784ce9f9c6cd87cf15fa Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sun, 28 Jul 2019 23:45:30 -0700 Subject: [PATCH 08/33] lint --- python/ray/rllib/examples/autoregressive_action_dist.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/python/ray/rllib/examples/autoregressive_action_dist.py b/python/ray/rllib/examples/autoregressive_action_dist.py index 939e986211c84..7bf8c5e9a5c83 100644 --- a/python/ray/rllib/examples/autoregressive_action_dist.py +++ b/python/ray/rllib/examples/autoregressive_action_dist.py @@ -83,8 +83,8 @@ def logp(self, actions): a1, a2 = actions[:, 0], actions[:, 1] a1_vec = tf.expand_dims(tf.cast(a1, tf.float32), 1) a1_logits, a2_logits = self.model.action_model([self.inputs, a1_vec]) - return (Categorical(a1_logits).logp(a1) + Categorical( - a2_logits).logp(a2)) + return ( + Categorical(a1_logits).logp(a1) + Categorical(a2_logits).logp(a2)) def sampled_action_prob(self): return tf.exp(self._action_prob) From 6e6059d01877fd04bd04361b404919688d7d02b4 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Mon, 29 Jul 2019 00:15:55 -0700 Subject: [PATCH 09/33] Update rllib-models.rst --- doc/source/rllib-models.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/rllib-models.rst b/doc/source/rllib-models.rst index f2af2e67492c5..4552153a5ef28 100644 --- a/doc/source/rllib-models.rst +++ b/doc/source/rllib-models.rst @@ -253,7 +253,6 @@ To do this, you need both a custom model that implements the autoregressive patt # sample a2 conditioned on a1 a2_dist = self._a2_distribution(a1) a2 = a2_dist.sample() - self._action_prob = a1_dist.logp(a1) + a2_dist.logp(a2) # return the action tuple return TupleActions([a1, a2]) From 368188e2b921185ec96fb1b1b0a85245e6d6943d Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Mon, 29 Jul 2019 00:52:55 -0700 Subject: [PATCH 10/33] docs update --- doc/source/rllib-models.rst | 8 ++++++-- .../rllib/examples/autoregressive_action_dist.py | 13 +++++++------ 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/doc/source/rllib-models.rst b/doc/source/rllib-models.rst index f2af2e67492c5..863107ec4902a 100644 --- a/doc/source/rllib-models.rst +++ b/doc/source/rllib-models.rst @@ -304,7 +304,7 @@ To do this, you need both a custom model that implements the autoregressive patt activation=tf.nn.tanh, kernel_initializer=normc_initializer(1.0))(obs_input) - # P(a1) + # P(a1 | obs) a1_logits = tf.keras.layers.Dense( 2, name="a1_logits", @@ -312,11 +312,15 @@ To do this, you need both a custom model that implements the autoregressive patt kernel_initializer=normc_initializer(0.01))(ctx_input) # P(a2 | a1) + # --note: typically you'd want to implement P(a2 | a1, obs) as follows: + # a2_context = tf.keras.layers.Concatenate(axis=1)( + # [ctx_input, a1_input]) + a2_context = a1_input a2_hidden = tf.keras.layers.Dense( 16, name="a2_hidden", activation=tf.nn.tanh, - kernel_initializer=normc_initializer(1.0))(a1_input) + kernel_initializer=normc_initializer(1.0))(a2_context) a2_logits = tf.keras.layers.Dense( 2, name="a2_logits", diff --git a/python/ray/rllib/examples/autoregressive_action_dist.py b/python/ray/rllib/examples/autoregressive_action_dist.py index 7bf8c5e9a5c83..74ebb05db5b59 100644 --- a/python/ray/rllib/examples/autoregressive_action_dist.py +++ b/python/ray/rllib/examples/autoregressive_action_dist.py @@ -145,22 +145,23 @@ def __init__(self, obs_space, action_space, num_outputs, model_config, activation=None, kernel_initializer=normc_initializer(0.01))(context) - # P(a1) + # P(a1 | obs) a1_logits = tf.keras.layers.Dense( 2, name="a1_logits", activation=None, kernel_initializer=normc_initializer(0.01))(ctx_input) - # P(a2 | a1) -- note this doesn't include obs for example purposes, - # which forces the model to learn a2 without knowing the obs. 
In - # practice you'll want to use a Concat layer here so that a2 can be - # conditioned on both the obs and a1. + # P(a2 | a1) + # --note: typically you'd want to implement P(a2 | a1, obs) as follows: + # a2_context = tf.keras.layers.Concatenate(axis=1)( + # [ctx_input, a1_input]) + a2_context = a1_input a2_hidden = tf.keras.layers.Dense( 16, name="a2_hidden", activation=tf.nn.tanh, - kernel_initializer=normc_initializer(1.0))(a1_input) + kernel_initializer=normc_initializer(1.0))(a2_context) a2_logits = tf.keras.layers.Dense( 2, name="a2_logits", From ca4cbbcc91684b695bb3627bdea74d1b8515f2b9 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Mon, 29 Jul 2019 02:16:39 -0700 Subject: [PATCH 11/33] doc update --- doc/source/rllib-env.rst | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/doc/source/rllib-env.rst b/doc/source/rllib-env.rst index b04b91c3c2659..81df70cc6ad41 100644 --- a/doc/source/rllib-env.rst +++ b/doc/source/rllib-env.rst @@ -7,24 +7,26 @@ RLlib works with several different types of environments, including `OpenAI Gym **Compatibility matrix**: -============= ======================= ================== =========== ================== -Algorithm Discrete Actions Continuous Actions Multi-Agent Recurrent Policies -============= ======================= ================== =========== ================== -A2C, A3C **Yes** `+parametric`_ **Yes** **Yes** **Yes** -PPO, APPO **Yes** `+parametric`_ **Yes** **Yes** **Yes** -PG **Yes** `+parametric`_ **Yes** **Yes** **Yes** -IMPALA **Yes** `+parametric`_ **Yes** **Yes** **Yes** -DQN, Rainbow **Yes** `+parametric`_ No **Yes** No -DDPG, TD3 No **Yes** **Yes** No -APEX-DQN **Yes** `+parametric`_ No **Yes** No -APEX-DDPG No **Yes** **Yes** No -ES **Yes** **Yes** No No -ARS **Yes** **Yes** No No -QMIX **Yes** No **Yes** **Yes** -MARWIL **Yes** `+parametric`_ **Yes** **Yes** **Yes** -============= ======================= ================== =========== ================== +============= ======================= ================== =========== =========================== +Algorithm Discrete Actions Continuous Multi-Agent Policy Support +============= ======================= ================== =========== =========================== +A2C, A3C **Yes** `+parametric`_ **Yes** **Yes** `+RNN`_, `+autoreg`_ +PPO, APPO **Yes** `+parametric`_ **Yes** **Yes** `+RNN`_, `+autoreg`_ +PG **Yes** `+parametric`_ **Yes** **Yes** `+RNN`_, `+autoreg`_ +IMPALA **Yes** `+parametric`_ **Yes** **Yes** `+RNN`_, `+autoreg`_ +DQN, Rainbow **Yes** `+parametric`_ No **Yes** +DDPG, TD3 No **Yes** **Yes** +APEX-DQN **Yes** `+parametric`_ No **Yes** +APEX-DDPG No **Yes** **Yes** +ES **Yes** **Yes** No +ARS **Yes** **Yes** No +QMIX **Yes** No **Yes** `+RNN`_ +MARWIL **Yes** `+parametric`_ **Yes** **Yes** `+RNN`_ +============= ======================= ================== =========== =========================== .. _`+parametric`: rllib-models.html#variable-length-parametric-action-spaces +.. _`+RNN`: rllib-models.html#recurrent-models +.. _`+autoreg`: rllib-models.html#autoregressive-action-distributions You can pass either a string name or a Python class to specify an environment. By default, strings will be interpreted as a gym `environment name `__. 
Custom env classes passed directly to the trainer must take a single ``env_config`` parameter in their constructor: From b469b47c67ef0e2aa3c31a4503196def6dbb7e70 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Mon, 29 Jul 2019 02:21:18 -0700 Subject: [PATCH 12/33] move matrix --- doc/source/rllib-env.rst | 23 ----------------------- doc/source/rllib-models.rst | 23 +++++++++++++++++++++++ doc/source/rllib.rst | 1 + 3 files changed, 24 insertions(+), 23 deletions(-) diff --git a/doc/source/rllib-env.rst b/doc/source/rllib-env.rst index 81df70cc6ad41..ba22ea906a71e 100644 --- a/doc/source/rllib-env.rst +++ b/doc/source/rllib-env.rst @@ -5,29 +5,6 @@ RLlib works with several different types of environments, including `OpenAI Gym .. image:: rllib-envs.svg -**Compatibility matrix**: - -============= ======================= ================== =========== =========================== -Algorithm Discrete Actions Continuous Multi-Agent Policy Support -============= ======================= ================== =========== =========================== -A2C, A3C **Yes** `+parametric`_ **Yes** **Yes** `+RNN`_, `+autoreg`_ -PPO, APPO **Yes** `+parametric`_ **Yes** **Yes** `+RNN`_, `+autoreg`_ -PG **Yes** `+parametric`_ **Yes** **Yes** `+RNN`_, `+autoreg`_ -IMPALA **Yes** `+parametric`_ **Yes** **Yes** `+RNN`_, `+autoreg`_ -DQN, Rainbow **Yes** `+parametric`_ No **Yes** -DDPG, TD3 No **Yes** **Yes** -APEX-DQN **Yes** `+parametric`_ No **Yes** -APEX-DDPG No **Yes** **Yes** -ES **Yes** **Yes** No -ARS **Yes** **Yes** No -QMIX **Yes** No **Yes** `+RNN`_ -MARWIL **Yes** `+parametric`_ **Yes** **Yes** `+RNN`_ -============= ======================= ================== =========== =========================== - -.. _`+parametric`: rllib-models.html#variable-length-parametric-action-spaces -.. _`+RNN`: rllib-models.html#recurrent-models -.. _`+autoreg`: rllib-models.html#autoregressive-action-distributions - You can pass either a string name or a Python class to specify an environment. By default, strings will be interpreted as a gym `environment name `__. Custom env classes passed directly to the trainer must take a single ``env_config`` parameter in their constructor: .. code-block:: python diff --git a/doc/source/rllib-models.rst b/doc/source/rllib-models.rst index 86877bcc9965c..642d1137f8f62 100644 --- a/doc/source/rllib-models.rst +++ b/doc/source/rllib-models.rst @@ -7,6 +7,29 @@ The following diagram provides a conceptual overview of data flow between differ The components highlighted in green can be replaced with custom user-defined implementations, as described in the next sections. The purple components are RLlib internal, which means they can only be modified by changing the algorithm source code. 
+Feature Compatibility Matrix +---------------------------- + +============= ======================= ================== =========== =========================== +Algorithm Discrete Actions Continuous Multi-Agent Policy Support +============= ======================= ================== =========== =========================== +A2C, A3C **Yes** `+parametric`_ **Yes** **Yes** `+RNN`_, `+autoreg`_ +PPO, APPO **Yes** `+parametric`_ **Yes** **Yes** `+RNN`_, `+autoreg`_ +PG **Yes** `+parametric`_ **Yes** **Yes** `+RNN`_, `+autoreg`_ +IMPALA **Yes** `+parametric`_ **Yes** **Yes** `+RNN`_, `+autoreg`_ +DQN, Rainbow **Yes** `+parametric`_ No **Yes** +DDPG, TD3 No **Yes** **Yes** +APEX-DQN **Yes** `+parametric`_ No **Yes** +APEX-DDPG No **Yes** **Yes** +ES **Yes** **Yes** No +ARS **Yes** **Yes** No +QMIX **Yes** No **Yes** `+RNN`_ +MARWIL **Yes** `+parametric`_ **Yes** **Yes** `+RNN`_ +============= ======================= ================== =========== =========================== + +.. _`+parametric`: rllib-models.html#variable-length-parametric-action-spaces +.. _`+RNN`: rllib-models.html#recurrent-models +.. _`+autoreg`: rllib-models.html#autoregressive-action-distributions Default Behaviours ------------------ diff --git a/doc/source/rllib.rst b/doc/source/rllib.rst index cd04b226f4edb..16670b8da2d45 100644 --- a/doc/source/rllib.rst +++ b/doc/source/rllib.rst @@ -44,6 +44,7 @@ Environments Models and Preprocessors ------------------------ * `RLlib Models and Preprocessors Overview `__ +* `Feature Compatibility Matrix `__ * `TensorFlow Models `__ * `PyTorch Models `__ * `Custom Preprocessors `__ From a4e3069e3285fa83c91bd39ece7a24da9d21531d Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Mon, 29 Jul 2019 02:22:13 -0700 Subject: [PATCH 13/33] model --- doc/source/rllib-models.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/rllib-models.rst b/doc/source/rllib-models.rst index 642d1137f8f62..e065a19055de0 100644 --- a/doc/source/rllib-models.rst +++ b/doc/source/rllib-models.rst @@ -11,7 +11,7 @@ Feature Compatibility Matrix ---------------------------- ============= ======================= ================== =========== =========================== -Algorithm Discrete Actions Continuous Multi-Agent Policy Support +Algorithm Discrete Actions Continuous Multi-Agent Model Support ============= ======================= ================== =========== =========================== A2C, A3C **Yes** `+parametric`_ **Yes** **Yes** `+RNN`_, `+autoreg`_ PPO, APPO **Yes** `+parametric`_ **Yes** **Yes** `+RNN`_, `+autoreg`_ From f5e5d0bb87ae33247a647934e24b8c5bdb793192 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Mon, 29 Jul 2019 02:23:26 -0700 Subject: [PATCH 14/33] env --- doc/source/rllib-env.rst | 24 ++++++++++++++++++++++++ doc/source/rllib-models.rst | 23 ----------------------- doc/source/rllib.rst | 2 +- 3 files changed, 25 insertions(+), 24 deletions(-) diff --git a/doc/source/rllib-env.rst b/doc/source/rllib-env.rst index ba22ea906a71e..d0fff81aba7e8 100644 --- a/doc/source/rllib-env.rst +++ b/doc/source/rllib-env.rst @@ -5,6 +5,30 @@ RLlib works with several different types of environments, including `OpenAI Gym .. 
image:: rllib-envs.svg +Feature Compatibility Matrix +---------------------------- + +============= ======================= ================== =========== =========================== +Algorithm Discrete Actions Continuous Multi-Agent Model Support +============= ======================= ================== =========== =========================== +A2C, A3C **Yes** `+parametric`_ **Yes** **Yes** `+RNN`_, `+autoreg`_ +PPO, APPO **Yes** `+parametric`_ **Yes** **Yes** `+RNN`_, `+autoreg`_ +PG **Yes** `+parametric`_ **Yes** **Yes** `+RNN`_, `+autoreg`_ +IMPALA **Yes** `+parametric`_ **Yes** **Yes** `+RNN`_, `+autoreg`_ +DQN, Rainbow **Yes** `+parametric`_ No **Yes** +DDPG, TD3 No **Yes** **Yes** +APEX-DQN **Yes** `+parametric`_ No **Yes** +APEX-DDPG No **Yes** **Yes** +ES **Yes** **Yes** No +ARS **Yes** **Yes** No +QMIX **Yes** No **Yes** `+RNN`_ +MARWIL **Yes** `+parametric`_ **Yes** **Yes** `+RNN`_ +============= ======================= ================== =========== =========================== + +.. _`+parametric`: rllib-models.html#variable-length-parametric-action-spaces +.. _`+RNN`: rllib-models.html#recurrent-models +.. _`+autoreg`: rllib-models.html#autoregressive-action-distributions + You can pass either a string name or a Python class to specify an environment. By default, strings will be interpreted as a gym `environment name `__. Custom env classes passed directly to the trainer must take a single ``env_config`` parameter in their constructor: .. code-block:: python diff --git a/doc/source/rllib-models.rst b/doc/source/rllib-models.rst index e065a19055de0..86877bcc9965c 100644 --- a/doc/source/rllib-models.rst +++ b/doc/source/rllib-models.rst @@ -7,29 +7,6 @@ The following diagram provides a conceptual overview of data flow between differ The components highlighted in green can be replaced with custom user-defined implementations, as described in the next sections. The purple components are RLlib internal, which means they can only be modified by changing the algorithm source code. -Feature Compatibility Matrix ----------------------------- - -============= ======================= ================== =========== =========================== -Algorithm Discrete Actions Continuous Multi-Agent Model Support -============= ======================= ================== =========== =========================== -A2C, A3C **Yes** `+parametric`_ **Yes** **Yes** `+RNN`_, `+autoreg`_ -PPO, APPO **Yes** `+parametric`_ **Yes** **Yes** `+RNN`_, `+autoreg`_ -PG **Yes** `+parametric`_ **Yes** **Yes** `+RNN`_, `+autoreg`_ -IMPALA **Yes** `+parametric`_ **Yes** **Yes** `+RNN`_, `+autoreg`_ -DQN, Rainbow **Yes** `+parametric`_ No **Yes** -DDPG, TD3 No **Yes** **Yes** -APEX-DQN **Yes** `+parametric`_ No **Yes** -APEX-DDPG No **Yes** **Yes** -ES **Yes** **Yes** No -ARS **Yes** **Yes** No -QMIX **Yes** No **Yes** `+RNN`_ -MARWIL **Yes** `+parametric`_ **Yes** **Yes** `+RNN`_ -============= ======================= ================== =========== =========================== - -.. _`+parametric`: rllib-models.html#variable-length-parametric-action-spaces -.. _`+RNN`: rllib-models.html#recurrent-models -.. 
_`+autoreg`: rllib-models.html#autoregressive-action-distributions Default Behaviours ------------------ diff --git a/doc/source/rllib.rst b/doc/source/rllib.rst index 16670b8da2d45..f5f9a45e9664e 100644 --- a/doc/source/rllib.rst +++ b/doc/source/rllib.rst @@ -35,6 +35,7 @@ Training APIs Environments ------------ * `RLlib Environments Overview `__ +* `Feature Compatibility Matrix `__ * `OpenAI Gym `__ * `Vectorized `__ * `Multi-Agent and Hierarchical `__ @@ -44,7 +45,6 @@ Environments Models and Preprocessors ------------------------ * `RLlib Models and Preprocessors Overview `__ -* `Feature Compatibility Matrix `__ * `TensorFlow Models `__ * `PyTorch Models `__ * `Custom Preprocessors `__ From 2c34ebce6c721900ba482f01fc3bb32e429787da Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Mon, 29 Jul 2019 02:24:56 -0700 Subject: [PATCH 15/33] update --- doc/source/rllib-env.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/rllib-env.rst b/doc/source/rllib-env.rst index d0fff81aba7e8..1c2fa97527811 100644 --- a/doc/source/rllib-env.rst +++ b/doc/source/rllib-env.rst @@ -29,6 +29,9 @@ MARWIL **Yes** `+parametric`_ **Yes** **Yes** `+RNN`_ .. _`+RNN`: rllib-models.html#recurrent-models .. _`+autoreg`: rllib-models.html#autoregressive-action-distributions +Configuring Environments +------------------------ + You can pass either a string name or a Python class to specify an environment. By default, strings will be interpreted as a gym `environment name `__. Custom env classes passed directly to the trainer must take a single ``env_config`` parameter in their constructor: .. code-block:: python @@ -71,9 +74,6 @@ For a full runnable code example using the custom environment API, see `custom_e The gym registry is not compatible with Ray. Instead, always use the registration flows documented above to ensure Ray workers can access the environment. -Configuring Environments ------------------------- - In the above example, note that the ``env_creator`` function takes in an ``env_config`` object. This is a dict containing options passed in through your trainer. You can also access ``env_config.worker_index`` and ``env_config.vector_index`` to get the worker id and env id within the worker (if ``num_envs_per_worker > 0``). This can be useful if you want to train over an ensemble of different environments, for example: .. 
code-block:: python From 67a2ae015e5b2aeb664f88cd623a9a2488442f78 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Mon, 5 Aug 2019 21:54:10 -0700 Subject: [PATCH 16/33] fix shuffle --- python/ray/rllib/optimizers/aso_multi_gpu_learner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/rllib/optimizers/aso_multi_gpu_learner.py b/python/ray/rllib/optimizers/aso_multi_gpu_learner.py index cbc8f61988c82..80109edfff934 100644 --- a/python/ray/rllib/optimizers/aso_multi_gpu_learner.py +++ b/python/ray/rllib/optimizers/aso_multi_gpu_learner.py @@ -165,7 +165,7 @@ def _step(self): opt = s.idle_optimizers.get() with self.load_timer: - tuples = s.policy._get_loss_inputs_dict(batch) + tuples = s.policy._get_loss_inputs_dict(batch, shuffle=False) data_keys = [ph for _, ph in s.policy._loss_inputs] if s.policy._state_inputs: state_keys = s.policy._state_inputs + [s.policy._seq_lens] From 1c7c0b329c26b10d9ab8f1d1cc80b86965d9718a Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Tue, 6 Aug 2019 00:10:52 -0700 Subject: [PATCH 17/33] remove keras --- rllib/keras_policy.py | 65 ------------------------------------------- 1 file changed, 65 deletions(-) delete mode 100644 rllib/keras_policy.py diff --git a/rllib/keras_policy.py b/rllib/keras_policy.py deleted file mode 100644 index 3008e133c1c6d..0000000000000 --- a/rllib/keras_policy.py +++ /dev/null @@ -1,65 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np - -from ray.rllib.policy.policy import Policy - - -def _sample(probs): - return [np.random.choice(len(pr), p=pr) for pr in probs] - - -class KerasPolicy(Policy): - """Initialize the Keras Policy. - - This is a Policy used for models with actor and critics. - Note: This class is built for specific usage of Actor-Critic models, - and is less general compared to TFPolicy and TorchPolicies. - - Args: - observation_space (gym.Space): Observation space of the policy. - action_space (gym.Space): Action space of the policy. - config (dict): Policy-specific configuration data. - actor (Model): A model that holds the policy. - critic (Model): A model that holds the value function. 
- """ - - def __init__(self, - observation_space, - action_space, - config, - actor=None, - critic=None): - Policy.__init__(self, observation_space, action_space, config) - self.actor = actor - self.critic = critic - self.models = [self.actor, self.critic] - - def compute_actions(self, obs, *args, **kwargs): - state = np.array(obs) - policy = self.actor.predict(state) - value = self.critic.predict(state) - return _sample(policy), [], {"vf_preds": value.flatten()} - - def learn_on_batch(self, batch, *args): - self.actor.fit( - batch["obs"], - batch["adv_targets"], - epochs=1, - verbose=0, - steps_per_epoch=20) - self.critic.fit( - batch["obs"], - batch["value_targets"], - epochs=1, - verbose=0, - steps_per_epoch=20) - return {} - - def get_weights(self): - return [model.get_weights() for model in self.models] - - def set_weights(self, weights): - return [model.set_weights(w) for model, w in zip(self.models, weights)] From 19b91da0a8c4544c88592e08d678720776846c59 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Tue, 6 Aug 2019 12:22:52 -0700 Subject: [PATCH 18/33] update docs --- doc/source/rllib-models.rst | 4 ++++ rllib/examples/autoregressive_action_dist.py | 11 +++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/doc/source/rllib-models.rst b/doc/source/rllib-models.rst index 57b5b452b0e92..0d9bde419e104 100644 --- a/doc/source/rllib-models.rst +++ b/doc/source/rllib-models.rst @@ -245,6 +245,10 @@ To do this, you need both a custom model that implements the autoregressive patt class BinaryAutoregressiveOutput(ActionDistribution): """Action distribution P(a1, a2) = P(a1) * P(a2 | a1)""" + @staticmethod + def required_model_output_shape(self, model_config): + return 16 # controls model output feature vector size + def sample(self): # first, sample a1 a1_dist = self._a1_distribution() diff --git a/rllib/examples/autoregressive_action_dist.py b/rllib/examples/autoregressive_action_dist.py index 74ebb05db5b59..b9cff2d12b936 100644 --- a/rllib/examples/autoregressive_action_dist.py +++ b/rllib/examples/autoregressive_action_dist.py @@ -66,6 +66,10 @@ def step(self, action): class BinaryAutoregressiveOutput(ActionDistribution): """Action distribution P(a1, a2) = P(a1) * P(a2 | a1)""" + @staticmethod + def required_model_output_shape(self, model_config): + return 16 # controls model output feature vector size + def sample(self): # first, sample a1 a1_dist = self._a1_distribution() @@ -186,16 +190,14 @@ def forward(self, input_dict, state, seq_lens): def value_function(self): return tf.reshape(self._value_out, [-1]) - def override_action_distribution(self): - # TODO(ekl) remove this once we have custom action dists - return BinaryAutoregressiveOutput - if __name__ == "__main__": ray.init() args = parser.parse_args() ModelCatalog.register_custom_model("autoregressive_model", AutoregressiveActionsModel) + ModelCatalog.register_custom_action_dist("binary_autoreg_output", + BinaryAutoregressiveOutput) tune.run( args.run, stop={"episode_reward_mean": args.stop}, @@ -205,5 +207,6 @@ def override_action_distribution(self): "num_gpus": 0, "model": { "custom_model": "autoregressive_model", + "custom_action_dist": "binary_autoreg_output", }, }) From 59a29f616388b8b5dd9e17d986cd6cf21277e13d Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Tue, 6 Aug 2019 12:34:03 -0700 Subject: [PATCH 19/33] docs --- doc/source/rllib-models.rst | 39 +++++++++++++++++++++++++++++++++++-- doc/source/rllib.rst | 7 ++++--- 2 files changed, 41 insertions(+), 5 deletions(-) diff --git 
a/doc/source/rllib-models.rst b/doc/source/rllib-models.rst index 0d9bde419e104..c3799c9105f37 100644 --- a/doc/source/rllib-models.rst +++ b/doc/source/rllib-models.rst @@ -1,5 +1,5 @@ -RLlib Models and Preprocessors -============================== +RLlib Models, Preprocessors, and Action Distributions +===================================================== The following diagram provides a conceptual overview of data flow between different components in RLlib. We start with an ``Environment``, which given an action produces an observation. The observation is preprocessed by a ``Preprocessor`` and ``Filter`` (e.g. for running mean normalization) before being sent to a neural network ``Model``. The model output is in turn interpreted by an ``ActionDistribution`` to determine the next action. @@ -145,6 +145,7 @@ Custom preprocessors should subclass the RLlib `preprocessor class `__. + +.. code-block:: python + + import ray + import ray.rllib.agents.ppo as ppo + from ray.rllib.models import ModelCatalog + from ray.rllib.models.preprocessors import Preprocessor + + class MyActionDist(ActionDistribution): + @staticmethod + def required_model_output_shape(action_space, model_config): + return 7 # controls model output feature vector size + + def __init__(self, inputs, model): + super(MyActionDist, self).__init__(inputs, model) + assert model.num_outputs == 7 + + def sample(self): ... + def logp(self, actions): ... + def entropy(self): ... + + ModelCatalog.register_custom_action_dist("my_dist", MyActionDist) + + ray.init() + trainer = ppo.PPOTrainer(env="CartPole-v0", config={ + "model": { + "custom_action_dist": "my_dist", + }, + }) + Supervised Model Losses ----------------------- diff --git a/doc/source/rllib.rst b/doc/source/rllib.rst index be184c8f12d2e..1ca14c3c6fbf2 100644 --- a/doc/source/rllib.rst +++ b/doc/source/rllib.rst @@ -42,12 +42,13 @@ Environments * `Interfacing with External Agents `__ * `Advanced Integrations `__ -Models and Preprocessors ------------------------- -* `RLlib Models and Preprocessors Overview `__ +Models, Preprocessors, and Action Distributions +----------------------------------------------- +* `RLlib Models, Preprocessors, and Action Distributions Overview `__ * `TensorFlow Models `__ * `PyTorch Models `__ * `Custom Preprocessors `__ +* `Custom Action Distributions `__ * `Supervised Model Losses `__ * `Variable-length / Parametric Action Spaces `__ * `Autoregressive Action Distributions `__ From b1ed8918d8d19d5fa437974fccc68ed5a3be3114 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Tue, 6 Aug 2019 13:32:03 -0700 Subject: [PATCH 20/33] switch to logp for stability --- doc/source/rllib-concepts.rst | 2 +- doc/source/rllib-models.rst | 7 +++++-- rllib/agents/dqn/dqn_policy.py | 4 ++-- rllib/agents/dqn/simple_q_policy.py | 4 ++-- rllib/agents/impala/vtrace.py | 10 +++++----- rllib/agents/impala/vtrace_policy.py | 12 ++++++------ rllib/agents/marwil/marwil_policy.py | 2 +- rllib/agents/ppo/ppo_policy.py | 11 +++++------ rllib/examples/autoregressive_action_dist.py | 4 ++-- rllib/models/action_dist.py | 2 +- rllib/models/tf/tf_action_dist.py | 16 ++++++++-------- rllib/policy/dynamic_tf_policy.py | 8 ++++---- rllib/policy/tf_policy.py | 14 +++++++++----- 13 files changed, 51 insertions(+), 45 deletions(-) diff --git a/doc/source/rllib-concepts.rst b/doc/source/rllib-concepts.rst index 32aed38514821..b58ceba8abc99 100644 --- a/doc/source/rllib-concepts.rst +++ b/doc/source/rllib-concepts.rst @@ -407,7 +407,7 @@ The action sampler is straightforward, it just takes the 
q_model, runs a forward config): # do max over Q values... ... - return action, action_prob + return action, action_logp The remainder of DQN is similar to other algorithms. Target updates are handled by a ``after_optimizer_step`` callback that periodically copies the weights of the Q network to the target. diff --git a/doc/source/rllib-models.rst b/doc/source/rllib-models.rst index c3799c9105f37..e6fbcc1170663 100644 --- a/doc/source/rllib-models.rst +++ b/doc/source/rllib-models.rst @@ -168,7 +168,7 @@ Custom preprocessors should subclass the RLlib `preprocessor class `__. +Similar to custom models and preprocessors, you can also specify a custom action distribution class as follows. The action dist class is passed a reference to the ``model``, which you can use to access ``model.model_config`` or other attributes of the model. This is commonly used to implement `autoregressive action outputs <#autoregressive-action-distributions>`__. Not all algorithms support custom action distributions; see the `feature compatibility matrix `__. .. code-block:: python @@ -266,7 +266,8 @@ Custom models can be used to work with environments where (1) the set of valid a return action_logits + inf_mask, state -Depending on your use case it may make sense to use just the masking, just action embeddings, or both. For a runnable example of this in code, check out `parametric_action_cartpole.py `__. Note that since masking introduces ``tf.float32.min`` values into the model output, this technique might not work with all algorithm options. For example, algorithms might crash if they incorrectly process the ``tf.float32.min`` values. The cartpole example has working configurations for DQN (must set ``hiddens=[]``), PPO (must disable running mean and set ``vf_share_layers=True``), and several other algorithms. +Depending on your use case it may make sense to use just the masking, just action embeddings, or both. For a runnable example of this in code, check out `parametric_action_cartpole.py `__. Note that since masking introduces ``tf.float32.min`` values into the model output, this technique might not work with all algorithm options. For example, algorithms might crash if they incorrectly process the ``tf.float32.min`` values. The cartpole example has working configurations for DQN (must set ``hiddens=[]``), PPO (must disable running mean and set ``vf_share_layers=True``), and several other algorithms. Not all algorithms support parametric actions; see the `feature compatibility matrix `__. + Autoregressive Action Distributions ----------------------------------- @@ -375,3 +376,5 @@ To do this, you need both a custom model that implements the autoregressive patt [a1_logits, a2_logits]) self.action_model.summary() self.register_variables(self.action_model.variables) + + Not all algorithms support custom action distributions; see the `feature compatibility matrix `__. 
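The change in this commit ("switch to logp for stability") replaces the stored ``action_prob`` tensor with an ``action_logp`` tensor and has the losses consume log-probabilities directly. The motivation is numerical: float32 probabilities of unlikely actions underflow to zero, while their log-probabilities stay finite, so importance ratios such as PPO's are better formed in log space. A minimal NumPy sketch of the failure mode, standalone and with made-up probability values, not part of the patch itself:

.. code-block:: python

    import numpy as np

    # Hypothetical probabilities of a very unlikely action under the
    # behaviour and target policies (chosen only to force float32 underflow).
    prob_old = 1e-50
    prob_new = 2e-50

    # Ratio formed from stored probabilities: both values underflow to 0.0
    # in float32 and the division yields nan (numpy warns about the 0/0).
    ratio_prob = np.float32(prob_new) / np.float32(prob_old)

    # Ratio formed from stored log-probabilities: finite and close to the
    # true value of 2.0.
    logp_old = np.log(prob_old)
    logp_new = np.log(prob_new)
    ratio_logp = np.exp(logp_new - logp_old)

    print(ratio_prob, ratio_logp)  # nan ~2.0
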
diff --git a/rllib/agents/dqn/dqn_policy.py b/rllib/agents/dqn/dqn_policy.py index 46c891f7d7bed..5ee0dd1037374 100644 --- a/rllib/agents/dqn/dqn_policy.py +++ b/rllib/agents/dqn/dqn_policy.py @@ -112,7 +112,7 @@ def __init__(self, q_values, observations, num_actions, stochastic, eps, action_dist = Categorical( q_values / softmax_temp, model_config=model_config) self.action = action_dist.sample() - self.action_prob = action_dist.sampled_action_prob() + self.action_prob = tf.exp(action_dist.sampled_action_logp()) return deterministic_actions = tf.argmax(q_values, axis=1) @@ -260,7 +260,7 @@ def build_q_networks(policy, q_model, input_dict, obs_space, action_space, config["model"]) policy.output_actions, policy.action_prob = qvp.action, qvp.action_prob - return policy.output_actions, policy.action_prob + return policy.output_actions, tf.log(policy.action_prob) def _build_parameter_noise(policy, pnet_params): diff --git a/rllib/agents/dqn/simple_q_policy.py b/rllib/agents/dqn/simple_q_policy.py index 0212fdef65242..44fd188533b22 100644 --- a/rllib/agents/dqn/simple_q_policy.py +++ b/rllib/agents/dqn/simple_q_policy.py @@ -128,9 +128,9 @@ def build_action_sampler(policy, q_model, input_dict, obs_space, action_space, deterministic_actions) action = tf.cond(policy.stochastic, lambda: stochastic_actions, lambda: deterministic_actions) - action_prob = None + action_logp = None - return action, action_prob + return action, action_logp def build_q_losses(policy, batch_tensors): diff --git a/rllib/agents/impala/vtrace.py b/rllib/agents/impala/vtrace.py index c4015a883c502..d2baa03fce2a0 100644 --- a/rllib/agents/impala/vtrace.py +++ b/rllib/agents/impala/vtrace.py @@ -155,7 +155,7 @@ def multi_from_logits(behaviour_policy_logits, config, dist_class, model, - behaviour_action_prob=None, + behaviour_action_log_probs=None, clip_rho_threshold=1.0, clip_pg_rho_threshold=1.0, name="vtrace_from_logits"): @@ -207,7 +207,7 @@ def multi_from_logits(behaviour_policy_logits, time T. dist_class: action distribution class for the logits. model: backing ModelV2 instance - behaviour_action_prob: precalculated values of the behaviour actions + behaviour_action_log_probs: precalculated values of the behaviour actions clip_rho_threshold: A scalar float32 tensor with the clipping threshold for importance weights (rho) when calculating the baseline targets (vs). rho^bar in the paper. @@ -248,15 +248,15 @@ def multi_from_logits(behaviour_policy_logits, ]): target_action_log_probs = multi_log_probs_from_logits_and_actions( target_policy_logits, actions, dist_class, model) - if len(behaviour_policy_logits) > 1 or behaviour_action_prob is None: + + if (len(behaviour_policy_logits) > 1 + or behaviour_action_log_probs is None): # can't use precalculated values, recompute them. 
Note that # recomputing won't work well for autoregressive action dists # which may have variables not captured by 'logits' behaviour_action_log_probs = ( multi_log_probs_from_logits_and_actions( behaviour_policy_logits, actions, dist_class, model)) - else: - behaviour_action_log_probs = tf.log(behaviour_action_prob) log_rhos = get_log_rhos(target_action_log_probs, behaviour_action_log_probs) diff --git a/rllib/agents/impala/vtrace_policy.py b/rllib/agents/impala/vtrace_policy.py index 3a758a38dd491..cc24f8c452a87 100644 --- a/rllib/agents/impala/vtrace_policy.py +++ b/rllib/agents/impala/vtrace_policy.py @@ -33,7 +33,7 @@ def __init__(self, actions_logp, actions_entropy, dones, - behaviour_action_prob, + behaviour_action_logp, behaviour_logits, target_logits, discount, @@ -59,7 +59,7 @@ def __init__(self, actions_logp: A float32 tensor of shape [T, B]. actions_entropy: A float32 tensor of shape [T, B]. dones: A bool tensor of shape [T, B]. - behaviour_action_prob: Tensor of shape [T, B]. + behaviour_action_logp: Tensor of shape [T, B]. behaviour_logits: A list with length of ACTION_SPACE of float32 tensors of shapes [T, B, ACTION_SPACE[0]], @@ -82,7 +82,7 @@ def __init__(self, # Compute vtrace on the CPU for better perf. with tf.device("/cpu:0"): self.vtrace_returns = vtrace.multi_from_logits( - behaviour_action_prob=behaviour_action_prob, + behaviour_action_logp=behaviour_action_logp, behaviour_policy_logits=behaviour_logits, target_policy_logits=target_logits, actions=tf.unstack(actions, axis=2), @@ -169,7 +169,7 @@ def make_time_major(*args, **kw): actions = batch_tensors[SampleBatch.ACTIONS] dones = batch_tensors[SampleBatch.DONES] rewards = batch_tensors[SampleBatch.REWARDS] - behaviour_action_prob = batch_tensors[ACTION_PROB] + behaviour_action_logp = batch_tensors[ACTION_PROB] behaviour_logits = batch_tensors[BEHAVIOUR_LOGITS] unpacked_behaviour_logits = tf.split( behaviour_logits, output_hidden_shape, axis=1) @@ -196,8 +196,8 @@ def make_time_major(*args, **kw): actions_entropy=make_time_major( action_dist.multi_entropy(), drop_last=True), dones=make_time_major(dones, drop_last=True), - behaviour_action_prob=make_time_major( - behaviour_action_prob, drop_last=True), + behaviour_action_logp=make_time_major( + behaviour_action_logp, drop_last=True), behaviour_logits=make_time_major( unpacked_behaviour_logits, drop_last=True), target_logits=make_time_major(unpacked_outputs, drop_last=True), diff --git a/rllib/agents/marwil/marwil_policy.py b/rllib/agents/marwil/marwil_policy.py index 51208d24c4b60..72b8a239383b1 100644 --- a/rllib/agents/marwil/marwil_policy.py +++ b/rllib/agents/marwil/marwil_policy.py @@ -141,7 +141,7 @@ def __init__(self, observation_space, action_space, config): self.sess, obs_input=self.obs_t, action_sampler=self.output_actions, - action_prob=action_dist.sampled_action_prob(), + action_logp=action_dist.sampled_action_logp(), loss=objective, model=self.model, loss_inputs=self.loss_inputs, diff --git a/rllib/agents/ppo/ppo_policy.py b/rllib/agents/ppo/ppo_policy.py index 9e5771efededf..60d05a5a68029 100644 --- a/rllib/agents/ppo/ppo_policy.py +++ b/rllib/agents/ppo/ppo_policy.py @@ -9,7 +9,7 @@ Postprocessing from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.policy.tf_policy import LearningRateSchedule, \ - EntropyCoeffSchedule, ACTION_PROB + EntropyCoeffSchedule, ACTION_LOGP from ray.rllib.policy.tf_policy_template import build_tf_policy from ray.rllib.utils.explained_variance import explained_variance from ray.rllib.utils import try_import_tf 
@@ -31,7 +31,7 @@ def __init__(self, advantages, actions, prev_logits, - prev_actions_prob, + prev_actions_logp, vf_preds, curr_action_dist, value_fn, @@ -56,7 +56,7 @@ def __init__(self, from previous model evaluation. prev_logits (Placeholder): Placeholder for logits output from previous model evaluation. - prev_actions_prob (Placeholder): Placeholder for prob output from + prev_actions_logp (Placeholder): Placeholder for prob output from previous model evaluation. vf_preds (Placeholder): Placeholder for value function output from previous model evaluation. @@ -80,8 +80,7 @@ def reduce_mean_valid(t): prev_dist = dist_class(prev_logits, model) # Make loss functions. - logp_ratio = tf.exp( - curr_action_dist.logp(actions) - tf.log(prev_actions_prob)) + logp_ratio = tf.exp(curr_action_dist.logp(actions) - prev_actions_logp) action_kl = prev_dist.kl(curr_action_dist) self.mean_kl = reduce_mean_valid(action_kl) @@ -129,7 +128,7 @@ def ppo_surrogate_loss(policy, batch_tensors): batch_tensors[Postprocessing.ADVANTAGES], batch_tensors[SampleBatch.ACTIONS], batch_tensors[BEHAVIOUR_LOGITS], - batch_tensors[ACTION_PROB], + batch_tensors[ACTION_LOGP], batch_tensors[SampleBatch.VF_PREDS], policy.action_dist, policy.value_function, diff --git a/rllib/examples/autoregressive_action_dist.py b/rllib/examples/autoregressive_action_dist.py index b9cff2d12b936..40296afba460d 100644 --- a/rllib/examples/autoregressive_action_dist.py +++ b/rllib/examples/autoregressive_action_dist.py @@ -90,8 +90,8 @@ def logp(self, actions): return ( Categorical(a1_logits).logp(a1) + Categorical(a2_logits).logp(a2)) - def sampled_action_prob(self): - return tf.exp(self._action_prob) + def sampled_action_logp(self): + return tf.exp(self._action_logp) def entropy(self): a1_dist = self._a1_distribution() diff --git a/rllib/models/action_dist.py b/rllib/models/action_dist.py index c6e1f43c6b7a4..f5a5f1e3c0cf2 100644 --- a/rllib/models/action_dist.py +++ b/rllib/models/action_dist.py @@ -34,7 +34,7 @@ def sample(self): raise NotImplementedError @DeveloperAPI - def sampled_action_prob(self): + def sampled_action_logp(self): """Returns the log probability of the last sampled action.""" raise NotImplementedError diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py index a095d2b3c0cd0..95cb5340b1d6c 100644 --- a/rllib/models/tf/tf_action_dist.py +++ b/rllib/models/tf/tf_action_dist.py @@ -26,7 +26,7 @@ def _build_sample_op(self): """Implement this instead of sample(), to enable op reuse. This is needed since the sample op is non-deterministic and is shared - between sample() and sampled_action_prob(). + between sample() and sampled_action_logp(). 
""" raise NotImplementedError @@ -36,9 +36,9 @@ def sample(self): return self.sample_op @override(ActionDistribution) - def sampled_action_prob(self): + def sampled_action_logp(self): """Returns the log probability of the sampled action.""" - return tf.exp(self.logp(self.sample_op)) + return self.logp(self.sample_op) class Categorical(TFActionDistribution): @@ -185,8 +185,8 @@ class Deterministic(TFActionDistribution): """ @override(TFActionDistribution) - def sampled_action_prob(self): - return 1.0 + def sampled_action_logp(self): + return np.log(1.0) @override(TFActionDistribution) def _build_sample_op(self): @@ -255,10 +255,10 @@ def sample(self): return TupleActions([s.sample() for s in self.child_distributions]) @override(TFActionDistribution) - def sampled_action_prob(self): - p = self.child_distributions[0].sampled_action_prob() + def sampled_action_logp(self): + p = self.child_distributions[0].sampled_action_logp() for c in self.child_distributions[1:]: - p *= c.sampled_action_prob() + p += c.sampled_action_logp() return p diff --git a/rllib/policy/dynamic_tf_policy.py b/rllib/policy/dynamic_tf_policy.py index cfdb6f3e742ed..c3ada9432338a 100644 --- a/rllib/policy/dynamic_tf_policy.py +++ b/rllib/policy/dynamic_tf_policy.py @@ -66,7 +66,7 @@ def __init__(self, All policy variables should be created in this function. If not specified, a default model will be created. action_sampler_fn (func): optional function that returns a - tuple of action and action prob tensors given + tuple of action and action logp tensors given (policy, model, input_dict, obs_space, action_space, config). If not specified, a default action distribution will be used. existing_inputs (OrderedDict): when copying a policy, this @@ -171,13 +171,13 @@ def __init__(self, # Setup action sampler if action_sampler_fn: self.action_dist = None - action_sampler, action_prob = action_sampler_fn( + action_sampler, action_logp = action_sampler_fn( self, self.model, self.input_dict, obs_space, action_space, config) else: self.action_dist = self.dist_class(self.model_out, self.model) action_sampler = self.action_dist.sample() - action_prob = self.action_dist.sampled_action_prob() + action_logp = self.action_dist.sampled_action_logp() # Phase 1 init sess = tf.get_default_session() or tf.Session() @@ -192,7 +192,7 @@ def __init__(self, sess, obs_input=obs, action_sampler=action_sampler, - action_prob=action_prob, + action_logp=action_logp, loss=None, # dynamically initialized on run loss_inputs=[], model=self.model, diff --git a/rllib/policy/tf_policy.py b/rllib/policy/tf_policy.py index 49e4ad665ce9a..0232a67a25118 100644 --- a/rllib/policy/tf_policy.py +++ b/rllib/policy/tf_policy.py @@ -23,6 +23,7 @@ logger = logging.getLogger(__name__) ACTION_PROB = "action_prob" +ACTION_LOGP = "action_logp" @DeveloperAPI @@ -61,7 +62,7 @@ def __init__(self, loss, loss_inputs, model=None, - action_prob=None, + action_logp=None, state_inputs=None, state_outputs=None, prev_action_input=None, @@ -89,7 +90,7 @@ def __init__(self, placeholders during loss computation. model (rllib.models.Model): used to integrate custom losses and stats from user-defined RLlib models. - action_prob (Tensor): probability of the sampled action. + action_logp (Tensor): probability of the sampled action. state_inputs (list): list of RNN state input Tensors. state_outputs (list): list of RNN state output Tensors. 
prev_action_input (Tensor): placeholder for previous actions @@ -115,7 +116,7 @@ def __init__(self, self._prev_reward_input = prev_reward_input self._sampler = action_sampler self._is_training = self._get_is_training_placeholder() - self._action_prob = action_prob + self._action_logp = action_logp self._state_inputs = state_inputs or [] self._state_outputs = state_outputs or [] self._seq_lens = seq_lens @@ -304,8 +305,11 @@ def extra_compute_action_fetches(self): By default we only return action probability info (if present). """ - if self._action_prob is not None: - return {ACTION_PROB: self._action_prob} + if self._action_logp is not None: + return { + ACTION_PROB: tf.exp(self._action_logp), + ACTION_LOGP: self._action_logp, + } else: return {} From 6e584a3841b61a916ceaa26c7dad6b4abfd2e18b Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Tue, 6 Aug 2019 13:44:21 -0700 Subject: [PATCH 21/33] remove override --- rllib/agents/impala/vtrace_policy.py | 6 +++--- rllib/examples/autoregressive_action_dist.py | 2 +- rllib/models/modelv2.py | 4 ---- rllib/policy/dynamic_tf_policy.py | 8 -------- 4 files changed, 4 insertions(+), 16 deletions(-) diff --git a/rllib/agents/impala/vtrace_policy.py b/rllib/agents/impala/vtrace_policy.py index cc24f8c452a87..783bc2cde4128 100644 --- a/rllib/agents/impala/vtrace_policy.py +++ b/rllib/agents/impala/vtrace_policy.py @@ -16,7 +16,7 @@ from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.policy.tf_policy_template import build_tf_policy from ray.rllib.policy.tf_policy import LearningRateSchedule, \ - EntropyCoeffSchedule, ACTION_PROB + EntropyCoeffSchedule, ACTION_LOGP from ray.rllib.utils.explained_variance import explained_variance from ray.rllib.utils import try_import_tf @@ -82,7 +82,7 @@ def __init__(self, # Compute vtrace on the CPU for better perf. 
with tf.device("/cpu:0"): self.vtrace_returns = vtrace.multi_from_logits( - behaviour_action_logp=behaviour_action_logp, + behaviour_action_log_probs=behaviour_action_logp, behaviour_policy_logits=behaviour_logits, target_policy_logits=target_logits, actions=tf.unstack(actions, axis=2), @@ -169,7 +169,7 @@ def make_time_major(*args, **kw): actions = batch_tensors[SampleBatch.ACTIONS] dones = batch_tensors[SampleBatch.DONES] rewards = batch_tensors[SampleBatch.REWARDS] - behaviour_action_logp = batch_tensors[ACTION_PROB] + behaviour_action_logp = batch_tensors[ACTION_LOGP] behaviour_logits = batch_tensors[BEHAVIOUR_LOGITS] unpacked_behaviour_logits = tf.split( behaviour_logits, output_hidden_shape, axis=1) diff --git a/rllib/examples/autoregressive_action_dist.py b/rllib/examples/autoregressive_action_dist.py index 40296afba460d..819341594f2c4 100644 --- a/rllib/examples/autoregressive_action_dist.py +++ b/rllib/examples/autoregressive_action_dist.py @@ -78,7 +78,7 @@ def sample(self): # sample a2 conditioned on a1 a2_dist = self._a2_distribution(a1) a2 = a2_dist.sample() - self._action_prob = a1_dist.logp(a1) + a2_dist.logp(a2) + self._action_logp = a1_dist.logp(a1) + a2_dist.logp(a2) # return the action tuple return TupleActions([a1, a2]) diff --git a/rllib/models/modelv2.py b/rllib/models/modelv2.py index 05d45e74306fa..f04bb457427b6 100644 --- a/rllib/models/modelv2.py +++ b/rllib/models/modelv2.py @@ -166,7 +166,3 @@ def __call__(self, input_dict, state=None, seq_lens=None): raise ValueError("State output is not a list: {}".format(state)) return outputs, state - - def override_action_distribution(self): - """TODO(ekl) replace this with custom action distributions.""" - return None diff --git a/rllib/policy/dynamic_tf_policy.py b/rllib/policy/dynamic_tf_policy.py index c3ada9432338a..e485392eea0c8 100644 --- a/rllib/policy/dynamic_tf_policy.py +++ b/rllib/policy/dynamic_tf_policy.py @@ -145,14 +145,6 @@ def __init__(self, self.config["model"], framework="tf") - override_dist = self.model.override_action_distribution() - if override_dist is not None: - if action_sampler_fn: - raise ValueError( - "this policy doesn't use action dist classes for " - "sampling actions, so you cannot override it") - self.dist_class = override_dist - if existing_inputs: self.state_in = [ v for k, v in existing_inputs.items() From ba1b53102da9c1d6b099289a4c39fd08946d8cc3 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Tue, 6 Aug 2019 14:04:48 -0700 Subject: [PATCH 22/33] fix op leak --- rllib/policy/tf_policy.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rllib/policy/tf_policy.py b/rllib/policy/tf_policy.py index 0232a67a25118..2b51702d928df 100644 --- a/rllib/policy/tf_policy.py +++ b/rllib/policy/tf_policy.py @@ -117,6 +117,7 @@ def __init__(self, self._sampler = action_sampler self._is_training = self._get_is_training_placeholder() self._action_logp = action_logp + self._action_prob = tf.exp(self._action_logp) self._state_inputs = state_inputs or [] self._state_outputs = state_outputs or [] self._seq_lens = seq_lens @@ -307,7 +308,7 @@ def extra_compute_action_fetches(self): """ if self._action_logp is not None: return { - ACTION_PROB: tf.exp(self._action_logp), + ACTION_PROB: self._action_prob, ACTION_LOGP: self._action_logp, } else: From b551fcb936d6478cab77b512fd7d2b70cf0e0a35 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Tue, 6 Aug 2019 15:14:44 -0700 Subject: [PATCH 23/33] fix --- rllib/agents/dqn/dqn_policy.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/rllib/agents/dqn/dqn_policy.py b/rllib/agents/dqn/dqn_policy.py index 5ee0dd1037374..c8dbfb23becf1 100644 --- a/rllib/agents/dqn/dqn_policy.py +++ b/rllib/agents/dqn/dqn_policy.py @@ -260,7 +260,8 @@ def build_q_networks(policy, q_model, input_dict, obs_space, action_space, config["model"]) policy.output_actions, policy.action_prob = qvp.action, qvp.action_prob - return policy.output_actions, tf.log(policy.action_prob) + return (policy.output_actions, tf.log(policy.action_prob) + if policy.action_prob else None) def _build_parameter_noise(policy, pnet_params): From 4c4786a6d0d2e90afe34e1f6779d827ca85bf108 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Tue, 6 Aug 2019 15:15:37 -0700 Subject: [PATCH 24/33] fix --- rllib/policy/tf_policy.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rllib/policy/tf_policy.py b/rllib/policy/tf_policy.py index 2b51702d928df..f81f5e5c07d69 100644 --- a/rllib/policy/tf_policy.py +++ b/rllib/policy/tf_policy.py @@ -117,7 +117,8 @@ def __init__(self, self._sampler = action_sampler self._is_training = self._get_is_training_placeholder() self._action_logp = action_logp - self._action_prob = tf.exp(self._action_logp) + self._action_prob = (tf.exp(self._action_logp) + if self._action_logp else None) self._state_inputs = state_inputs or [] self._state_outputs = state_outputs or [] self._seq_lens = seq_lens From ba007f19bbebf263597f2eb622a1323b36e2e489 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Tue, 6 Aug 2019 16:14:09 -0700 Subject: [PATCH 25/33] lint --- rllib/agents/dqn/dqn_policy.py | 2 +- rllib/policy/tf_policy.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/rllib/agents/dqn/dqn_policy.py b/rllib/agents/dqn/dqn_policy.py index c8dbfb23becf1..5d07edf77bb15 100644 --- a/rllib/agents/dqn/dqn_policy.py +++ b/rllib/agents/dqn/dqn_policy.py @@ -261,7 +261,7 @@ def build_q_networks(policy, q_model, input_dict, obs_space, action_space, policy.output_actions, policy.action_prob = qvp.action, qvp.action_prob return (policy.output_actions, tf.log(policy.action_prob) - if policy.action_prob else None) + if policy.action_prob is not None else None) def _build_parameter_noise(policy, pnet_params): diff --git a/rllib/policy/tf_policy.py b/rllib/policy/tf_policy.py index f81f5e5c07d69..46c44af10ebc8 100644 --- a/rllib/policy/tf_policy.py +++ b/rllib/policy/tf_policy.py @@ -118,7 +118,7 @@ def __init__(self, self._is_training = self._get_is_training_placeholder() self._action_logp = action_logp self._action_prob = (tf.exp(self._action_logp) - if self._action_logp else None) + if self._action_logp is not None else None) self._state_inputs = state_inputs or [] self._state_outputs = state_outputs or [] self._seq_lens = seq_lens From 1134d17ef24dcf6ec4137f22b80c1e662e3cc1b8 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Tue, 6 Aug 2019 16:25:42 -0700 Subject: [PATCH 26/33] doc --- doc/source/rllib-algorithms.rst | 4 ++-- doc/source/rllib.rst | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/rllib-algorithms.rst b/doc/source/rllib-algorithms.rst index 26e29bd92f042..298bd7cba5439 100644 --- a/doc/source/rllib-algorithms.rst +++ b/doc/source/rllib-algorithms.rst @@ -300,8 +300,8 @@ Tuned examples: `Two-step game `__ `[implementation] `__ MADDPG is a specialized multi-agent algorithm. Code here is adapted from https://github.com/openai/maddpg to integrate with RLlib multi-agent APIs. Please check `wsjeon/maddpg-rllib `__ for examples and more information. 
**MADDPG-specific configs** (see also `common configs `__): diff --git a/doc/source/rllib.rst b/doc/source/rllib.rst index a030d36d4a0a8..6bc2789404e90 100644 --- a/doc/source/rllib.rst +++ b/doc/source/rllib.rst @@ -87,7 +87,7 @@ Algorithms * Multi-agent specific - `QMIX Monotonic Value Factorisation (QMIX, VDN, IQN) `__ - - `Multi-Agent Actor Critic (contrib/MADDPG) `__ + - `Multi-Agent Deep Deterministic Policy Gradient (contrib/MADDPG) `__ * Offline From 2d946f1da23ad7be68392e8ee6ee9753ccec2fdb Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Tue, 6 Aug 2019 20:16:03 -0700 Subject: [PATCH 27/33] cateogrical --- rllib/agents/dqn/dqn_policy.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/rllib/agents/dqn/dqn_policy.py b/rllib/agents/dqn/dqn_policy.py index 5d07edf77bb15..efbff971644bf 100644 --- a/rllib/agents/dqn/dqn_policy.py +++ b/rllib/agents/dqn/dqn_policy.py @@ -109,8 +109,7 @@ class QValuePolicy(object): def __init__(self, q_values, observations, num_actions, stochastic, eps, softmax, softmax_temp, model_config): if softmax: - action_dist = Categorical( - q_values / softmax_temp, model_config=model_config) + action_dist = Categorical(q_values / softmax_temp) self.action = action_dist.sample() self.action_prob = tf.exp(action_dist.sampled_action_logp()) return From da85071cce59d68bfc4029e743a8bb52f7b0bb40 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Wed, 7 Aug 2019 17:05:02 -0700 Subject: [PATCH 28/33] fix --- rllib/models/tf/tf_action_dist.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py index 95cb5340b1d6c..f3bc68467d522 100644 --- a/rllib/models/tf/tf_action_dist.py +++ b/rllib/models/tf/tf_action_dist.py @@ -89,8 +89,9 @@ def required_model_output_shape(action_space, model_config): class MultiCategorical(TFActionDistribution): """MultiCategorical distribution for MultiDiscrete action spaces.""" - def __init__(self, inputs, input_lens, model): - TFActionDistribution.__init__(self, inputs, model) + def __init__(self, inputs, model, input_lens): + # skip TFActionDistribution init + ActionDistribution.__init__(self, inputs, model) self.cats = [ Categorical(input_, model) for input_ in tf.split(inputs, input_lens, axis=1) @@ -207,7 +208,8 @@ class MultiActionDistribution(TFActionDistribution): def __init__(self, inputs, model, action_space, child_distributions, input_lens): - TFActionDistribution.__init__(self, inputs, model) + # skip TFActionDistribution init + ActionDistribution.__init__(self, inputs, model) self.input_lens = input_lens split_inputs = tf.split(inputs, self.input_lens, axis=1) child_list = [] From 7a66f099809609c3b4cd21982586be9d8c982b2f Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Thu, 8 Aug 2019 01:32:40 -0700 Subject: [PATCH 29/33] fix vtrace --- rllib/agents/impala/vtrace.py | 4 ---- rllib/agents/impala/vtrace_policy.py | 3 +-- rllib/agents/impala/vtrace_test.py | 11 +++-------- 3 files changed, 4 insertions(+), 14 deletions(-) diff --git a/rllib/agents/impala/vtrace.py b/rllib/agents/impala/vtrace.py index d2baa03fce2a0..6edc9f571b837 100644 --- a/rllib/agents/impala/vtrace.py +++ b/rllib/agents/impala/vtrace.py @@ -78,7 +78,6 @@ def multi_log_probs_from_logits_and_actions(policy_logits, actions, dist_class, [T, B, ...] with actions. 
dist_class: Python class of the action distribution - config: Trainer config dict Returns: A list with length of ACTION_SPACE of float32 @@ -113,7 +112,6 @@ def from_logits(behaviour_policy_logits, rewards, values, bootstrap_value, - config, dist_class=Categorical, model=None, clip_rho_threshold=1.0, @@ -127,7 +125,6 @@ def from_logits(behaviour_policy_logits, rewards, values, bootstrap_value, - config, dist_class, model, clip_rho_threshold=clip_rho_threshold, @@ -152,7 +149,6 @@ def multi_from_logits(behaviour_policy_logits, rewards, values, bootstrap_value, - config, dist_class, model, behaviour_action_log_probs=None, diff --git a/rllib/agents/impala/vtrace_policy.py b/rllib/agents/impala/vtrace_policy.py index 783bc2cde4128..8fe8699c2b1c0 100644 --- a/rllib/agents/impala/vtrace_policy.py +++ b/rllib/agents/impala/vtrace_policy.py @@ -94,8 +94,7 @@ def __init__(self, model=model, clip_rho_threshold=tf.cast(clip_rho_threshold, tf.float32), clip_pg_rho_threshold=tf.cast(clip_pg_rho_threshold, - tf.float32), - config=config) + tf.float32)) self.value_targets = self.vtrace_returns.vs # The policy gradients loss diff --git a/rllib/agents/impala/vtrace_test.py b/rllib/agents/impala/vtrace_test.py index 9d88fefa96fcc..e1f39991b097b 100644 --- a/rllib/agents/impala/vtrace_test.py +++ b/rllib/agents/impala/vtrace_test.py @@ -98,7 +98,7 @@ def test_log_probs_from_logits_and_actions(self, batch_size): 0, num_actions - 1, size=(seq_len, batch_size), dtype=np.int32) action_log_probs_tensor = vtrace.log_probs_from_logits_and_actions( - policy_logits, actions, {"model": None}) # dummy config dict + policy_logits, actions) # Ground Truth # Using broadcasting to create a mask that indexes action logits @@ -159,8 +159,6 @@ def test_vtrace_from_logits(self, batch_size): clip_rho_threshold = None # No clipping. clip_pg_rho_threshold = None # No clipping. - dummy_config = {"model": None} - # Intentionally leaving shapes unspecified to test if V-trace can # deal with that. placeholders = { @@ -180,15 +178,12 @@ def test_vtrace_from_logits(self, batch_size): from_logits_output = vtrace.from_logits( clip_rho_threshold=clip_rho_threshold, clip_pg_rho_threshold=clip_pg_rho_threshold, - config=dummy_config, **placeholders) target_log_probs = vtrace.log_probs_from_logits_and_actions( - placeholders["target_policy_logits"], placeholders["actions"], - dummy_config) + placeholders["target_policy_logits"], placeholders["actions"]) behaviour_log_probs = vtrace.log_probs_from_logits_and_actions( - placeholders["behaviour_policy_logits"], placeholders["actions"], - dummy_config) + placeholders["behaviour_policy_logits"], placeholders["actions"]) log_rhos = target_log_probs - behaviour_log_probs ground_truth = (log_rhos, behaviour_log_probs, target_log_probs) From 406620d78ce33692d6c6598b81d80ff55ba3b0d5 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Thu, 8 Aug 2019 14:12:04 -0700 Subject: [PATCH 30/33] fix appo --- rllib/agents/ppo/appo_policy.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/rllib/agents/ppo/appo_policy.py b/rllib/agents/ppo/appo_policy.py index 914c6757001a4..6ecc8189a814f 100644 --- a/rllib/agents/ppo/appo_policy.py +++ b/rllib/agents/ppo/appo_policy.py @@ -112,7 +112,6 @@ def __init__(self, rewards, values, bootstrap_value, - config, dist_class, model, valid_mask, @@ -145,7 +144,6 @@ def __init__(self, rewards: A float32 tensor of shape [T, B]. values: A float32 tensor of shape [T, B]. bootstrap_value: A float32 tensor of shape [B]. - config: Trainer config dict. 
dist_class: action distribution class for logits. model: backing ModelV2 instance valid_mask: A bool tensor of valid RNN input elements (#2992). @@ -169,7 +167,6 @@ def reduce_mean_valid(t): rewards=rewards, values=values, bootstrap_value=bootstrap_value, - config=config, dist_class=dist_class, model=model, clip_rho_threshold=tf.cast(clip_rho_threshold, tf.float32), @@ -305,7 +302,6 @@ def make_time_major(*args, **kw): rewards=make_time_major(rewards, drop_last=True), values=make_time_major(values, drop_last=True), bootstrap_value=make_time_major(values)[-1], - config=policy.config, dist_class=Categorical if is_multidiscrete else policy.dist_class, model=policy.model, valid_mask=make_time_major(mask, drop_last=True), From fe8ecffb65207535e66282f3addfd950cd2a9c00 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 9 Aug 2019 14:32:32 -0700 Subject: [PATCH 31/33] to note --- doc/source/rllib-models.rst | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/doc/source/rllib-models.rst b/doc/source/rllib-models.rst index e6fbcc1170663..d75798c95c62e 100644 --- a/doc/source/rllib-models.rst +++ b/doc/source/rllib-models.rst @@ -168,7 +168,7 @@ Custom preprocessors should subclass the RLlib `preprocessor class `__. Not all algorithms support custom action distributions; see the `feature compatibility matrix `__. +Similar to custom models and preprocessors, you can also specify a custom action distribution class as follows. The action dist class is passed a reference to the ``model``, which you can use to access ``model.model_config`` or other attributes of the model. This is commonly used to implement `autoregressive action outputs <#autoregressive-action-distributions>`__. .. code-block:: python @@ -377,4 +377,8 @@ To do this, you need both a custom model that implements the autoregressive patt self.action_model.summary() self.register_variables(self.action_model.variables) - Not all algorithms support custom action distributions; see the `feature compatibility matrix `__. + + +.. note:: + + Not all algorithms support autoregressive action distributions; see the `feature compatibility matrix `__. 
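The custom action distribution hook documented in the rllib-models.rst change above boils down to a small pattern: subclass one of the built-in distributions, read any per-experiment settings through the ``model`` reference the distribution is given, and register the class under a name that the trainer config can point at. The sketch below is illustrative only and is not code from this patch series — the ``TemperatureCategorical`` class, the ``"temp_categorical"`` name, and the ``custom_options.temperature`` key are assumptions, and it presumes the ``ModelCatalog.register_custom_action_dist`` helper and ``"custom_action_dist"`` model-config key that this doc section describes.

    import ray
    from ray import tune
    from ray.rllib.models import ModelCatalog
    from ray.rllib.models.tf.tf_action_dist import Categorical


    class TemperatureCategorical(Categorical):
        """Illustrative custom dist: Categorical with rescaled logits.

        The temperature is pulled out of the model config, which is
        reachable via the ``model`` reference passed to every action
        distribution (see the doc change above).
        """

        def __init__(self, inputs, model):
            temperature = model.model_config.get(
                "custom_options", {}).get("temperature", 1.0)
            # Scale the logits before handing them to the built-in
            # Categorical; higher temperature -> more uniform sampling.
            super(TemperatureCategorical, self).__init__(
                inputs / temperature, model)

        @staticmethod
        def required_model_output_shape(action_space, model_config):
            # One logit per discrete action, same as the built-in
            # Categorical.
            return action_space.n


    # Register under a name the trainer config can reference.
    ModelCatalog.register_custom_action_dist("temp_categorical",
                                             TemperatureCategorical)

    if __name__ == "__main__":
        ray.init()
        tune.run(
            "PPO",
            stop={"training_iteration": 1},
            config={
                "env": "CartPole-v0",
                "model": {
                    "custom_action_dist": "temp_categorical",
                    "custom_options": {"temperature": 2.0},
                },
            })

Reading the temperature through ``model.model_config`` is exactly the point the doc change calls out: because the distribution class receives the model, per-experiment options flow in through the model config rather than through extra constructor arguments, which is also what makes the autoregressive pattern (dist querying the model's action branches) possible.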
From 7e3f040691aebf5041b7e3392183686459c74a52 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 9 Aug 2019 14:35:16 -0700 Subject: [PATCH 32/33] comments --- rllib/agents/dqn/dqn_policy.py | 6 ++++-- rllib/models/tf/tf_action_dist.py | 2 +- rllib/policy/tf_policy.py | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/rllib/agents/dqn/dqn_policy.py b/rllib/agents/dqn/dqn_policy.py index efbff971644bf..168e453487ccc 100644 --- a/rllib/agents/dqn/dqn_policy.py +++ b/rllib/agents/dqn/dqn_policy.py @@ -259,8 +259,10 @@ def build_q_networks(policy, q_model, input_dict, obs_space, action_space, config["model"]) policy.output_actions, policy.action_prob = qvp.action, qvp.action_prob - return (policy.output_actions, tf.log(policy.action_prob) - if policy.action_prob is not None else None) + actions = policy.output_actions + action_prob = (tf.log(policy.action_prob) + if policy.action_prob is not None else None) + return actions, action_prob def _build_parameter_noise(policy, pnet_params): diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py index f3bc68467d522..9474d278543cd 100644 --- a/rllib/models/tf/tf_action_dist.py +++ b/rllib/models/tf/tf_action_dist.py @@ -187,7 +187,7 @@ class Deterministic(TFActionDistribution): @override(TFActionDistribution) def sampled_action_logp(self): - return np.log(1.0) + return 0.0 @override(TFActionDistribution) def _build_sample_op(self): diff --git a/rllib/policy/tf_policy.py b/rllib/policy/tf_policy.py index 46c44af10ebc8..755aed69327bd 100644 --- a/rllib/policy/tf_policy.py +++ b/rllib/policy/tf_policy.py @@ -90,7 +90,7 @@ def __init__(self, placeholders during loss computation. model (rllib.models.Model): used to integrate custom losses and stats from user-defined RLlib models. - action_logp (Tensor): probability of the sampled action. + action_logp (Tensor): log probability of the sampled action. state_inputs (list): list of RNN state input Tensors. state_outputs (list): list of RNN state output Tensors. prev_action_input (Tensor): placeholder for previous actions From 979fd5a79c1b0f65ee0781a56c162ad10a69f9e4 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 9 Aug 2019 20:36:25 -0700 Subject: [PATCH 33/33] fix merge --- rllib/examples/centralized_critic.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/rllib/examples/centralized_critic.py b/rllib/examples/centralized_critic.py index 7e9495204465d..9d07df5af6cfd 100644 --- a/rllib/examples/centralized_critic.py +++ b/rllib/examples/centralized_critic.py @@ -28,7 +28,7 @@ from ray.rllib.models import ModelCatalog from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.policy.tf_policy import LearningRateSchedule, \ - EntropyCoeffSchedule + EntropyCoeffSchedule, ACTION_LOGP from ray.rllib.models.tf.tf_modelv2 import TFModelV2 from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork from ray.rllib.utils.explained_variance import explained_variance @@ -141,10 +141,13 @@ def loss_with_central_critic(policy, batch_tensors): policy.loss_obj = PPOLoss( policy.action_space, + policy.dist_class, + policy.model, batch_tensors[Postprocessing.VALUE_TARGETS], batch_tensors[Postprocessing.ADVANTAGES], batch_tensors[SampleBatch.ACTIONS], batch_tensors[BEHAVIOUR_LOGITS], + batch_tensors[ACTION_LOGP], batch_tensors[SampleBatch.VF_PREDS], policy.action_dist, policy.central_value_function,