Custom action distributions #5164

Merged — 22 commits, merged Aug 6, 2019

Changes from 11 commits

Commits (22)
5176959
custom action dist wip
mawright Jun 2, 2019
8c4d684
Test case for custom action dist
mawright Jun 2, 2019
508ed4a
ActionDistribution.get_parameter_shape_for_action_space pattern
mawright Jul 8, 2019
7d0ae68
Edit exception message to also suggest using a custom action distribu…
mawright Jul 8, 2019
289141c
Clean up ModelCatalog.get_action_dist
mawright Jul 9, 2019
33c3907
Pass model config to ActionDistribution constructors
mawright Jul 9, 2019
ce720a3
Update custom action distribution test case
mawright Jul 10, 2019
d0b8a64
Name fix
mawright Jul 10, 2019
c3ec408
Autoformatter
mawright Jul 10, 2019
f78f447
parameter shape static methods for torch distributions
mawright Jul 10, 2019
489c573
Fix docstring
mawright Jul 10, 2019
ff5076e
Generalize fake array for graph initialization
mawright Jul 30, 2019
bfdfa90
Merge branch 'master' into action_dist
mawright Aug 1, 2019
11243cd
Fix action dist constructors
mawright Aug 1, 2019
a9939d4
Correct parameter shape static methods for multicategorical and gaussian
mawright Aug 1, 2019
96bba6c
Make suggested changes to custom action dist's
mawright Aug 1, 2019
8158f24
Correct instances of not passing model config to action dist
mawright Aug 2, 2019
f11fbca
Autoformatter
mawright Aug 2, 2019
bd378b0
fix tuple distribution constructor
mawright Aug 2, 2019
2f39c88
bugfix
mawright Aug 3, 2019
a1321a8
Merge remote-tracking branch 'upstream/master' into action_dist
ericl Aug 5, 2019
1b2eb98
Merge remote-tracking branch 'upstream/master' into action_dist
ericl Aug 6, 2019
2 changes: 1 addition & 1 deletion python/ray/rllib/agents/ars/policies.py
@@ -81,7 +81,7 @@ def __init__(self,
model = ModelCatalog.get_model({
"obs": self.inputs
}, obs_space, action_space, dist_dim, model_config)
dist = dist_class(model.outputs)
dist = dist_class(model.outputs, model_config=model_config)
self.sampler = dist.sample()

self.variables = ray.experimental.tf_utils.TensorFlowVariables(
2 changes: 1 addition & 1 deletion python/ray/rllib/agents/es/policies.py
@@ -59,7 +59,7 @@ def __init__(self, sess, action_space, obs_space, preprocessor,
model = ModelCatalog.get_model({
"obs": self.inputs
}, obs_space, action_space, dist_dim, model_options)
dist = dist_class(model.outputs)
dist = dist_class(model.outputs, model_config=model_options)
self.sampler = dist.sample()

self.variables = ray.experimental.tf_utils.TensorFlowVariables(
11 changes: 6 additions & 5 deletions python/ray/rllib/agents/marwil/marwil_policy.py
@@ -29,7 +29,7 @@ def __init__(self, state_values, cumulative_rewards):

class ReweightedImitationLoss(object):
def __init__(self, state_values, cumulative_rewards, logits, actions,
action_space, beta):
action_space, beta, model_config):
ma_adv_norm = tf.get_variable(
name="moving_average_of_advantage_norm",
dtype=tf.float32,
@@ -48,8 +48,8 @@ def __init__(self, state_values, cumulative_rewards, logits, actions,
beta * tf.divide(adv, 1e-8 + tf.sqrt(ma_adv_norm)))

# log\pi_\theta(a|s)
dist_cls, _ = ModelCatalog.get_action_dist(action_space, {})
action_dist = dist_cls(logits)
dist_cls, _ = ModelCatalog.get_action_dist(action_space, model_config)
action_dist = dist_cls(logits, model_config=model_config)
logprobs = action_dist.logp(actions)

self.loss = -1.0 * tf.reduce_mean(
@@ -106,7 +106,7 @@ def __init__(self, observation_space, action_space, config):
self.p_func_vars = scope_vars(scope.name)

# Action outputs
action_dist = dist_cls(logits)
action_dist = dist_cls(logits, model_config=self.config["model"])
self.output_actions = action_dist.sample()

# Training inputs
@@ -164,7 +164,8 @@ def _build_value_loss(self, state_values, cum_rwds):
def _build_policy_loss(self, state_values, cum_rwds, logits, actions,
action_space):
return ReweightedImitationLoss(state_values, cum_rwds, logits, actions,
action_space, self.config["beta"])
action_space, self.config["beta"],
self.config["model"])

@override(TFPolicy)
def extra_compute_grad_fetches(self):
12 changes: 8 additions & 4 deletions python/ray/rllib/agents/ppo/ppo_policy.py
@@ -38,7 +38,8 @@ def __init__(self,
clip_param=0.1,
vf_clip_param=0.1,
vf_loss_coeff=1.0,
use_gae=True):
use_gae=True,
model_config=None):
"""Constructs the loss for Proximal Policy Objective.

Arguments:
@@ -64,13 +65,15 @@
vf_clip_param (float): Clip parameter for the value function
vf_loss_coeff (float): Coefficient of the value function loss
use_gae (bool): If true, use the Generalized Advantage Estimator.
model_config (dict): (Optional) model config for use in specifying
action distributions.
"""

def reduce_mean_valid(t):
return tf.reduce_mean(tf.boolean_mask(t, valid_mask))

dist_cls, _ = ModelCatalog.get_action_dist(action_space, {})
prev_dist = dist_cls(logits)
dist_cls, _ = ModelCatalog.get_action_dist(action_space, model_config)
prev_dist = dist_cls(logits, model_config=model_config)
# Make loss functions.
logp_ratio = tf.exp(
curr_action_dist.logp(actions) - prev_dist.logp(actions))
@@ -129,7 +132,8 @@ def ppo_surrogate_loss(policy, batch_tensors):
clip_param=policy.config["clip_param"],
vf_clip_param=policy.config["vf_clip_param"],
vf_loss_coeff=policy.config["vf_loss_coeff"],
use_gae=policy.config["use_gae"])
use_gae=policy.config["use_gae"],
model_config=policy.config["model"])

return policy.loss_obj.loss

68 changes: 60 additions & 8 deletions python/ray/rllib/models/action_dist.py
@@ -28,11 +28,14 @@ class ActionDistribution(object):

Args:
inputs (Tensor): The input vector to compute samples from.
model_config (dict): Optional model config dict
(as defined in catalog.py)
"""

@DeveloperAPI
def __init__(self, inputs):
def __init__(self, inputs, model_config=None):
Review comment (Contributor): To avoid accidents where we forget to pass this, consider making it a required argument.
Reply (Contributor, Author): I lean towards making it required, but this may break other users' code.
Reply (ericl, Jul 13, 2019): Since this is DeveloperAPI, I would say it's ok to err on the side of avoiding bugs vs. backwards compatibility.
self.inputs = inputs
self.model_config = model_config
self.sample_op = self._build_sample_op()

@DeveloperAPI
@@ -69,6 +72,25 @@ def sampled_action_prob(self):
"""Returns the log probability of the sampled action."""
return tf.exp(self.logp(self.sample_op))

@DeveloperAPI
@staticmethod
Review comment (Contributor): Suggested change: use @classmethod instead of @staticmethod.
Reply (Contributor, Author): I don't really understand this suggestion. Why do you think this should be a class method?
Reply (Contributor): Hm, I guess staticmethod is fine, since you don't really need the class.
def parameter_shape_for_action_space(action_space, model_config=None):
Review comment (Contributor): How about required_model_output_size?

"""Returns the required shape of an input parameter tensor for a
particular action space and an optional dict of distribution-specific
options.

Args:
action_space (gym.Space): The action space this distribution will
be used for, whose shape attributes will be used to determine
the required shape of the input parameter tensor.
model_config (dict): Model's config dict (as defined in catalog.py)

Returns:
dist_dim (int or np.ndarray of ints): size of the required
input vector (minus leading batch dimension).
"""
raise NotImplementedError

Review comment (Contributor): Suggested change: rename the return value dist_dim to model_output_size.


class Categorical(ActionDistribution):
"""Categorical distribution for discrete action spaces."""
@@ -122,16 +144,22 @@ def kl(self, other):
def _build_sample_op(self):
return tf.squeeze(tf.multinomial(self.inputs, 1), axis=1)

@staticmethod
@override(ActionDistribution)
def parameter_shape_for_action_space(action_space, model_config=None):
return action_space.n


class MultiCategorical(ActionDistribution):
"""Categorical distribution for discrete action spaces."""

def __init__(self, inputs, input_lens):
def __init__(self, inputs, input_lens, model_config=None):
self.cats = [
Categorical(input_)
for input_ in tf.split(inputs, input_lens, axis=1)
]
self.sample_op = self._build_sample_op()
self.model_config = model_config

def logp(self, actions):
# If tensor is provided, unstack it into list
@@ -158,12 +186,12 @@ class DiagGaussian(ActionDistribution):
second half the gaussian standard deviations.
"""

def __init__(self, inputs):
def __init__(self, inputs, model_config=None):
mean, log_std = tf.split(inputs, 2, axis=1)
self.mean = mean
self.log_std = log_std
self.std = tf.exp(log_std)
ActionDistribution.__init__(self, inputs)
ActionDistribution.__init__(self, inputs, model_config)

@override(ActionDistribution)
def logp(self, x):
@@ -191,6 +219,11 @@ def entropy(self):
def _build_sample_op(self):
return self.mean + self.std * tf.random_normal(tf.shape(self.mean))

@staticmethod
@override(ActionDistribution)
def parameter_shape_for_action_space(action_space, model_config=None):
return action_space.shape[0] * 2
Review comment (Contributor): np.product(action_space.shape)? Here and elsewhere?



class Deterministic(ActionDistribution):
"""Action distribution that returns the input values directly.
@@ -206,21 +239,35 @@ def sampled_action_prob(self):
def _build_sample_op(self):
return self.inputs

@staticmethod
@override(ActionDistribution)
def parameter_shape_for_action_space(action_space, model_config=None):
return action_space.shape[0]


class MultiActionDistribution(ActionDistribution):
"""Action distribution that operates for list of actions.

Args:
inputs (Tensor list): A list of tensors from which to compute samples.
model_config (dict): Config dict for the model (as defined in
catalog.py)
"""

def __init__(self, inputs, action_space, child_distributions, input_lens):
def __init__(self,
inputs,
action_space,
child_distributions,
input_lens,
model_config=None):
self.input_lens = input_lens
split_inputs = tf.split(inputs, self.input_lens, axis=1)
child_list = []
for i, distribution in enumerate(child_distributions):
child_list.append(distribution(split_inputs[i]))
child_list.append(
distribution(split_inputs[i], model_config=model_config))
self.child_distributions = child_list
self.model_config = model_config

@override(ActionDistribution)
def logp(self, x):
@@ -278,7 +325,7 @@ class Dirichlet(ActionDistribution):

e.g. actions that represent resource allocation."""

def __init__(self, inputs):
def __init__(self, inputs, model_config=None):
"""Input is a tensor of logits. The exponential of logits is used to
parametrize the Dirichlet distribution as all parameters need to be
positive. An arbitrary small epsilon is added to the concentration
@@ -293,7 +340,7 @@ def __init__(self, inputs):
validate_args=True,
allow_nan_stats=False,
)
ActionDistribution.__init__(self, concentration)
ActionDistribution.__init__(self, concentration, model_config)

@override(ActionDistribution)
def logp(self, x):
@@ -315,3 +362,8 @@ def kl(self, other):
@override(ActionDistribution)
def _build_sample_op(self):
return self.dist.sample()

@staticmethod
@override(ActionDistribution)
def parameter_shape_for_action_space(action_space, model_config=None):
return action_space.shape[0]
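Editor's note: to illustrate the pattern this file introduces, here is a minimal sketch of a user-defined distribution that opts into both additions — a constructor accepting model_config, and a parameter_shape_for_action_space static method. The TemperatureCategorical class and the "temperature" custom option are illustrative assumptions, not part of this PR.

# Sketch only — not part of this PR. The "temperature" option is an
# illustrative assumption; the module path is the file changed above.
import tensorflow as tf

from ray.rllib.models.action_dist import Categorical


class TemperatureCategorical(Categorical):
    """Categorical distribution whose logits are rescaled by a temperature.

    The temperature is read from model_config["custom_options"], the dict
    that ModelCatalog now forwards to every distribution constructor.
    """

    def __init__(self, inputs, model_config=None):
        options = (model_config or {}).get("custom_options", {})
        temperature = options.get("temperature", 1.0)
        # Dividing the logits by the temperature flattens (T > 1) or
        # sharpens (T < 1) the sampled categorical distribution.
        Categorical.__init__(self, inputs / temperature, model_config)

    @staticmethod
    def parameter_shape_for_action_space(action_space, model_config=None):
        # One logit per discrete action, same as the built-in Categorical.
        return action_space.n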
71 changes: 48 additions & 23 deletions python/ray/rllib/models/catalog.py
@@ -8,7 +8,7 @@
from functools import partial

from ray.tune.registry import RLLIB_MODEL, RLLIB_PREPROCESSOR, \
_global_registry
RLLIB_ACTION_DIST, _global_registry

from ray.rllib.models.extra_spaces import Simplex
from ray.rllib.models.action_dist import (Categorical, MultiCategorical,
@@ -80,6 +80,8 @@
"custom_preprocessor": None,
# Name of a custom model to use
"custom_model": None,
# Name of a custom action distribution to use
"custom_action_dist": None,
# Extra options to pass to the custom classes
"custom_options": {},
}
@@ -119,46 +121,57 @@ def get_action_dist(action_space, config, dist_type=None, torch=False):
"""

config = config or MODEL_DEFAULTS
if isinstance(action_space, gym.spaces.Box):
if config.get("custom_action_dist"):
action_dist_name = config["custom_action_dist"]
logger.debug(
"Using custom action distribution {}".format(action_dist_name))
dist = _global_registry.get(RLLIB_ACTION_DIST, action_dist_name)

elif isinstance(action_space, gym.spaces.Box):
if len(action_space.shape) > 1:
raise UnsupportedSpaceException(
"Action space has multiple dimensions "
"{}. ".format(action_space.shape) +
"Consider reshaping this into a single dimension, "
"using a custom action distribution, "
"using a Tuple action space, or the multi-agent API.")
if dist_type is None:
dist = TorchDiagGaussian if torch else DiagGaussian
return dist, action_space.shape[0] * 2
elif dist_type == "deterministic":
return Deterministic, action_space.shape[0]
dist = Deterministic
elif isinstance(action_space, gym.spaces.Discrete):
dist = TorchCategorical if torch else Categorical
return dist, action_space.n
elif isinstance(action_space, gym.spaces.Tuple):
if torch:
raise NotImplementedError("Tuple action spaces not supported "
"for Pytorch.")
child_dist = []
input_lens = []
for action in action_space.spaces:
dist, action_size = ModelCatalog.get_action_dist(
action, config)
child_dist.append(dist)
input_lens.append(action_size)
if torch:
raise NotImplementedError
return partial(
MultiActionDistribution,
child_distributions=child_dist,
action_space=action_space,
input_lens=input_lens), sum(input_lens)
elif isinstance(action_space, Simplex):
if torch:
raise NotImplementedError
return Dirichlet, action_space.shape[0]
elif isinstance(action_space, gym.spaces.multi_discrete.MultiDiscrete):
raise NotImplementedError("Simplex action spaces not "
"supported for Pytorch.")
dist = Dirichlet
elif isinstance(action_space, gym.spaces.MultiDiscrete):
if torch:
raise NotImplementedError
raise NotImplementedError("MultiDiscrete action spaces not "
"supported for Pytorch.")
return partial(MultiCategorical, input_lens=action_space.nvec), \
int(sum(action_space.nvec))

return dist, dist.parameter_shape_for_action_space(
action_space, config)

Review comment (Contributor): Seems like now you could simplify this to return just dist (or this cleanup can be done later).

raise NotImplementedError("Unsupported args: {} {}".format(
action_space, dist_type))

@@ -173,11 +186,16 @@ def get_action_placeholder(action_space):
action_placeholder (Tensor): A placeholder for the actions
"""

if isinstance(action_space, gym.spaces.Box):
return tf.placeholder(
tf.float32, shape=(None, action_space.shape[0]), name="action")
elif isinstance(action_space, gym.spaces.Discrete):
if isinstance(action_space, gym.spaces.Discrete):
return tf.placeholder(tf.int64, shape=(None, ), name="action")
elif isinstance(action_space, (gym.spaces.Box, Simplex)):
return tf.placeholder(
tf.float32, shape=(None, ) + action_space.shape, name="action")
elif isinstance(action_space, gym.spaces.MultiDiscrete):
return tf.placeholder(
tf.as_dtype(action_space.dtype),
shape=(None, ) + action_space.shape,
name="action")
elif isinstance(action_space, gym.spaces.Tuple):
size = 0
all_discrete = True
@@ -191,14 +209,6 @@ def get_action_placeholder(action_space):
tf.int64 if all_discrete else tf.float32,
shape=(None, size),
name="action")
elif isinstance(action_space, Simplex):
return tf.placeholder(
tf.float32, shape=(None, action_space.shape[0]), name="action")
elif isinstance(action_space, gym.spaces.multi_discrete.MultiDiscrete):
return tf.placeholder(
tf.as_dtype(action_space.dtype),
shape=(None, len(action_space.nvec)),
name="action")
else:
raise NotImplementedError("action space {}"
" not supported".format(action_space))
@@ -476,3 +486,18 @@ def register_custom_model(model_name, model_class):
model_class (type): Python class of the model.
"""
_global_registry.register(RLLIB_MODEL, model_name, model_class)

@staticmethod
@PublicAPI
def register_custom_action_dist(action_dist_name, action_dist_class):
"""Register a custom action distribution class by name.

The action distribution can be later used by specifying
{"custom_action_dist": action_dist_name} in the model config.

Args:
action_dist_name (str): Name to register the action distribution under.
action_dist_class (type): Python class of the action distribution.
"""
_global_registry.register(RLLIB_ACTION_DIST, action_dist_name,
action_dist_class)
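Editor's note: a hedged usage sketch tying together the two pieces this diff adds — ModelCatalog.register_custom_action_dist and the "custom_action_dist" model-config key. It assumes the TemperatureCategorical class from the earlier sketch is importable; the environment, registration name, and option values are arbitrary, not part of this PR.

# Usage sketch only — not part of this PR. Assumes TemperatureCategorical
# from the earlier sketch; env, names, and option values are arbitrary.
import ray
from ray import tune
from ray.rllib.models import ModelCatalog

ray.init()

# Register the distribution under a name the config can reference.
ModelCatalog.register_custom_action_dist("temperature_categorical",
                                         TemperatureCategorical)

tune.run(
    "PPO",
    stop={"training_iteration": 1},
    config={
        "env": "CartPole-v0",
        "model": {
            # Picked up by ModelCatalog.get_action_dist before any of the
            # built-in action-space checks run.
            "custom_action_dist": "temperature_categorical",
            # Forwarded to the distribution via model_config["custom_options"].
            "custom_options": {"temperature": 1.5},
        },
    })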