diff --git a/doc/source/rllib-models.rst b/doc/source/rllib-models.rst
index 09c49e2751bf..9e7070b66c48 100644
--- a/doc/source/rllib-models.rst
+++ b/doc/source/rllib-models.rst
@@ -233,7 +233,7 @@ In this example we overrode existing methods of the existing DDPG policy graph,
 Variable-length / Parametric Action Spaces
 ------------------------------------------
 
-Custom models can be used to work with environments where (1) the set of valid actions varies per step, and/or (2) the number of valid actions is very large, as in `OpenAI Five `__ and `Horizon `__. The general idea is that the meaning of actions can be completely conditioned on the observation, that is, the ``a`` in ``Q(s, a)`` is just a token in ``[0, MAX_AVAIL_ACTIONS)`` that only has meaning in the context of ``s``. This works with algorithms in the `DQN and policy-gradient families `__ and can be implemented as follows:
+Custom models can be used to work with environments where (1) the set of valid actions varies per step, and/or (2) the number of valid actions is very large, as in `OpenAI Five `__ and `Horizon `__. The general idea is that the meaning of actions can be completely conditioned on the observation, i.e., the ``a`` in ``Q(s, a)`` becomes just a token in ``[0, MAX_AVAIL_ACTIONS)`` that only has meaning in the context of ``s``. This works with algorithms in the `DQN and policy-gradient families `__ and can be implemented as follows:
 
 1. The environment should return a mask and/or list of valid action embeddings as part of the observation for each step. To enable batching, the number of actions can be allowed to vary from 1 to some max number:
 
diff --git a/python/ray/rllib/agents/agent.py b/python/ray/rllib/agents/agent.py
index f0d9510756b9..9dbc267cf5bb 100644
--- a/python/ray/rllib/agents/agent.py
+++ b/python/ray/rllib/agents/agent.py
@@ -61,6 +61,8 @@
     # Whether to clip rewards prior to experience postprocessing. Setting to
     # None means clip for Atari only.
     "clip_rewards": None,
+    # Whether to np.clip() actions to the action space low/high range spec.
+    "clip_actions": True,
     # Whether to use rllib or deepmind preprocessors by default
     "preprocessor_pref": "deepmind",
 
@@ -226,6 +228,7 @@ def session_creator():
             num_envs=config["num_envs_per_worker"],
             observation_filter=config["observation_filter"],
             clip_rewards=config["clip_rewards"],
+            clip_actions=config["clip_actions"],
             env_config=config["env_config"],
             model_config=config["model"],
             policy_config=config,
diff --git a/python/ray/rllib/evaluation/policy_evaluator.py b/python/ray/rllib/evaluation/policy_evaluator.py
index db5f7ee887b2..33d5ee219ca1 100644
--- a/python/ray/rllib/evaluation/policy_evaluator.py
+++ b/python/ray/rllib/evaluation/policy_evaluator.py
@@ -100,6 +100,7 @@ def __init__(self,
                  num_envs=1,
                  observation_filter="NoFilter",
                  clip_rewards=None,
+                 clip_actions=True,
                  env_config=None,
                  model_config=None,
                  policy_config=None,
@@ -155,6 +156,8 @@ def __init__(self,
             clip_rewards (bool): Whether to clip rewards to [-1, 1] prior
                 to experience postprocessing. Setting to None means clip for
                 Atari only.
+            clip_actions (bool): Whether to clip action values to the range
+                specified by the policy action space.
             env_config (dict): Config to pass to the env creator.
             model_config (dict): Config to use when creating the policy model.
             policy_config (dict): Config to pass to the policy. In the
@@ -289,7 +292,8 @@ def make_env(vector_index):
                 self.callbacks,
                 horizon=episode_horizon,
                 pack=pack_episodes,
-                tf_sess=self.tf_sess)
+                tf_sess=self.tf_sess,
+                clip_actions=clip_actions)
             self.sampler.start()
         else:
             self.sampler = SyncSampler(
@@ -302,7 +306,8 @@ def make_env(vector_index):
                 self.callbacks,
                 horizon=episode_horizon,
                 pack=pack_episodes,
-                tf_sess=self.tf_sess)
+                tf_sess=self.tf_sess,
+                clip_actions=clip_actions)
 
         logger.debug("Created evaluator with env {} ({}), policies {}".format(
             self.async_env, self.env, self.policy_map))
diff --git a/python/ray/rllib/evaluation/sampler.py b/python/ray/rllib/evaluation/sampler.py
index 2c6411f33510..0bda18bc0361 100644
--- a/python/ray/rllib/evaluation/sampler.py
+++ b/python/ray/rllib/evaluation/sampler.py
@@ -2,6 +2,7 @@
 from __future__ import division
 from __future__ import print_function
 
+import gym
 from collections import defaultdict, namedtuple
 import logging
 import numpy as np
@@ -47,7 +48,8 @@ def __init__(self,
                  callbacks,
                  horizon=None,
                  pack=False,
-                 tf_sess=None):
+                 tf_sess=None,
+                 clip_actions=True):
         self.async_vector_env = AsyncVectorEnv.wrap_async(env)
         self.unroll_length = unroll_length
         self.horizon = horizon
@@ -58,7 +60,8 @@ def __init__(self,
         self.rollout_provider = _env_runner(
             self.async_vector_env, self.extra_batches.put, self.policies,
             self.policy_mapping_fn, self.unroll_length, self.horizon,
-            self._obs_filters, clip_rewards, pack, callbacks, tf_sess)
+            self._obs_filters, clip_rewards, clip_actions, pack, callbacks,
+            tf_sess)
         self.metrics_queue = queue.Queue()
 
     def get_data(self):
@@ -104,7 +107,8 @@ def __init__(self,
                  callbacks,
                  horizon=None,
                  pack=False,
-                 tf_sess=None):
+                 tf_sess=None,
+                 clip_actions=True):
         for _, f in obs_filters.items():
             assert getattr(f, "is_concurrent", False), \
                 "Observation Filter must support concurrent updates."
@@ -123,6 +127,7 @@ def __init__(self,
         self.pack = pack
         self.tf_sess = tf_sess
         self.callbacks = callbacks
+        self.clip_actions = clip_actions
 
     def run(self):
         try:
@@ -135,8 +140,8 @@ def _run(self):
         rollout_provider = _env_runner(
             self.async_vector_env, self.extra_batches.put, self.policies,
             self.policy_mapping_fn, self.unroll_length, self.horizon,
-            self._obs_filters, self.clip_rewards, self.pack, self.callbacks,
-            self.tf_sess)
+            self._obs_filters, self.clip_rewards, self.clip_actions, self.pack,
+            self.callbacks, self.tf_sess)
         while True:
             # The timeout variable exists because apparently, if one worker
             # dies, the other workers won't die with it, unless the timeout is
@@ -197,6 +202,7 @@ def _env_runner(async_vector_env,
                 horizon,
                 obs_filters,
                 clip_rewards,
+                clip_actions,
                 pack,
                 callbacks,
                 tf_sess=None):
@@ -217,6 +223,7 @@ def _env_runner(async_vector_env,
         clip_rewards (bool): Whether to clip rewards before postprocessing.
         pack (bool): Whether to pack multiple episodes into each batch. This
             guarantees batches will be exactly `unroll_length` in size.
+        clip_actions (bool): Whether to clip actions to the space range.
         callbacks (dict): User callbacks to run on episode events.
         tf_sess (Session|None): Optional tensorflow session to use for
             batching TF policy evaluations.
@@ -272,7 +279,7 @@ def new_episode():
 
         # Do batched policy eval
         eval_results = _do_policy_eval(tf_sess, to_eval, policies,
-                                       active_episodes)
+                                       active_episodes, clip_actions)
 
         # Process results and update episode state
         actions_to_send = _process_policy_eval_results(
@@ -413,7 +420,7 @@ def _process_observations(async_vector_env, policies, batch_builder_pool,
     return active_envs, to_eval, outputs
 
 
-def _do_policy_eval(tf_sess, to_eval, policies, active_episodes):
+def _do_policy_eval(tf_sess, to_eval, policies, active_episodes, clip_actions):
     """Call compute actions on observation batches to get next actions.
 
     Returns:
@@ -448,6 +455,13 @@ def _do_policy_eval(tf_sess, to_eval, policies, active_episodes):
         for k, v in pending_fetches.items():
             eval_results[k] = builder.get(v)
 
+    if clip_actions:
+        for policy_id, results in eval_results.items():
+            policy = _get_or_raise(policies, policy_id)
+            actions, rnn_out_cols, pi_info_cols = results
+            eval_results[policy_id] = (_clip_actions(
+                actions, policy.action_space), rnn_out_cols, pi_info_cols)
+
     return eval_results
 
 
@@ -516,6 +530,31 @@ def _fetch_atari_metrics(async_vector_env):
     return atari_out
 
 
+def _clip_actions(actions, space):
+    """Called to clip actions to the specified range of this policy.
+
+    Arguments:
+        actions: Batch of actions or TupleActions.
+        space: Action space the actions should be present in.
+
+    Returns:
+        Clipped batch of actions.
+    """
+
+    if isinstance(space, gym.spaces.Box):
+        return np.clip(actions, space.low, space.high)
+    elif isinstance(space, gym.spaces.Tuple):
+        if not isinstance(actions, TupleActions):
+            raise ValueError("Expected tuple space for actions {}: {}".format(
+                actions, space))
+        out = []
+        for a, s in zip(actions.batches, space.spaces):
+            out.append(_clip_actions(a, s))
+        return TupleActions(out)
+    else:
+        return actions
+
+
 def _unbatch_tuple_actions(action_batch):
     # convert list of batches -> batch of lists
     if isinstance(action_batch, TupleActions):
diff --git a/python/ray/rllib/models/action_dist.py b/python/ray/rllib/models/action_dist.py
index 75a43deeb789..76d45e244151 100644
--- a/python/ray/rllib/models/action_dist.py
+++ b/python/ray/rllib/models/action_dist.py
@@ -95,19 +95,10 @@ class DiagGaussian(ActionDistribution):
     second half the gaussian standard deviations.
     """
 
-    def __init__(self, inputs, low=None, high=None):
+    def __init__(self, inputs):
         ActionDistribution.__init__(self, inputs)
         mean, log_std = tf.split(inputs, 2, axis=1)
         self.mean = mean
-        self.low = low
-        self.high = high
-
-        # Squash to range if specified. We use a sigmoid here this to avoid the
-        # mean drifting too far past the bounds and causing nan outputs.
-        # https://github.com/ray-project/ray/issues/1862
-        if low is not None:
-            self.mean = low + tf.sigmoid(self.mean) * (high - low)
-
         self.log_std = log_std
         self.std = tf.exp(log_std)
 
@@ -131,10 +122,7 @@ def entropy(self):
                              reduction_indices=[1])
 
     def sample(self):
-        out = self.mean + self.std * tf.random_normal(tf.shape(self.mean))
-        if self.low is not None:
-            out = tf.clip_by_value(out, self.low, self.high)
-        return out
+        return self.mean + self.std * tf.random_normal(tf.shape(self.mean))
 
 
 class Deterministic(ActionDistribution):
@@ -147,34 +135,6 @@ def sample(self):
         return self.inputs
 
 
-def squash_to_range(dist_cls, low, high):
-    """Squashes an action distribution to a range in (low, high).
-
-    Arguments:
-        dist_cls (class): ActionDistribution class to wrap.
-        low (float|array): Scalar value or array of values.
-        high (float|array): Scalar value or array of values.
-    """
-
-    class SquashToRangeWrapper(dist_cls):
-        def __init__(self, inputs):
-            dist_cls.__init__(self, inputs, low=low, high=high)
-
-        def logp(self, x):
-            return dist_cls.logp(self, x)
-
-        def kl(self, other):
-            return dist_cls.kl(self, other)
-
-        def entropy(self):
-            return dist_cls.entropy(self)
-
-        def sample(self):
-            return dist_cls.sample(self)
-
-    return SquashToRangeWrapper
-
-
 class MultiActionDistribution(ActionDistribution):
     """Action distribution that operates for list of actions.
 
diff --git a/python/ray/rllib/models/catalog.py b/python/ray/rllib/models/catalog.py
index 63a7e73890cc..f9e8af2829f8 100644
--- a/python/ray/rllib/models/catalog.py
+++ b/python/ray/rllib/models/catalog.py
@@ -15,8 +15,7 @@
 from ray.rllib.env.external_env import ExternalEnv
 from ray.rllib.env.vector_env import VectorEnv
 from ray.rllib.models.action_dist import (
-    Categorical, Deterministic, DiagGaussian, MultiActionDistribution,
-    squash_to_range)
+    Categorical, Deterministic, DiagGaussian, MultiActionDistribution)
 from ray.rllib.models.preprocessors import get_preprocessor
 from ray.rllib.models.fcnet import FullyConnectedNetwork
 from ray.rllib.models.visionnet import VisionNetwork
@@ -38,7 +37,7 @@
     "fcnet_hiddens": [256, 256],
     # For control envs, documented in ray.rllib.models.Model
    "free_log_std": False,
-    # Whether to squash the action output to space range
+    # (deprecated) Whether to use sigmoid to squash actions to space range
     "squash_to_range": False,
 
     # == LSTM ==
@@ -114,8 +113,9 @@ def get_action_dist(action_space, config, dist_type=None):
             if dist_type is None:
                 dist = DiagGaussian
                 if config.get("squash_to_range"):
-                    dist = squash_to_range(dist, action_space.low,
-                                           action_space.high)
+                    raise ValueError(
+                        "The squash_to_range option is deprecated. See the "
+                        "clip_actions agent option instead.")
                 return dist, action_space.shape[0] * 2
             elif dist_type == "deterministic":
                 return Deterministic, action_space.shape[0]
diff --git a/python/ray/rllib/test/test_supported_spaces.py b/python/ray/rllib/test/test_supported_spaces.py
index b98a006bca3b..7a5e45ef3aa4 100644
--- a/python/ray/rllib/test/test_supported_spaces.py
+++ b/python/ray/rllib/test/test_supported_spaces.py
@@ -120,12 +120,15 @@ def testAll(self):
             stats,
             check_bounds=True)
         check_support("DQN", {"timesteps_per_iteration": 1}, stats)
-        check_support("A3C", {
-            "num_workers": 1,
-            "optimizer": {
-                "grads_per_step": 1
-            }
-        }, stats)
+        check_support(
+            "A3C", {
+                "num_workers": 1,
+                "optimizer": {
+                    "grads_per_step": 1
+                }
+            },
+            stats,
+            check_bounds=True)
         check_support(
             "PPO", {
                 "num_workers": 1,
@@ -133,9 +136,6 @@ def testAll(self):
                 "train_batch_size": 10,
                 "sample_batch_size": 10,
                 "sgd_minibatch_size": 1,
-                "model": {
-                    "squash_to_range": True
-                },
             },
             stats,
             check_bounds=True)
@@ -153,7 +153,13 @@ def testAll(self):
             "num_rollouts": 1,
             "rollouts_used": 1
         }, stats)
-        check_support("PG", {"num_workers": 1, "optimizer": {}}, stats)
+        check_support(
+            "PG", {
+                "num_workers": 1,
+                "optimizer": {}
+            },
+            stats,
+            check_bounds=True)
         num_unexpected_errors = 0
         for (alg, a_name, o_name), stat in sorted(stats.items()):
             if stat not in ["ok", "unsupported"]:
diff --git a/python/ray/rllib/tuned_examples/pong-ppo.yaml b/python/ray/rllib/tuned_examples/pong-ppo.yaml
index 1447481643fe..d7e273cc6e2b 100644
--- a/python/ray/rllib/tuned_examples/pong-ppo.yaml
+++ b/python/ray/rllib/tuned_examples/pong-ppo.yaml
@@ -1,17 +1,26 @@
-# On a Tesla K80 GPU, this achieves the maximum reward in about 1-1.5 hours.
+# On a single GPU, this achieves maximum reward in ~15-20 minutes.
 #
-# $ python train.py -f tuned_examples/pong-ppo.yaml --ray-num-gpus=1
+# $ python train.py -f tuned_examples/pong-ppo.yaml
 #
-# - PPO_PongDeterministic-v4_0: TERMINATED [pid=16387], 4984 s, 1117981 ts, 21 rew
-# - PPO_PongDeterministic-v4_0: TERMINATED [pid=83606], 4592 s, 1068671 ts, 21 rew
-#
-pong-deterministic-ppo:
-    env: PongDeterministic-v4
+pong-ppo:
+    env: PongNoFrameskip-v4
     run: PPO
-    stop:
-        episode_reward_mean: 21
     config:
-        gamma: 0.99
-        num_workers: 4
-        num_sgd_iter: 20
+        lambda: 0.95
+        kl_coeff: 0.5
+        clip_rewards: True
+        clip_param: 0.1
+        vf_clip_param: 10.0
+        entropy_coeff: 0.01
+        train_batch_size: 5000
+        sample_batch_size: 20
+        sgd_minibatch_size: 500
+        num_sgd_iter: 10
+        num_workers: 32
+        num_envs_per_worker: 5
+        batch_mode: truncate_episodes
+        observation_filter: NoFilter
+        vf_share_layers: true
         num_gpus: 1
+        model:
+            dim: 42
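For reference, the clipping that the new ``clip_actions`` option enables is an elementwise ``np.clip`` of the batched policy outputs against the action space bounds, applied in the sampler just before actions are sent to the environment. The sketch below is a simplified, standalone rendering of the ``_clip_actions`` helper added to ``sampler.py`` above; it is illustrative only, and it handles ``Tuple`` spaces with plain Python tuples rather than RLlib's ``TupleActions`` namedtuple::

    import gym
    import numpy as np


    def clip_to_space(actions, space):
        # Simplified stand-in for sampler.py's _clip_actions (illustrative only).
        if isinstance(space, gym.spaces.Box):
            # Elementwise clip against the per-dimension low/high bounds.
            return np.clip(actions, space.low, space.high)
        elif isinstance(space, gym.spaces.Tuple):
            # Recursively clip each sub-batch against its sub-space.
            return tuple(
                clip_to_space(a, s) for a, s in zip(actions, space.spaces))
        # Discrete and other spaces pass through unchanged.
        return actions


    space = gym.spaces.Box(low=-2.0, high=2.0, shape=(3,))
    sampled = np.array([[1.5, -3.7, 0.2], [2.9, 0.0, -2.1]])
    print(clip_to_space(sampled, space))  # out-of-range entries become -2.0 / 2.0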
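On the user-facing side, the replacement for ``model: {squash_to_range: True}`` is the new top-level ``clip_actions`` agent option, which defaults to ``True``. The following is a hypothetical migration sketch, not a tuned example from this PR; it assumes the ``PPOAgent`` class and the ``Pendulum-v0`` environment purely for illustration::

    import ray
    from ray.rllib.agents.ppo import PPOAgent

    ray.init()

    # Hypothetical config for illustration only. Previously, bounded continuous
    # actions were handled inside the model via
    #     "model": {"squash_to_range": True}   # now raises a ValueError
    # With this change, sampled actions are np.clip()'ed to the Box bounds in
    # the sampler instead; clip_actions defaults to True and is shown here
    # explicitly.
    agent = PPOAgent(
        env="Pendulum-v0",
        config={
            "num_workers": 1,
            "clip_actions": True,
        })
    print(agent.train()["episode_reward_mean"])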
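The deprecation is a hard error rather than a silent fallback. Below is a minimal sketch of what hitting the removed option now looks like, assuming this branch of RLlib is installed; the ``ModelCatalog.get_action_dist`` signature is taken from the ``catalog.py`` hunk above::

    import gym
    from ray.rllib.models import ModelCatalog

    box_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(2,))
    try:
        # A Box space with dist_type=None takes the DiagGaussian branch shown
        # in the catalog.py hunk, where the deprecated flag is now rejected.
        ModelCatalog.get_action_dist(box_space, {"squash_to_range": True})
    except ValueError as e:
        print(e)  # "The squash_to_range option is deprecated. ..."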