diff --git a/python/ray/rllib/__init__.py b/python/ray/rllib/__init__.py index a2441f0b5bf6..9086f968ed69 100644 --- a/python/ray/rllib/__init__.py +++ b/python/ray/rllib/__init__.py @@ -6,6 +6,11 @@ # This file is imported from the tune module in order to register RLlib agents. from ray.tune.registry import register_trainable +from ray.rllib.utils.policy_graph import PolicyGraph +from ray.rllib.utils.tf_policy_graph import TFPolicyGraph +from ray.rllib.utils.common_policy_evaluator import CommonPolicyEvaluator +from ray.rllib.optimizers.sample_batch import SampleBatch + def _register_all(): for key in ["PPO", "ES", "DQN", "APEX", "A3C", "BC", "PG", "DDPG", @@ -16,3 +21,7 @@ def _register_all(): _register_all() + +__all__ = [ + "PolicyGraph", "TFPolicyGraph", "CommonPolicyEvaluator", "SampleBatch" +] diff --git a/python/ray/rllib/a3c/a3c.py b/python/ray/rllib/a3c/a3c.py index 569b50c44420..8a2089db30c9 100644 --- a/python/ray/rllib/a3c/a3c.py +++ b/python/ray/rllib/a3c/a3c.py @@ -2,7 +2,6 @@ from __future__ import division from __future__ import print_function -import numpy as np import pickle import os @@ -10,14 +9,14 @@ from ray.rllib.agent import Agent from ray.rllib.optimizers import AsyncOptimizer from ray.rllib.utils import FilterManager -from ray.rllib.a3c.a3c_evaluator import A3CEvaluator, RemoteA3CEvaluator, \ - GPURemoteA3CEvaluator -from ray.tune.result import TrainingResult +from ray.rllib.utils.common_policy_evaluator import CommonPolicyEvaluator, \ + collect_metrics +from ray.rllib.a3c.common import get_policy_cls from ray.tune.trial import Resources DEFAULT_CONFIG = { # Number of workers (excluding master) - "num_workers": 4, + "num_workers": 2, # Size of rollout batch "batch_size": 10, # Use LSTM model - only applicable for image states @@ -42,6 +41,8 @@ "entropy_coeff": -0.01, # Whether to place workers on GPUs "use_gpu_for_workers": False, + # Whether to emit extra summary stats + "summarize": False, # Model and preprocessor options "model": { # (Image statespace) - Converts image to Channels = 1 @@ -78,56 +79,48 @@ def default_resource_request(cls, config): extra_gpu=cf["use_gpu_for_workers"] and cf["num_workers"] or 0) def _init(self): - self.local_evaluator = A3CEvaluator( - self.registry, - self.env_creator, - self.config, - self.logdir, - start_sampler=False) - if self.config["use_gpu_for_workers"]: - remote_cls = GPURemoteA3CEvaluator + self.policy_cls = get_policy_cls(self.config) + + if self.config["use_pytorch"]: + session_creator = None else: - remote_cls = RemoteA3CEvaluator + import tensorflow as tf + + def session_creator(): + return tf.Session( + config=tf.ConfigProto( + intra_op_parallelism_threads=1, + inter_op_parallelism_threads=1, + gpu_options=tf.GPUOptions(allow_growth=True))) + + remote_cls = CommonPolicyEvaluator.as_remote( + num_gpus=1 if self.config["use_gpu_for_workers"] else 0) + self.local_evaluator = CommonPolicyEvaluator( + self.env_creator, self.policy_cls, + batch_steps=self.config["batch_size"], + batch_mode="truncate_episodes", + tf_session_creator=session_creator, + registry=self.registry, env_config=self.config["env_config"], + model_config=self.config["model"], policy_config=self.config) self.remote_evaluators = [ - remote_cls.remote(self.registry, self.env_creator, self.config, - self.logdir) - for i in range(self.config["num_workers"]) - ] - self.optimizer = AsyncOptimizer(self.config["optimizer"], - self.local_evaluator, - self.remote_evaluators) + remote_cls.remote( + self.env_creator, self.policy_cls, + 
batch_steps=self.config["batch_size"], + batch_mode="truncate_episodes", sample_async=True, + tf_session_creator=session_creator, + registry=self.registry, env_config=self.config["env_config"], + model_config=self.config["model"], policy_config=self.config) + for i in range(self.config["num_workers"])] + + self.optimizer = AsyncOptimizer( + self.config["optimizer"], self.local_evaluator, + self.remote_evaluators) def _train(self): self.optimizer.step() - FilterManager.synchronize(self.local_evaluator.filters, - self.remote_evaluators) - res = self._fetch_metrics_from_remote_evaluators() - return res - - def _fetch_metrics_from_remote_evaluators(self): - episode_rewards = [] - episode_lengths = [] - metric_lists = [ - a.get_completed_rollout_metrics.remote() - for a in self.remote_evaluators - ] - for metrics in metric_lists: - for episode in ray.get(metrics): - episode_lengths.append(episode.episode_length) - episode_rewards.append(episode.episode_reward) - avg_reward = (np.mean(episode_rewards) - if episode_rewards else float('nan')) - avg_length = (np.mean(episode_lengths) - if episode_lengths else float('nan')) - timesteps = np.sum(episode_lengths) if episode_lengths else 0 - - result = TrainingResult( - episode_reward_mean=avg_reward, - episode_len_mean=avg_length, - timesteps_this_iter=timesteps, - info={}) - - return result + FilterManager.synchronize( + self.local_evaluator.filters, self.remote_evaluators) + return collect_metrics(self.local_evaluator, self.remote_evaluators) def _stop(self): # workaround for https://github.com/ray-project/ray/issues/1516 @@ -154,7 +147,10 @@ def _restore(self, checkpoint_path): ]) self.local_evaluator.restore(extra_data["local_state"]) - def compute_action(self, observation): + def compute_action(self, observation, state=None): + if state is None: + state = [] obs = self.local_evaluator.obs_filter(observation, update=False) - action, info = self.local_evaluator.policy.compute(obs) - return action + return self.local_evaluator.for_policy( + lambda p: p.compute_single_action( + obs, state, is_training=False)[0]) diff --git a/python/ray/rllib/a3c/a3c_evaluator.py b/python/ray/rllib/a3c/a3c_evaluator.py deleted file mode 100644 index 74d201016adf..000000000000 --- a/python/ray/rllib/a3c/a3c_evaluator.py +++ /dev/null @@ -1,119 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import pickle - -import ray -from ray.rllib.models import ModelCatalog -from ray.rllib.optimizers import PolicyEvaluator -from ray.rllib.a3c.common import get_policy_cls -from ray.rllib.utils.filter import get_filter -from ray.rllib.utils.sampler import AsyncSampler -from ray.rllib.utils.process_rollout import process_rollout - - -class A3CEvaluator(PolicyEvaluator): - """Actor object to start running simulation on workers. - - The gradient computation is also executed from this object. - - Attributes: - policy: Copy of graph used for policy. Used by sampler and gradients. - obs_filter: Observation filter used in environment sampling - rew_filter: Reward filter used in rollout post-processing. - sampler: Component for interacting with environment and generating - rollouts. - logdir: Directory for logging. 
- """ - def __init__( - self, registry, env_creator, config, logdir, start_sampler=True): - env = ModelCatalog.get_preprocessor_as_wrapper( - registry, env_creator(config["env_config"]), config["model"]) - self.env = env - policy_cls = get_policy_cls(config) - # TODO(rliaw): should change this to be just env.observation_space - self.policy = policy_cls( - registry, env.observation_space.shape, env.action_space, config) - self.config = config - - # Technically not needed when not remote - self.obs_filter = get_filter( - config["observation_filter"], env.observation_space.shape) - self.rew_filter = get_filter(config["reward_filter"], ()) - self.filters = {"obs_filter": self.obs_filter, - "rew_filter": self.rew_filter} - self.sampler = AsyncSampler(env, self.policy, self.obs_filter, - config["batch_size"]) - if start_sampler and self.sampler._async: - self.sampler.start() - self.logdir = logdir - - def sample(self): - rollout = self.sampler.get_data() - samples = process_rollout( - rollout, self.rew_filter, gamma=self.config["gamma"], - lambda_=self.config["lambda"], use_gae=True) - return samples - - def get_completed_rollout_metrics(self): - """Returns metrics on previously completed rollouts. - - Calling this clears the queue of completed rollout metrics. - """ - return self.sampler.get_metrics() - - def compute_gradients(self, samples): - gradient, info = self.policy.compute_gradients(samples) - return gradient, {} - - def apply_gradients(self, grads): - self.policy.apply_gradients(grads) - - def get_weights(self): - return self.policy.get_weights() - - def set_weights(self, params): - self.policy.set_weights(params) - - def save(self): - filters = self.get_filters(flush_after=True) - weights = self.get_weights() - return pickle.dumps({ - "filters": filters, - "weights": weights}) - - def restore(self, objs): - objs = pickle.loads(objs) - self.sync_filters(objs["filters"]) - self.set_weights(objs["weights"]) - - def sync_filters(self, new_filters): - """Changes self's filter to given and rebases any accumulated delta. - - Args: - new_filters (dict): Filters with new state to update local copy. - """ - assert all(k in new_filters for k in self.filters) - for k in self.filters: - self.filters[k].sync(new_filters[k]) - - def get_filters(self, flush_after=False): - """Returns a snapshot of filters. - - Args: - flush_after (bool): Clears the filter buffer state. 
-
-        Returns:
-            return_filters (dict): Dict for serializable filters
-        """
-        return_filters = {}
-        for k, f in self.filters.items():
-            return_filters[k] = f.as_serializable()
-            if flush_after:
-                f.clear_buffer()
-        return return_filters
-
-
-RemoteA3CEvaluator = ray.remote(A3CEvaluator)
-GPURemoteA3CEvaluator = ray.remote(num_gpus=1)(A3CEvaluator)
diff --git a/python/ray/rllib/a3c/a3c_tf_policy.py b/python/ray/rllib/a3c/a3c_tf_policy.py
new file mode 100644
index 000000000000..e2a8da233880
--- /dev/null
+++ b/python/ray/rllib/a3c/a3c_tf_policy.py
@@ -0,0 +1,103 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+import gym
+
+from ray.rllib.utils.error import UnsupportedSpaceException
+from ray.rllib.utils.process_rollout import compute_advantages
+from ray.rllib.utils.tf_policy_graph import TFPolicyGraph
+
+
+class A3CTFPolicyGraph(TFPolicyGraph):
+    """The TF policy base class."""
+
+    def __init__(self, ob_space, action_space, registry, config):
+        self.registry = registry
+        self.local_steps = 0
+        self.config = config
+        self.summarize = config.get("summarize")
+
+        self._setup_graph(ob_space, action_space)
+        assert all(hasattr(self, attr)
+                   for attr in ["vf", "logits", "x", "var_list"])
+        print("Setting up loss")
+        self.setup_loss(action_space)
+        self.is_training = tf.placeholder_with_default(True, ())
+        self.sess = tf.get_default_session()
+
+        TFPolicyGraph.__init__(
+            self, self.sess, obs_input=self.x,
+            action_sampler=self.action_dist.sample(), loss=self.loss,
+            loss_inputs=self.loss_in, is_training=self.is_training,
+            state_inputs=self.state_in, state_outputs=self.state_out)
+
+        self.sess.run(tf.global_variables_initializer())
+
+        if self.summarize:
+            bs = tf.to_float(tf.shape(self.x)[0])
+            tf.summary.scalar("model/policy_loss", self.pi_loss / bs)
+            tf.summary.scalar("model/value_loss", self.vf_loss / bs)
+            tf.summary.scalar("model/entropy", self.entropy / bs)
+            tf.summary.scalar("model/grad_gnorm", tf.global_norm(self._grads))
+            tf.summary.scalar("model/var_gnorm", tf.global_norm(self.var_list))
+            self.summary_op = tf.summary.merge_all()
+
+    def _setup_graph(self, ob_space, ac_space):
+        raise NotImplementedError
+
+    def setup_loss(self, action_space):
+        if isinstance(action_space, gym.spaces.Box):
+            ac_size = action_space.shape[0]
+            self.ac = tf.placeholder(tf.float32, [None, ac_size], name="ac")
+        elif isinstance(action_space, gym.spaces.Discrete):
+            self.ac = tf.placeholder(tf.int64, [None], name="ac")
+        else:
+            raise UnsupportedSpaceException(
+                "Action space {} is not supported for A3C.".format(
+                    action_space))
+        self.adv = tf.placeholder(tf.float32, [None], name="adv")
+        self.r = tf.placeholder(tf.float32, [None], name="r")
+
+        log_prob = self.action_dist.logp(self.ac)
+
+        # The "policy gradients" loss: its derivative is precisely the policy
+        # gradient. Notice that self.ac is a placeholder that is provided
+        # externally. adv will contain the advantages, as calculated in
+        # compute_advantages.
+ self.pi_loss = - tf.reduce_sum(log_prob * self.adv) + + delta = self.vf - self.r + self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta)) + self.entropy = tf.reduce_sum(self.action_dist.entropy()) + self.loss = (self.pi_loss + + self.vf_loss * self.config["vf_loss_coeff"] + + self.entropy * self.config["entropy_coeff"]) + + def optimizer(self): + return tf.train.AdamOptimizer(self.config["lr"]) + + def gradients(self, optimizer): + grads = tf.gradients(self.loss, self.var_list) + self.grads, _ = tf.clip_by_global_norm(grads, self.config["grad_clip"]) + clipped_grads = list(zip(self.grads, self.var_list)) + return clipped_grads + + def extra_compute_grad_fetches(self): + if self.summarize: + return {"summary": self.summary_op} + else: + return {} + + def postprocess_trajectory(self, sample_batch, other_agent_batches=None): + completed = sample_batch["dones"][-1] + if completed: + last_r = 0.0 + else: + next_state = [] + for i in range(len(self.state_in)): + next_state.append([sample_batch["state_out_{}".format(i)][-1]]) + last_r = self.value(sample_batch["new_obs"][-1], *next_state) + return compute_advantages( + sample_batch, last_r, self.config["gamma"], self.config["lambda"]) diff --git a/python/ray/rllib/a3c/a3c_torch_policy.py b/python/ray/rllib/a3c/a3c_torch_policy.py new file mode 100644 index 000000000000..786a21553a49 --- /dev/null +++ b/python/ray/rllib/a3c/a3c_torch_policy.py @@ -0,0 +1,113 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from threading import Lock + +import torch +import torch.nn.functional as F + +from ray.rllib.models.pytorch.misc import var_to_np, convert_batch +from ray.rllib.models.catalog import ModelCatalog +from ray.rllib.utils.process_rollout import compute_advantages +from ray.rllib.utils.policy_graph import PolicyGraph + + +class SharedTorchPolicy(PolicyGraph): + """A simple, non-recurrent PyTorch policy example.""" + + def __init__(self, obs_space, action_space, registry, config): + self.registry = registry + self.local_steps = 0 + self.config = config + self.summarize = config.get("summarize") + self.setup_graph(obs_space, action_space) + torch.set_num_threads(2) + self.lock = Lock() + + def setup_graph(self, obs_space, action_space): + _, self.logit_dim = ModelCatalog.get_action_dist(action_space) + self._model = ModelCatalog.get_torch_model( + self.registry, obs_space.shape, self.logit_dim, + self.config["model"]) + self.optimizer = torch.optim.Adam( + self._model.parameters(), lr=self.config["lr"]) + + def compute_single_action(self, obs, state, is_training=False): + assert not state, "RNN not supported" + with self.lock: + ob = torch.from_numpy(obs).float().unsqueeze(0) + logits, values = self._model(ob) + samples = F.softmax(logits, dim=1).multinomial(1).squeeze() + values = values.squeeze() + return var_to_np(samples), [], {"vf_preds": var_to_np(values)} + + def compute_gradients(self, samples): + with self.lock: + self.backward(samples) + # Note that return values are just references; + # calling zero_grad will modify the values + return [p.grad.data.numpy() for p in self._model.parameters()], {} + + def apply_gradients(self, grads): + self.optimizer.zero_grad() + for g, p in zip(grads, self._model.parameters()): + p.grad = torch.from_numpy(g) + self.optimizer.step() + return {} + + def get_weights(self): + # !! This only returns references to the data. 
+ return self._model.state_dict() + + def set_weights(self, weights): + with self.lock: + self._model.load_state_dict(weights) + + def value(self, obs): + with self.lock: + obs = torch.from_numpy(obs).float().unsqueeze(0) + res = self._model.hidden_layers(obs) + res = self._model.value_branch(res) + res = res.squeeze() + return var_to_np(res) + + def forward(self, obs_batch, actions): + logits, values = self._model(obs_batch) + log_probs = F.log_softmax(logits, dim=1) + probs = F.softmax(logits, dim=1) + action_log_probs = log_probs.gather(1, actions.view(-1, 1)) + entropy = -(log_probs * probs).sum(-1).sum() + return values, action_log_probs, entropy + + def backward(self, sample_batch): + """Loss is encoded here. + + Defining a new loss function would start by rewriting this function. + """ + + states, actions, advs, rs = convert_batch(sample_batch) + values, action_log_probs, entropy = self.forward(states, actions) + pi_err = -advs.dot(action_log_probs.reshape(-1)) + value_err = F.mse_loss(values.reshape(-1), rs) + + self.optimizer.zero_grad() + + overall_err = sum([ + pi_err, + self.config["vf_loss_coeff"] * value_err, + self.config["entropy_coeff"] * entropy, + ]) + + overall_err.backward() + torch.nn.utils.clip_grad_norm_(self._model.parameters(), + self.config["grad_clip"]) + + def postprocess_trajectory(self, sample_batch, other_agent_batches=None): + completed = sample_batch["dones"][-1] + if completed: + last_r = 0.0 + else: + last_r = self.value(sample_batch["new_obs"][-1]) + return compute_advantages( + sample_batch, last_r, self.config["gamma"], self.config["lambda"]) diff --git a/python/ray/rllib/a3c/common.py b/python/ray/rllib/a3c/common.py index da29eb452f02..cc2179c2f6ff 100644 --- a/python/ray/rllib/a3c/common.py +++ b/python/ray/rllib/a3c/common.py @@ -8,7 +8,7 @@ def get_policy_cls(config): from ray.rllib.a3c.shared_model_lstm import SharedModelLSTM policy_cls = SharedModelLSTM elif config["use_pytorch"]: - from ray.rllib.a3c.shared_torch_policy import SharedTorchPolicy + from ray.rllib.a3c.a3c_torch_policy import SharedTorchPolicy policy_cls = SharedTorchPolicy else: from ray.rllib.a3c.shared_model import SharedModel diff --git a/python/ray/rllib/a3c/policy.py b/python/ray/rllib/a3c/policy.py deleted file mode 100644 index 1e9639fd71af..000000000000 --- a/python/ray/rllib/a3c/policy.py +++ /dev/null @@ -1,28 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - - -class Policy(object): - """The policy base class.""" - def __init__(self, ob_space, action_space, name="local", summarize=True): - pass - - def apply_gradients(self, grads): - raise NotImplementedError - - def get_weights(self): - raise NotImplementedError - - def set_weights(self, weights): - raise NotImplementedError - - def compute_gradients(self, samples): - raise NotImplementedError - - def compute(self, observations): - """Compute action for a _single_ observation""" - raise NotImplementedError - - def value(self, ob): - raise NotImplementedError diff --git a/python/ray/rllib/a3c/shared_model.py b/python/ray/rllib/a3c/shared_model.py index 8209be159ed4..3a093fa906f8 100644 --- a/python/ray/rllib/a3c/shared_model.py +++ b/python/ray/rllib/a3c/shared_model.py @@ -4,30 +4,27 @@ import tensorflow as tf from ray.rllib.models.misc import linear, normc_initializer -from ray.rllib.a3c.tfpolicy import TFPolicy +from ray.rllib.a3c.a3c_tf_policy import A3CTFPolicyGraph from ray.rllib.models.catalog import ModelCatalog -class 
SharedModel(TFPolicy): +class SharedModel(A3CTFPolicyGraph): - other_output = ["vf_preds"] - is_recurrent = False - - def __init__(self, registry, ob_space, ac_space, config, **kwargs): + def __init__(self, ob_space, ac_space, registry, config, **kwargs): super(SharedModel, self).__init__( - registry, ob_space, ac_space, config, **kwargs) + ob_space, ac_space, registry, config, **kwargs) def _setup_graph(self, ob_space, ac_space): - self.x = tf.placeholder(tf.float32, [None] + list(ob_space)) + self.x = tf.placeholder(tf.float32, [None] + list(ob_space.shape)) dist_class, self.logit_dim = ModelCatalog.get_action_dist(ac_space) self._model = ModelCatalog.get_model( self.registry, self.x, self.logit_dim, self.config["model"]) self.logits = self._model.outputs - self.curr_dist = dist_class(self.logits) + self.action_dist = dist_class(self.logits) self.vf = tf.reshape(linear(self._model.last_layer, 1, "value", normc_initializer(1.0)), [-1]) - self.sample = self.curr_dist.sample() + self.sample = self.action_dist.sample() self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) self.global_step = tf.get_variable( @@ -35,28 +32,20 @@ def _setup_graph(self, ob_space, ac_space): initializer=tf.constant_initializer(0, dtype=tf.int32), trainable=False) - def compute_gradients(self, samples): - info = {} - feed_dict = { - self.x: samples["obs"], - self.ac: samples["actions"], - self.adv: samples["advantages"], - self.r: samples["value_targets"], - } - self.grads = [g for g in self.grads if g is not None] - self.local_steps += 1 - if self.summarize: - grad, summ = self.sess.run([self.grads, self.summary_op], - feed_dict=feed_dict) - info['summary'] = summ - else: - grad = self.sess.run(self.grads, feed_dict=feed_dict) - return grad, info - - def compute(self, ob, *args): - action, vf = self.sess.run([self.sample, self.vf], - {self.x: [ob]}) - return action[0], {"vf_preds": vf[0]} + self.state_in = [] + self.state_out = [] + + def setup_loss(self, action_space): + A3CTFPolicyGraph.setup_loss(self, action_space) + self.loss_in = [ + ("obs", self.x), + ("actions", self.ac), + ("advantages", self.adv), + ("value_targets", self.r), + ] + + def extra_compute_action_fetches(self): + return {"vf_preds": self.vf} def value(self, ob, *args): vf = self.sess.run(self.vf, {self.x: [ob]}) diff --git a/python/ray/rllib/a3c/shared_model_lstm.py b/python/ray/rllib/a3c/shared_model_lstm.py index 37f71e490467..7cb64e684aa6 100644 --- a/python/ray/rllib/a3c/shared_model_lstm.py +++ b/python/ray/rllib/a3c/shared_model_lstm.py @@ -5,43 +5,32 @@ import tensorflow as tf from ray.rllib.models.misc import linear, normc_initializer from ray.rllib.models.catalog import ModelCatalog -from ray.rllib.a3c.tfpolicy import TFPolicy +from ray.rllib.a3c.a3c_tf_policy import A3CTFPolicyGraph from ray.rllib.models.lstm import LSTM -class SharedModelLSTM(TFPolicy): - """ - Attributes: - other_output (list): Other than `action`, the other return values from - `compute_gradients`. - is_recurrent (bool): True if is a recurrent network (requires features - to be tracked). 
- """ +class SharedModelLSTM(A3CTFPolicyGraph): - other_output = ["vf_preds", "features"] - is_recurrent = True - - def __init__(self, registry, ob_space, ac_space, config, **kwargs): + def __init__(self, ob_space, ac_space, registry, config, **kwargs): super(SharedModelLSTM, self).__init__( - registry, ob_space, ac_space, config, **kwargs) + ob_space, ac_space, registry, config, **kwargs) def _setup_graph(self, ob_space, ac_space): - self.x = tf.placeholder(tf.float32, [None] + list(ob_space)) + self.x = tf.placeholder(tf.float32, [None] + list(ob_space.shape)) dist_class, self.logit_dim = ModelCatalog.get_action_dist(ac_space) self._model = LSTM(self.x, self.logit_dim, {}) - self.state_init = self._model.state_init self.state_in = self._model.state_in self.state_out = self._model.state_out self.logits = self._model.outputs - self.curr_dist = dist_class(self.logits) + self.action_dist = dist_class(self.logits) # with tf.variable_scope("vf"): # vf_model = ModelCatalog.get_model(self.x, 1) self.vf = tf.reshape(linear(self._model.last_layer, 1, "value", normc_initializer(1.0)), [-1]) - self.sample = self.curr_dist.sample() + self.sample = self.action_dist.sample() self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) self.global_step = tf.get_variable( @@ -49,42 +38,25 @@ def _setup_graph(self, ob_space, ac_space): initializer=tf.constant_initializer(0, dtype=tf.int32), trainable=False) - def compute_gradients(self, samples): - """Computing the gradient is actually model-dependent. + def get_initial_state(self): + return self._model.state_init - The LSTM needs its hidden states in order to compute the gradient - accurately. - """ - features = samples["features"][0] - feed_dict = { - self.x: samples["obs"], - self.ac: samples["actions"], - self.adv: samples["advantages"], - self.r: samples["value_targets"], - self.state_in[0]: features[0], - self.state_in[1]: features[1] - } - info = {} - self.local_steps += 1 - if self.summarize and self.local_steps % 10 == 0: - grad, summ = self.sess.run([self.grads, self.summary_op], - feed_dict=feed_dict) - info['summary'] = summ - else: - grad = self.sess.run(self.grads, feed_dict=feed_dict) - return grad, info + def setup_loss(self, action_space): + A3CTFPolicyGraph.setup_loss(self, action_space) + self.loss_in = [ + ("obs", self.x), + ("actions", self.ac), + ("advantages", self.adv), + ("value_targets", self.r), + ("state_in_0", self.state_in[0]), + ("state_in_1", self.state_in[1]), + ] - def compute(self, ob, c, h): - action, vf, c, h = self.sess.run( - [self.sample, self.vf] + self.state_out, - {self.x: [ob], self.state_in[0]: c, self.state_in[1]: h}) - return action[0], {"vf_preds": vf[0], "features": (c, h)} + def extra_compute_action_fetches(self): + return {"vf_preds": self.vf} def value(self, ob, c, h): vf = self.sess.run(self.vf, {self.x: [ob], self.state_in[0]: c, self.state_in[1]: h}) return vf[0] - - def get_initial_features(self): - return self.state_init diff --git a/python/ray/rllib/a3c/tfpolicy.py b/python/ray/rllib/a3c/tfpolicy.py deleted file mode 100644 index 1fbb46bdfe78..000000000000 --- a/python/ray/rllib/a3c/tfpolicy.py +++ /dev/null @@ -1,106 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf -import ray -import gym -from ray.rllib.a3c.policy import Policy - - -class TFPolicy(Policy): - """The policy base class.""" - def __init__(self, registry, ob_space, action_space, config, - name="local", 
summarize=True): - self.registry = registry - self.local_steps = 0 - self.config = config - self.summarize = summarize - worker_device = "/job:localhost/replica:0/task:0/cpu:0" - self.g = tf.Graph() - with self.g.as_default(), tf.device(worker_device): - with tf.variable_scope(name): - self._setup_graph(ob_space, action_space) - assert all(hasattr(self, attr) - for attr in ["vf", "logits", "x", "var_list"]) - print("Setting up loss") - self.setup_loss(action_space) - self.setup_gradients() - self.initialize() - - def _setup_graph(self, ob_space, ac_space): - raise NotImplementedError - - def setup_loss(self, action_space): - if isinstance(action_space, gym.spaces.Box): - ac_size = action_space.shape[0] - self.ac = tf.placeholder(tf.float32, [None, ac_size], name="ac") - elif isinstance(action_space, gym.spaces.Discrete): - self.ac = tf.placeholder(tf.int64, [None], name="ac") - else: - raise NotImplementedError( - "action space" + str(type(action_space)) + - "currently not supported") - self.adv = tf.placeholder(tf.float32, [None], name="adv") - self.r = tf.placeholder(tf.float32, [None], name="r") - - log_prob = self.curr_dist.logp(self.ac) - - # The "policy gradients" loss: its derivative is precisely the policy - # gradient. Notice that self.ac is a placeholder that is provided - # externally. adv will contain the advantages, as calculated in - # process_rollout. - self.pi_loss = - tf.reduce_sum(log_prob * self.adv) - - delta = self.vf - self.r - self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta)) - self.entropy = tf.reduce_sum(self.curr_dist.entropy()) - self.loss = (self.pi_loss + - self.vf_loss * self.config["vf_loss_coeff"] + - self.entropy * self.config["entropy_coeff"]) - - def setup_gradients(self): - grads = tf.gradients(self.loss, self.var_list) - self.grads, _ = tf.clip_by_global_norm(grads, self.config["grad_clip"]) - grads_and_vars = list(zip(self.grads, self.var_list)) - opt = tf.train.AdamOptimizer(self.config["lr"]) - self._apply_gradients = opt.apply_gradients(grads_and_vars) - - def initialize(self): - if self.summarize: - bs = tf.to_float(tf.shape(self.x)[0]) - tf.summary.scalar("model/policy_loss", self.pi_loss / bs) - tf.summary.scalar("model/value_loss", self.vf_loss / bs) - tf.summary.scalar("model/entropy", self.entropy / bs) - tf.summary.scalar("model/grad_gnorm", tf.global_norm(self.grads)) - tf.summary.scalar("model/var_gnorm", tf.global_norm(self.var_list)) - self.summary_op = tf.summary.merge_all() - - # TODO(rliaw): Can consider exposing these parameters - self.sess = tf.Session(graph=self.g, config=tf.ConfigProto( - intra_op_parallelism_threads=1, inter_op_parallelism_threads=2, - gpu_options=tf.GPUOptions(allow_growth=True))) - self.variables = ray.experimental.TensorFlowVariables(self.loss, - self.sess) - self.sess.run(tf.global_variables_initializer()) - - def apply_gradients(self, grads): - feed_dict = {self.grads[i]: grads[i] - for i in range(len(grads))} - self.sess.run(self._apply_gradients, feed_dict=feed_dict) - - def get_weights(self): - weights = self.variables.get_weights() - return weights - - def set_weights(self, weights): - self.variables.set_weights(weights) - - def compute_gradients(self, samples): - raise NotImplementedError - - def compute(self, observation): - raise NotImplementedError - - def value(self, ob): - raise NotImplementedError diff --git a/python/ray/rllib/agent.py b/python/ray/rllib/agent.py index 5699022b2a8e..4c17de01fa43 100644 --- a/python/ray/rllib/agent.py +++ b/python/ray/rllib/agent.py @@ -61,7 +61,7 @@ class 
Agent(Trainable): """ _allow_unknown_configs = False - _allow_unknown_subkeys = [] + _allow_unknown_subkeys = ["env_config", "model", "optimizer"] @classmethod def resource_help(cls, config): diff --git a/python/ray/rllib/bc/bc_evaluator.py b/python/ray/rllib/bc/bc_evaluator.py index 8499ba1e023e..27e011ac6cc4 100644 --- a/python/ray/rllib/bc/bc_evaluator.py +++ b/python/ray/rllib/bc/bc_evaluator.py @@ -17,8 +17,7 @@ def __init__(self, registry, env_creator, config, logdir): env = ModelCatalog.get_preprocessor_as_wrapper(registry, env_creator( config["env_config"]), config["model"]) self.dataset = ExperienceDataset(config["dataset_path"]) - # TODO(rliaw): should change this to be just env.observation_space - self.policy = BCPolicy(registry, env.observation_space.shape, + self.policy = BCPolicy(registry, env.observation_space, env.action_space, config) self.config = config self.logdir = logdir diff --git a/python/ray/rllib/bc/policy.py b/python/ray/rllib/bc/policy.py index 11178a50d23a..2c4210a57cf5 100644 --- a/python/ray/rllib/bc/policy.py +++ b/python/ray/rllib/bc/policy.py @@ -6,30 +6,22 @@ import gym import ray -from ray.rllib.a3c.policy import Policy from ray.rllib.models.catalog import ModelCatalog -class BCPolicy(Policy): - def __init__(self, registry, ob_space, action_space, config, name="local", - summarize=True): - super(BCPolicy, self).__init__(ob_space, action_space, name, summarize) +class BCPolicy(object): + def __init__(self, registry, obs_space, action_space, config): self.registry = registry self.local_steps = 0 self.config = config - self.summarize = summarize - worker_device = "/job:localhost/replica:0/task:0/cpu:0" - self.g = tf.Graph() - with self.g.as_default(), tf.device(worker_device): - with tf.variable_scope(name): - self._setup_graph(ob_space, action_space) - print("Setting up loss") - self.setup_loss(action_space) - self.setup_gradients() - self.initialize() + self.summarize = config.get("summarize") + self._setup_graph(obs_space, action_space) + self.setup_loss(action_space) + self.setup_gradients() + self.initialize() - def _setup_graph(self, ob_space, ac_space): - self.x = tf.placeholder(tf.float32, [None] + list(ob_space)) + def _setup_graph(self, obs_space, ac_space): + self.x = tf.placeholder(tf.float32, [None] + list(obs_space.shape)) dist_class, self.logit_dim = ModelCatalog.get_action_dist(ac_space) self._model = ModelCatalog.get_model( self.registry, self.x, self.logit_dim, self.config["model"]) diff --git a/python/ray/rllib/ddpg/apex.py b/python/ray/rllib/ddpg/apex.py index d3402e61bc06..1799e71a8d9b 100644 --- a/python/ray/rllib/ddpg/apex.py +++ b/python/ray/rllib/ddpg/apex.py @@ -8,25 +8,25 @@ APEX_DDPG_DEFAULT_CONFIG = merge_dicts( DDPG_CONFIG, { - 'optimizer_class': 'ApexOptimizer', - 'optimizer_config': + "optimizer_class": "ApexOptimizer", + "optimizer_config": merge_dicts( - DDPG_CONFIG['optimizer_config'], { - 'max_weight_sync_delay': 400, - 'num_replay_buffer_shards': 4, - 'debug': False + DDPG_CONFIG["optimizer_config"], { + "max_weight_sync_delay": 400, + "num_replay_buffer_shards": 4, + "debug": False }), - 'n_step': 3, - 'num_workers': 32, - 'buffer_size': 2000000, - 'learning_starts': 50000, - 'train_batch_size': 512, - 'sample_batch_size': 50, - 'max_weight_sync_delay': 400, - 'target_network_update_freq': 500000, - 'timesteps_per_iteration': 25000, - 'per_worker_exploration': True, - 'worker_side_prioritization': True, + "n_step": 3, + "num_workers": 32, + "buffer_size": 2000000, + "learning_starts": 50000, + "train_batch_size": 512, + 
"sample_batch_size": 50, + "max_weight_sync_delay": 400, + "target_network_update_freq": 500000, + "timesteps_per_iteration": 25000, + "per_worker_exploration": True, + "worker_side_prioritization": True, }, ) diff --git a/python/ray/rllib/ddpg/ddpg.py b/python/ray/rllib/ddpg/ddpg.py index 25fda8321e20..06f6128e81f7 100644 --- a/python/ray/rllib/ddpg/ddpg.py +++ b/python/ray/rllib/ddpg/ddpg.py @@ -2,17 +2,9 @@ from __future__ import division from __future__ import print_function -import pickle -import os - -import numpy as np -import tensorflow as tf - -import ray -from ray.rllib import optimizers -from ray.rllib.ddpg.ddpg_evaluator import DDPGEvaluator -from ray.rllib.agent import Agent -from ray.tune.result import TrainingResult +from ray.rllib.dqn.common.schedules import ConstantSchedule, LinearSchedule +from ray.rllib.dqn.dqn import DQNAgent +from ray.rllib.ddpg.ddpg_policy_graph import DDPGPolicyGraph OPTIMIZER_SHARED_CONFIGS = [ "buffer_size", "prioritized_replay", "prioritized_replay_alpha", @@ -23,247 +15,120 @@ DEFAULT_CONFIG = { # === Model === # Hidden layer sizes of the policy networks - 'actor_hiddens': [64, 64], + "actor_hiddens": [64, 64], # Hidden layer sizes of the policy networks - 'critic_hiddens': [64, 64], + "critic_hiddens": [64, 64], # N-step Q learning - 'n_step': 1, + "n_step": 1, # Config options to pass to the model constructor - 'model': {}, + "model": {}, # Discount factor for the MDP - 'gamma': 0.99, + "gamma": 0.99, # Arguments to pass to the env creator - 'env_config': {}, + "env_config": {}, # === Exploration === # Max num timesteps for annealing schedules. Exploration is annealed from # 1.0 to exploration_fraction over this number of timesteps scaled by # exploration_fraction - 'schedule_max_timesteps': 100000, + "schedule_max_timesteps": 100000, # Number of env steps to optimize for before returning - 'timesteps_per_iteration': 1000, + "timesteps_per_iteration": 1000, # Fraction of entire training period over which the exploration rate is # annealed - 'exploration_fraction': 0.1, + "exploration_fraction": 0.1, # Final value of random action probability - 'exploration_final_eps': 0.02, + "exploration_final_eps": 0.02, # OU-noise scale - 'noise_scale': 0.1, + "noise_scale": 0.1, # theta - 'exploration_theta': 0.15, + "exploration_theta": 0.15, # sigma - 'exploration_sigma': 0.2, + "exploration_sigma": 0.2, # Update the target network every `target_network_update_freq` steps. - 'target_network_update_freq': 0, + "target_network_update_freq": 0, # Update the target by \tau * policy + (1-\tau) * target_policy - 'tau': 0.002, - # Whether to start with random actions instead of noops. - 'random_starts': True, + "tau": 0.002, # === Replay buffer === # Size of the replay buffer. Note that if async_updates is set, then # each worker will have a replay buffer of this size. - 'buffer_size': 50000, + "buffer_size": 50000, # If True prioritized replay buffer will be used. - 'prioritized_replay': True, + "prioritized_replay": True, # Alpha parameter for prioritized replay buffer. - 'prioritized_replay_alpha': 0.6, + "prioritized_replay_alpha": 0.6, # Beta parameter for sampling from prioritized replay buffer. - 'prioritized_replay_beta': 0.4, + "prioritized_replay_beta": 0.4, # Epsilon to add to the TD errors when updating priorities. - 'prioritized_replay_eps': 1e-6, + "prioritized_replay_eps": 1e-6, # Whether to clip rewards to [-1, 1] prior to adding to the replay buffer. 
-    'clip_rewards': True,
+    "clip_rewards": True,
 
     # === Optimization ===
     # Learning rate for adam optimizer
-    'actor_lr': 1e-4,
-    'critic_lr': 1e-3,
+    "actor_lr": 1e-4,
+    "critic_lr": 1e-3,
     # If True, use huber loss instead of squared loss for critic network
     # Conventionally, no need to clip gradients if using a huber loss
-    'use_huber': False,
+    "use_huber": False,
     # Threshold of a huber loss
-    'huber_threshold': 1.0,
+    "huber_threshold": 1.0,
     # Weights for L2 regularization
-    'l2_reg': 1e-6,
+    "l2_reg": 1e-6,
     # If not None, clip gradients during optimization at this value
-    'grad_norm_clipping': None,
+    "grad_norm_clipping": None,
     # How many steps of the model to sample before learning starts.
-    'learning_starts': 1500,
+    "learning_starts": 1500,
     # Update the replay buffer with this many samples at once. Note that this
     # setting applies per-worker if num_workers > 1.
-    'sample_batch_size': 1,
+    "sample_batch_size": 1,
     # Size of a batched sampled from replay buffer for training. Note that
     # if async_updates is set, then each worker returns gradients for a
     # batch of this size.
-    'train_batch_size': 256,
-    # Smooth the current average reward over this many previous episodes.
-    'smoothing_num_episodes': 100,
-
-    # === Tensorflow ===
-    # Arguments to pass to tensorflow
-    'tf_session_args': {
-        "device_count": {
-            "CPU": 2
-        },
-        "log_device_placement": False,
-        "allow_soft_placement": True,
-        "gpu_options": {
-            "allow_growth": True
-        },
-        "inter_op_parallelism_threads": 1,
-        "intra_op_parallelism_threads": 1,
-    },
+    "train_batch_size": 256,
 
     # === Parallelism ===
+    # Whether to use a GPU for local optimization.
+    "gpu": False,
     # Number of workers for collecting samples with. This only makes sense
     # to increase if your environment is particularly slow to sample, or if
-    # you're using the Async or Ape-X optimizers.
-    'num_workers': 0,
+    # you're using the Async or Ape-X optimizers.
+    "num_workers": 0,
     # Whether to allocate GPUs for workers (if > 0).
-    'num_gpus_per_worker': 0,
+    "num_gpus_per_worker": 0,
+    # Whether to allocate CPUs for workers (if > 0).
+    "num_cpus_per_worker": 1,
     # Optimizer class to use.
-    'optimizer_class': "LocalSyncReplayOptimizer",
+    "optimizer_class": "LocalSyncReplayOptimizer",
    # Config to pass to the optimizer.
-    'optimizer_config': {},
+    "optimizer_config": {},
     # Whether to use a distribution of epsilons across workers for exploration.
-    'per_worker_exploration': False,
+    "per_worker_exploration": False,
     # Whether to compute priorities on workers.
- 'worker_side_prioritization': False + "worker_side_prioritization": False } -class DDPGAgent(Agent): +class DDPGAgent(DQNAgent): _agent_name = "DDPG" _allow_unknown_subkeys = [ - "model", "optimizer", "tf_session_args", "env_config" - ] + "model", "optimizer", "tf_session_args", "env_config"] _default_config = DEFAULT_CONFIG + _policy_graph = DDPGPolicyGraph - def _init(self): - self.local_evaluator = DDPGEvaluator(self.registry, self.env_creator, - self.config, self.logdir, 0) - remote_cls = ray.remote( - num_cpus=1, - num_gpus=self.config["num_gpus_per_worker"])(DDPGEvaluator) - self.remote_evaluators = [ - remote_cls.remote(self.registry, self.env_creator, self.config, - self.logdir, i) - for i in range(self.config["num_workers"]) - ] - - for k in OPTIMIZER_SHARED_CONFIGS: - if k not in self.config["optimizer_config"]: - self.config["optimizer_config"][k] = self.config[k] - - self.optimizer = getattr(optimizers, self.config["optimizer_class"])( - self.config["optimizer_config"], self.local_evaluator, - self.remote_evaluators) - - self.saver = tf.train.Saver(max_to_keep=None) - self.last_target_update_ts = 0 - self.num_target_updates = 0 - - @property - def global_timestep(self): - return self.optimizer.num_steps_sampled - - def update_target_if_needed(self): - if self.global_timestep - self.last_target_update_ts > \ - self.config["target_network_update_freq"]: - self.local_evaluator.update_target() - self.last_target_update_ts = self.global_timestep - self.num_target_updates += 1 - - def _train(self): - start_timestep = self.global_timestep - - while (self.global_timestep - start_timestep < - self.config["timesteps_per_iteration"]): - - self.optimizer.step() - self.update_target_if_needed() - - self.local_evaluator.set_global_timestep(self.global_timestep) - for e in self.remote_evaluators: - e.set_global_timestep.remote(self.global_timestep) - - return self._train_stats(start_timestep) - - def _train_stats(self, start_timestep): - if self.remote_evaluators: - stats = ray.get([e.stats.remote() for e in self.remote_evaluators]) - else: - stats = self.local_evaluator.stats() - if not isinstance(stats, list): - stats = [stats] - - mean_100ep_reward = 0.0 - mean_100ep_length = 0.0 - num_episodes = 0 - explorations = [] - + def _make_exploration_schedule(self, worker_index): + # Override DQN's schedule to take into account `noise_scale` if self.config["per_worker_exploration"]: - # Return stats from workers with the lowest 20% of exploration - test_stats = stats[-int(max(1, len(stats) * 0.2)):] + assert self.config["num_workers"] > 1, \ + "This requires multiple workers" + return ConstantSchedule( + self.config["noise_scale"] * 0.4 ** + (1 + worker_index / float(self.config["num_workers"] - 1) * 7)) else: - test_stats = stats - - for s in test_stats: - mean_100ep_reward += s["mean_100ep_reward"] / len(test_stats) - mean_100ep_length += s["mean_100ep_length"] / len(test_stats) - - for s in stats: - num_episodes += s["num_episodes"] - explorations.append(s["exploration"]) - - opt_stats = self.optimizer.stats() - - result = TrainingResult( - episode_reward_mean=mean_100ep_reward, - episode_len_mean=mean_100ep_length, - episodes_total=num_episodes, - timesteps_this_iter=self.global_timestep - start_timestep, - info=dict({ - "min_exploration": min(explorations), - "max_exploration": max(explorations), - "num_target_updates": self.num_target_updates, - }, **opt_stats)) - - return result - - def _stop(self): - # workaround for https://github.com/ray-project/ray/issues/1516 - for ev in 
self.remote_evaluators: - ev.__ray_terminate__.remote() - - def _save(self, checkpoint_dir): - checkpoint_path = self.saver.save( - self.local_evaluator.sess, - os.path.join(checkpoint_dir, "checkpoint"), - global_step=self.iteration) - extra_data = [ - self.local_evaluator.save(), - ray.get([e.save.remote() for e in self.remote_evaluators]), - self.optimizer.save(), self.num_target_updates, - self.last_target_update_ts - ] - pickle.dump(extra_data, open(checkpoint_path + ".extra_data", "wb")) - return checkpoint_path - - def _restore(self, checkpoint_path): - self.saver.restore(self.local_evaluator.sess, checkpoint_path) - extra_data = pickle.load(open(checkpoint_path + ".extra_data", "rb")) - self.local_evaluator.restore(extra_data[0]) - ray.get([ - e.restore.remote(d) - for (d, e) in zip(extra_data[1], self.remote_evaluators) - ]) - self.optimizer.restore(extra_data[2]) - self.num_target_updates = extra_data[3] - self.last_target_update_ts = extra_data[4] - - def compute_action(self, observation): - return self.local_evaluator.ddpg_graph.act(self.local_evaluator.sess, - np.array(observation)[None], - 0.0)[0] + return LinearSchedule( + schedule_timesteps=int(self.config["exploration_fraction"] * + self.config["schedule_max_timesteps"]), + initial_p=self.config["noise_scale"] * 1.0, + final_p=self.config["noise_scale"] * + self.config["exploration_final_eps"]) diff --git a/python/ray/rllib/ddpg/ddpg_evaluator.py b/python/ray/rllib/ddpg/ddpg_evaluator.py deleted file mode 100644 index 5a68c4b583ee..000000000000 --- a/python/ray/rllib/ddpg/ddpg_evaluator.py +++ /dev/null @@ -1,186 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from gym.spaces import Box -import numpy as np -import tensorflow as tf - -import ray -from ray.rllib.utils.error import UnsupportedSpaceException -from ray.rllib.ddpg import models -from ray.rllib.dqn.common.schedules import ConstantSchedule, LinearSchedule -from ray.rllib.optimizers import SampleBatch, PolicyEvaluator -from ray.rllib.utils.compression import pack -from ray.rllib.dqn.dqn_evaluator import adjust_nstep -from ray.rllib.dqn.common.wrappers import wrap_dqn - - -class DDPGEvaluator(PolicyEvaluator): - """The base DDPG Evaluator.""" - - def __init__(self, registry, env_creator, config, logdir, worker_index): - env = env_creator(config["env_config"]) - env = wrap_dqn(registry, env, config["model"], config["random_starts"]) - self.env = env - self.config = config - - # when env.action_space is of Box type, e.g., Pendulum-v0 - # action_space.low is [-2.0], high is [2.0] - # take action by calling, e.g., env.step([3.5]) - if not isinstance(env.action_space, Box): - raise UnsupportedSpaceException( - "Action space {} is not supported for DDPG.".format( - env.action_space)) - - tf_config = tf.ConfigProto(**config["tf_session_args"]) - self.sess = tf.Session(config=tf_config) - self.ddpg_graph = models.DDPGGraph(registry, env, config, logdir) - - # Use either a different `eps` per worker, or a linear schedule. 
- if config["per_worker_exploration"]: - assert config["num_workers"] > 1, "This requires multiple workers" - self.exploration = ConstantSchedule( - config["noise_scale"] * 0.4 ** - (1 + worker_index / float(config["num_workers"] - 1) * 7)) - else: - self.exploration = LinearSchedule( - schedule_timesteps=int(config["exploration_fraction"] * - config["schedule_max_timesteps"]), - initial_p=config["noise_scale"] * 1.0, - final_p=config["noise_scale"] * - config["exploration_final_eps"]) - - # Initialize the parameters and copy them to the target network. - self.sess.run(tf.global_variables_initializer()) - # hard instead of soft - self.ddpg_graph.update_target(self.sess, 1.0) - self.global_timestep = 0 - self.local_timestep = 0 - - # Note that this encompasses both the policy and Q-value networks and - # their corresponding target networks - self.variables = ray.experimental.TensorFlowVariables( - tf.group(self.ddpg_graph.q_tp0, self.ddpg_graph.q_tp1), self.sess) - - self.episode_rewards = [0.0] - self.episode_lengths = [0.0] - self.saved_mean_reward = None - - self.obs = self.env.reset() - - def set_global_timestep(self, global_timestep): - self.global_timestep = global_timestep - - def update_target(self): - self.ddpg_graph.update_target(self.sess) - - def sample(self): - obs, actions, rewards, new_obs, dones = [], [], [], [], [] - for _ in range( - self.config["sample_batch_size"] + self.config["n_step"] - 1): - ob, act, rew, ob1, done = self._step(self.global_timestep) - obs.append(ob) - actions.append(act) - rewards.append(rew) - new_obs.append(ob1) - dones.append(done) - - # N-step Q adjustments - if self.config["n_step"] > 1: - # Adjust for steps lost from truncation - self.local_timestep -= (self.config["n_step"] - 1) - adjust_nstep(self.config["n_step"], self.config["gamma"], obs, - actions, rewards, new_obs, dones) - - batch = SampleBatch({ - "obs": [pack(np.array(o)) for o in obs], - "actions": actions, - "rewards": rewards, - "new_obs": [pack(np.array(o)) for o in new_obs], - "dones": dones, - "weights": np.ones_like(rewards) - }) - assert (batch.count == self.config["sample_batch_size"]) - - # Prioritize on the worker side - if self.config["worker_side_prioritization"]: - td_errors = self.ddpg_graph.compute_td_error( - self.sess, obs, batch["actions"], batch["rewards"], new_obs, - batch["dones"], batch["weights"]) - new_priorities = ( - np.abs(td_errors) + self.config["prioritized_replay_eps"]) - batch.data["weights"] = new_priorities - - return batch - - def compute_gradients(self, samples): - td_err, grads = self.ddpg_graph.compute_gradients( - self.sess, samples["obs"], samples["actions"], samples["rewards"], - samples["new_obs"], samples["dones"], samples["weights"]) - return grads, {"td_error": td_err} - - def apply_gradients(self, grads): - self.ddpg_graph.apply_gradients(self.sess, grads) - - def compute_apply(self, samples): - td_error = self.ddpg_graph.compute_apply( - self.sess, samples["obs"], samples["actions"], samples["rewards"], - samples["new_obs"], samples["dones"], samples["weights"]) - return {"td_error": td_error} - - def get_weights(self): - return self.variables.get_weights() - - def set_weights(self, weights): - self.variables.set_weights(weights) - - def _step(self, global_timestep): - """Takes a single step, and returns the result of the step.""" - action = self.ddpg_graph.act( - self.sess, - np.array(self.obs)[None], - self.exploration.value(global_timestep))[0] - new_obs, rew, done, _ = self.env.step(action) - ret = (self.obs, action, rew, new_obs, 
float(done)) - self.obs = new_obs - self.episode_rewards[-1] += rew - self.episode_lengths[-1] += 1 - if done: - self.obs = self.env.reset() - self.episode_rewards.append(0.0) - self.episode_lengths.append(0.0) - # reset UO noise for each episode - self.ddpg_graph.reset_noise(self.sess) - - self.local_timestep += 1 - return ret - - def stats(self): - n = self.config["smoothing_num_episodes"] + 1 - mean_100ep_reward = round(np.mean(self.episode_rewards[-n:-1]), 5) - mean_100ep_length = round(np.mean(self.episode_lengths[-n:-1]), 5) - exploration = self.exploration.value(self.global_timestep) - return { - "mean_100ep_reward": mean_100ep_reward, - "mean_100ep_length": mean_100ep_length, - "num_episodes": len(self.episode_rewards), - "exploration": exploration, - "local_timestep": self.local_timestep, - } - - def save(self): - return [ - self.exploration, self.episode_rewards, self.episode_lengths, - self.saved_mean_reward, self.obs, self.global_timestep, - self.local_timestep - ] - - def restore(self, data): - self.exploration = data[0] - self.episode_rewards = data[1] - self.episode_lengths = data[2] - self.saved_mean_reward = data[3] - self.obs = data[4] - self.global_timestep = data[5] - self.local_timestep = data[6] diff --git a/python/ray/rllib/ddpg/ddpg_policy_graph.py b/python/ray/rllib/ddpg/ddpg_policy_graph.py new file mode 100644 index 000000000000..51572659b4e9 --- /dev/null +++ b/python/ray/rllib/ddpg/ddpg_policy_graph.py @@ -0,0 +1,327 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from gym.spaces import Box +import numpy as np +import tensorflow as tf +import tensorflow.contrib.layers as layers + +import ray +from ray.rllib.dqn.dqn_policy_graph import _huber_loss, _minimize_and_clip, \ + _scope_vars, _postprocess_dqn +from ray.rllib.models import ModelCatalog +from ray.rllib.utils.error import UnsupportedSpaceException +from ray.rllib.utils.tf_policy_graph import TFPolicyGraph + + +A_SCOPE = "a_func" +P_SCOPE = "p_func" +P_TARGET_SCOPE = "target_p_func" +Q_SCOPE = "q_func" +Q_TARGET_SCOPE = "target_q_func" + + +def _build_p_network(registry, inputs, dim_actions, config): + """ + map an observation (i.e., state) to an action where + each entry takes value from (0, 1) due to the sigmoid function + """ + frontend = ModelCatalog.get_model(registry, inputs, 1, config["model"]) + + hiddens = config["actor_hiddens"] + action_out = frontend.last_layer + for hidden in hiddens: + action_out = layers.fully_connected( + action_out, num_outputs=hidden, activation_fn=tf.nn.relu) + # Use sigmoid layer to bound values within (0, 1) + # shape of action_scores is [batch_size, dim_actions] + action_scores = layers.fully_connected( + action_out, num_outputs=dim_actions, activation_fn=tf.nn.sigmoid) + + return action_scores + + +# As a stochastic policy for inference, but a deterministic policy for training +# thus ignore batch_size issue when constructing a stochastic action +def _build_action_network(p_values, low_action, high_action, stochastic, eps, + theta, sigma): + # shape is [None, dim_action] + deterministic_actions = (high_action - low_action) * p_values + low_action + + exploration_sample = tf.get_variable( + name="ornstein_uhlenbeck", + dtype=tf.float32, + initializer=low_action.size * [.0], + trainable=False) + normal_sample = tf.random_normal( + shape=[low_action.size], mean=0.0, stddev=1.0) + exploration_value = tf.assign_add( + exploration_sample, + theta * (.0 - exploration_sample) + sigma * normal_sample) + 
stochastic_actions = deterministic_actions + eps * ( + high_action - low_action) * exploration_value + + return tf.cond(stochastic, lambda: stochastic_actions, + lambda: deterministic_actions) + + +def _build_q_network(registry, inputs, action_inputs, config): + frontend = ModelCatalog.get_model(registry, inputs, 1, config["model"]) + + hiddens = config["critic_hiddens"] + + q_out = tf.concat([frontend.last_layer, action_inputs], axis=1) + for hidden in hiddens: + q_out = layers.fully_connected( + q_out, num_outputs=hidden, activation_fn=tf.nn.relu) + q_scores = layers.fully_connected(q_out, num_outputs=1, activation_fn=None) + + return q_scores + + +class DDPGPolicyGraph(TFPolicyGraph): + def __init__(self, observation_space, action_space, registry, config): + if not isinstance(action_space, Box): + raise UnsupportedSpaceException( + "Action space {} is not supported for DDPG.".format( + action_space)) + + self.config = config + self.cur_epsilon = 1.0 + dim_actions = action_space.shape[0] + low_action = action_space.low + high_action = action_space.high + self.actor_optimizer = tf.train.AdamOptimizer( + learning_rate=config["actor_lr"]) + self.critic_optimizer = tf.train.AdamOptimizer( + learning_rate=config["critic_lr"]) + + # Action inputs + self.stochastic = tf.placeholder(tf.bool, (), name="stochastic") + self.eps = tf.placeholder(tf.float32, (), name="eps") + self.cur_observations = tf.placeholder( + tf.float32, shape=(None, ) + observation_space.shape) + + # Actor: P (policy) network + with tf.variable_scope(P_SCOPE) as scope: + p_values = _build_p_network(registry, self.cur_observations, + dim_actions, config) + self.p_func_vars = _scope_vars(scope.name) + + # Action outputs + with tf.variable_scope(A_SCOPE): + self.output_actions = _build_action_network( + p_values, low_action, high_action, self.stochastic, self.eps, + config["exploration_theta"], config["exploration_sigma"]) + + with tf.variable_scope(A_SCOPE, reuse=True): + exploration_sample = tf.get_variable(name="ornstein_uhlenbeck") + self.reset_noise_op = tf.assign(exploration_sample, + dim_actions * [.0]) + + # Replay inputs + self.obs_t = tf.placeholder( + tf.float32, + shape=(None, ) + observation_space.shape, + name="observation") + self.act_t = tf.placeholder( + tf.float32, shape=(None, ) + action_space.shape, name="action") + self.rew_t = tf.placeholder(tf.float32, [None], name="reward") + self.obs_tp1 = tf.placeholder( + tf.float32, shape=(None, ) + observation_space.shape) + self.done_mask = tf.placeholder(tf.float32, [None], name="done") + self.importance_weights = tf.placeholder( + tf.float32, [None], name="weight") + + # p network evaluation + with tf.variable_scope(P_SCOPE, reuse=True) as scope: + self.p_t = _build_p_network( + registry, self.obs_t, dim_actions, config) + + # target p network evaluation + with tf.variable_scope(P_TARGET_SCOPE) as scope: + p_tp1 = _build_p_network( + registry, self.obs_tp1, dim_actions, config) + target_p_func_vars = _scope_vars(scope.name) + + # Action outputs + with tf.variable_scope(A_SCOPE, reuse=True): + deterministic_flag = tf.constant(value=False, dtype=tf.bool) + zero_eps = tf.constant(value=.0, dtype=tf.float32) + output_actions = _build_action_network( + self.p_t, low_action, high_action, deterministic_flag, + zero_eps, config["exploration_theta"], + config["exploration_sigma"]) + + output_actions_estimated = _build_action_network( + p_tp1, low_action, high_action, deterministic_flag, + zero_eps, config["exploration_theta"], + config["exploration_sigma"]) + + # q 
network evaluation + with tf.variable_scope(Q_SCOPE) as scope: + q_t = _build_q_network( + registry, self.obs_t, self.act_t, config) + self.q_func_vars = _scope_vars(scope.name) + with tf.variable_scope(Q_SCOPE, reuse=True): + q_tp0 = _build_q_network( + registry, self.obs_t, output_actions, config) + + # target q network evalution + with tf.variable_scope(Q_TARGET_SCOPE) as scope: + q_tp1 = _build_q_network( + registry, self.obs_tp1, output_actions_estimated, config) + target_q_func_vars = _scope_vars(scope.name) + + q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1) + + q_tp1_best = tf.squeeze( + input=q_tp1, axis=len(q_tp1.shape) - 1) + q_tp1_best_masked = (1.0 - self.done_mask) * q_tp1_best + + # compute RHS of bellman equation + q_t_selected_target = ( + self.rew_t + config["gamma"]**config["n_step"] * q_tp1_best_masked) + + # compute the error (potentially clipped) + self.td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) + if config.get("use_huber"): + errors = _huber_loss(self.td_error, config.get("huber_threshold")) + else: + errors = 0.5 * tf.square(self.td_error) + + self.loss = tf.reduce_mean(self.importance_weights * errors) + + # for policy gradient + self.actor_loss = -1.0 * tf.reduce_mean(q_tp0) + + if config["l2_reg"] is not None: + for var in self.p_func_vars: + if "bias" not in var.name: + self.actor_loss += ( + config["l2_reg"] * 0.5 * tf.nn.l2_loss(var)) + for var in self.q_func_vars: + if "bias" not in var.name: + self.loss += config["l2_reg"] * 0.5 * tf.nn.l2_loss( + var) + + # update_target_fn will be called periodically to copy Q network to + # target Q network + self.tau_value = config.get("tau") + self.tau = tf.placeholder(tf.float32, (), name="tau") + update_target_expr = [] + for var, var_target in zip( + sorted(self.q_func_vars, key=lambda v: v.name), + sorted(target_q_func_vars, key=lambda v: v.name)): + update_target_expr.append( + var_target.assign(self.tau * var + + (1.0 - self.tau) * var_target)) + for var, var_target in zip( + sorted(self.p_func_vars, key=lambda v: v.name), + sorted(target_p_func_vars, key=lambda v: v.name)): + update_target_expr.append( + var_target.assign(self.tau * var + + (1.0 - self.tau) * var_target)) + self.update_target_expr = tf.group(*update_target_expr) + + self.sess = tf.get_default_session() + self.loss_inputs = [ + ("obs", self.obs_t), + ("actions", self.act_t), + ("rewards", self.rew_t), + ("new_obs", self.obs_tp1), + ("dones", self.done_mask), + ("weights", self.importance_weights), + ] + self.is_training = tf.placeholder_with_default(True, ()) + TFPolicyGraph.__init__( + self, self.sess, obs_input=self.cur_observations, + action_sampler=self.output_actions, loss=self.loss, + loss_inputs=self.loss_inputs, is_training=self.is_training) + self.sess.run(tf.global_variables_initializer()) + + # Note that this encompasses both the policy and Q-value networks and + # their corresponding target networks + self.variables = ray.experimental.TensorFlowVariables( + tf.group(q_tp0, q_tp1), self.sess) + + # Hard initial update + self.update_target(tau=1.0) + + def gradients(self, optimizer): + if self.config["grad_norm_clipping"] is not None: + actor_grads_and_vars = _minimize_and_clip( + self.actor_optimizer, + self.actor_loss, + var_list=self.p_func_vars, + clip_val=self.config["grad_norm_clipping"]) + critic_grads_and_vars = _minimize_and_clip( + self.critic_optimizer, + self.loss, + var_list=self.q_func_vars, + clip_val=self.config["grad_norm_clipping"]) + else: + actor_grads_and_vars = 
self.actor_optimizer.compute_gradients( + self.actor_loss, var_list=self.p_func_vars) + critic_grads_and_vars = self.critic_optimizer.compute_gradients( + self.loss, var_list=self.q_func_vars) + actor_grads_and_vars = [ + (g, v) for (g, v) in actor_grads_and_vars if g is not None] + critic_grads_and_vars = [ + (g, v) for (g, v) in critic_grads_and_vars if g is not None] + grads_and_vars = actor_grads_and_vars + critic_grads_and_vars + return grads_and_vars + + def extra_compute_action_feed_dict(self): + return { + self.stochastic: True, + self.eps: self.cur_epsilon, + } + + def extra_compute_grad_fetches(self): + return { + "td_error": self.td_error, + } + + def postprocess_trajectory(self, sample_batch, other_agent_batches=None): + return _postprocess_dqn(self, sample_batch) + + def compute_td_error(self, obs_t, act_t, rew_t, obs_tp1, done_mask, + importance_weights): + td_err = self.sess.run( + self.td_error, + feed_dict={ + self.obs_t: [np.array(ob) for ob in obs_t], + self.act_t: act_t, + self.rew_t: rew_t, + self.obs_tp1: [np.array(ob) for ob in obs_tp1], + self.done_mask: done_mask, + self.importance_weights: importance_weights + }) + return td_err + + def reset_noise(self, sess): + sess.run(self.reset_noise_op) + + # support both hard and soft sync + def update_target(self, tau=None): + return self.sess.run( + self.update_target_expr, + feed_dict={self.tau: tau or self.tau_value}) + + def set_epsilon(self, epsilon): + self.cur_epsilon = epsilon + + def get_weights(self): + return self.variables.get_weights() + + def set_weights(self, weights): + self.variables.set_weights(weights) + + def get_state(self): + return [TFPolicyGraph.get_state(self), self.cur_epsilon] + + def set_state(self, state): + TFPolicyGraph.set_state(self, state[0]) + self.set_epsilon(state[1]) diff --git a/python/ray/rllib/ddpg/models.py b/python/ray/rllib/ddpg/models.py deleted file mode 100644 index d58f37dc6417..000000000000 --- a/python/ray/rllib/ddpg/models.py +++ /dev/null @@ -1,391 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np - -import tensorflow as tf -import tensorflow.contrib.layers as layers - -from ray.rllib.models import ModelCatalog - - -def _build_p_network(registry, inputs, dim_actions, config): - """ - map an observation (i.e., state) to an action where - each entry takes value from (0, 1) due to the sigmoid function - """ - frontend = ModelCatalog.get_model(registry, inputs, 1, config["model"]) - - hiddens = config["actor_hiddens"] - action_out = frontend.last_layer - for hidden in hiddens: - action_out = layers.fully_connected( - action_out, num_outputs=hidden, activation_fn=tf.nn.relu) - # Use sigmoid layer to bound values within (0, 1) - # shape of action_scores is [batch_size, dim_actions] - action_scores = layers.fully_connected( - action_out, num_outputs=dim_actions, activation_fn=tf.nn.sigmoid) - - return action_scores - - -# As a stochastic policy for inference, but a deterministic policy for training -# thus ignore batch_size issue when constructing a stochastic action -def _build_action_network(p_values, low_action, high_action, stochastic, eps, - theta, sigma): - # shape is [None, dim_action] - deterministic_actions = (high_action - low_action) * p_values + low_action - - exploration_sample = tf.get_variable( - name="ornstein_uhlenbeck", - dtype=tf.float32, - initializer=low_action.size * [.0], - trainable=False) - normal_sample = tf.random_normal( - shape=[low_action.size], mean=0.0, 
stddev=1.0) - exploration_value = tf.assign_add( - exploration_sample, - theta * (.0 - exploration_sample) + sigma * normal_sample) - stochastic_actions = deterministic_actions + eps * ( - high_action - low_action) * exploration_value - - return tf.cond(stochastic, lambda: stochastic_actions, - lambda: deterministic_actions) - - -def _build_q_network(registry, inputs, action_inputs, config): - frontend = ModelCatalog.get_model(registry, inputs, 1, config["model"]) - - hiddens = config["critic_hiddens"] - - q_out = tf.concat([frontend.last_layer, action_inputs], axis=1) - for hidden in hiddens: - q_out = layers.fully_connected( - q_out, num_outputs=hidden, activation_fn=tf.nn.relu) - q_scores = layers.fully_connected(q_out, num_outputs=1, activation_fn=None) - - return q_scores - - -def _huber_loss(x, delta=1.0): - """Reference: https://en.wikipedia.org/wiki/Huber_loss""" - return tf.where( - tf.abs(x) < delta, - tf.square(x) * 0.5, delta * (tf.abs(x) - 0.5 * delta)) - - -def _minimize_and_clip(optimizer, objective, var_list, clip_val=10): - """Minimized `objective` using `optimizer` w.r.t. variables in - `var_list` while ensure the norm of the gradients for each - variable is clipped to `clip_val` - """ - gradients = optimizer.compute_gradients(objective, var_list=var_list) - for i, (grad, var) in enumerate(gradients): - if grad is not None: - gradients[i] = (tf.clip_by_norm(grad, clip_val), var) - return gradients - - -def _scope_vars(scope, trainable_only=False): - """ - Get variables inside a scope - The scope can be specified as a string - - Parameters - ---------- - scope: str or VariableScope - scope in which the variables reside. - trainable_only: bool - whether or not to return only the variables that were marked as - trainable. - - Returns - ------- - vars: [tf.Variable] - list of variables in `scope`. - """ - return tf.get_collection( - tf.GraphKeys.TRAINABLE_VARIABLES - if trainable_only else tf.GraphKeys.VARIABLES, - scope=scope if isinstance(scope, str) else scope.name) - - -class ModelAndLoss(object): - """Holds the model and loss function. - - Both graphs are necessary in order for the multi-gpu SGD implementation - to create towers on each device. 
- """ - - def __init__(self, registry, dim_actions, low_action, high_action, config, - obs_t, act_t, rew_t, obs_tp1, done_mask, importance_weights): - # p network evaluation - with tf.variable_scope("p_func", reuse=True) as scope: - self.p_t = _build_p_network(registry, obs_t, dim_actions, config) - - # target p network evaluation - with tf.variable_scope("target_p_func") as scope: - self.p_tp1 = _build_p_network(registry, obs_tp1, dim_actions, - config) - self.target_p_func_vars = _scope_vars(scope.name) - - # Action outputs - with tf.variable_scope("a_func", reuse=True): - deterministic_flag = tf.constant(value=False, dtype=tf.bool) - zero_eps = tf.constant(value=.0, dtype=tf.float32) - output_actions = _build_action_network( - self.p_t, low_action, high_action, deterministic_flag, - zero_eps, config["exploration_theta"], - config["exploration_sigma"]) - - output_actions_estimated = _build_action_network( - self.p_tp1, low_action, high_action, deterministic_flag, - zero_eps, config["exploration_theta"], - config["exploration_sigma"]) - - # q network evaluation - with tf.variable_scope("q_func") as scope: - self.q_t = _build_q_network(registry, obs_t, act_t, config) - self.q_func_vars = _scope_vars(scope.name) - with tf.variable_scope("q_func", reuse=True): - self.q_tp0 = _build_q_network(registry, obs_t, output_actions, - config) - - # target q network evalution - with tf.variable_scope("target_q_func") as scope: - self.q_tp1 = _build_q_network(registry, obs_tp1, - output_actions_estimated, config) - self.target_q_func_vars = _scope_vars(scope.name) - - q_t_selected = tf.squeeze(self.q_t, axis=len(self.q_t.shape) - 1) - - q_tp1_best = tf.squeeze( - input=self.q_tp1, axis=len(self.q_tp1.shape) - 1) - q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best - - # compute RHS of bellman equation - q_t_selected_target = ( - rew_t + config["gamma"]**config["n_step"] * q_tp1_best_masked) - - # compute the error (potentially clipped) - self.td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) - if config.get("use_huber"): - errors = _huber_loss(self.td_error, config.get("huber_threshold")) - else: - errors = 0.5 * tf.square(self.td_error) - - weighted_error = tf.reduce_mean(importance_weights * errors) - - self.loss = weighted_error - - # for policy gradient - self.actor_loss = -1.0 * tf.reduce_mean(self.q_tp0) - - -class DDPGGraph(object): - def __init__(self, registry, env, config, logdir): - self.env = env - dim_actions = env.action_space.shape[0] - low_action = env.action_space.low - high_action = env.action_space.high - actor_optimizer = tf.train.AdamOptimizer( - learning_rate=config["actor_lr"]) - critic_optimizer = tf.train.AdamOptimizer( - learning_rate=config["critic_lr"]) - - # Action inputs - self.stochastic = tf.placeholder(tf.bool, (), name="stochastic") - self.eps = tf.placeholder(tf.float32, (), name="eps") - self.cur_observations = tf.placeholder( - tf.float32, shape=(None, ) + env.observation_space.shape) - - # Actor: P (policy) network - p_scope_name = "p_func" - with tf.variable_scope(p_scope_name) as scope: - p_values = _build_p_network(registry, self.cur_observations, - dim_actions, config) - p_func_vars = _scope_vars(scope.name) - - # Action outputs - a_scope_name = "a_func" - with tf.variable_scope(a_scope_name): - self.output_actions = _build_action_network( - p_values, low_action, high_action, self.stochastic, self.eps, - config["exploration_theta"], config["exploration_sigma"]) - - with tf.variable_scope(a_scope_name, reuse=True): - exploration_sample = 
tf.get_variable(name="ornstein_uhlenbeck") - self.reset_noise_op = tf.assign(exploration_sample, - dim_actions * [.0]) - - # Replay inputs - self.obs_t = tf.placeholder( - tf.float32, - shape=(None, ) + env.observation_space.shape, - name="observation") - self.act_t = tf.placeholder( - tf.float32, shape=(None, ) + env.action_space.shape, name="action") - self.rew_t = tf.placeholder(tf.float32, [None], name="reward") - self.obs_tp1 = tf.placeholder( - tf.float32, shape=(None, ) + env.observation_space.shape) - self.done_mask = tf.placeholder(tf.float32, [None], name="done") - self.importance_weights = tf.placeholder( - tf.float32, [None], name="weight") - - def build_loss(obs_t, act_t, rew_t, obs_tp1, done_mask, - importance_weights): - return ModelAndLoss(registry, dim_actions, low_action, high_action, - config, obs_t, act_t, rew_t, obs_tp1, - done_mask, importance_weights) - - self.loss_inputs = [ - ("obs", self.obs_t), - ("actions", self.act_t), - ("rewards", self.rew_t), - ("new_obs", self.obs_tp1), - ("dones", self.done_mask), - ("weights", self.importance_weights), - ] - - loss_obj = build_loss(self.obs_t, self.act_t, self.rew_t, self.obs_tp1, - self.done_mask, self.importance_weights) - - self.build_loss = build_loss - - actor_loss = loss_obj.actor_loss - weighted_error = loss_obj.loss - q_func_vars = loss_obj.q_func_vars - target_p_func_vars = loss_obj.target_p_func_vars - target_q_func_vars = loss_obj.target_q_func_vars - self.p_t = loss_obj.p_t - self.q_t = loss_obj.q_t - self.q_tp0 = loss_obj.q_tp0 - self.q_tp1 = loss_obj.q_tp1 - self.td_error = loss_obj.td_error - - if config["l2_reg"] is not None: - for var in p_func_vars: - if "bias" not in var.name: - actor_loss += config["l2_reg"] * 0.5 * tf.nn.l2_loss(var) - for var in q_func_vars: - if "bias" not in var.name: - weighted_error += config["l2_reg"] * 0.5 * tf.nn.l2_loss( - var) - - # compute optimization op (potentially with gradient clipping) - if config["grad_norm_clipping"] is not None: - self.actor_grads_and_vars = _minimize_and_clip( - actor_optimizer, - actor_loss, - var_list=p_func_vars, - clip_val=config["grad_norm_clipping"]) - self.critic_grads_and_vars = _minimize_and_clip( - critic_optimizer, - weighted_error, - var_list=q_func_vars, - clip_val=config["grad_norm_clipping"]) - else: - self.actor_grads_and_vars = actor_optimizer.compute_gradients( - actor_loss, var_list=p_func_vars) - self.critic_grads_and_vars = critic_optimizer.compute_gradients( - weighted_error, var_list=q_func_vars) - self.actor_grads_and_vars = [(g, v) - for (g, v) in self.actor_grads_and_vars - if g is not None] - self.critic_grads_and_vars = [(g, v) - for (g, v) in self.critic_grads_and_vars - if g is not None] - self.grads_and_vars = ( - self.actor_grads_and_vars + self.critic_grads_and_vars) - self.grads = [g for (g, v) in self.grads_and_vars] - self.actor_train_expr = actor_optimizer.apply_gradients( - self.actor_grads_and_vars) - self.critic_train_expr = critic_optimizer.apply_gradients( - self.critic_grads_and_vars) - - # update_target_fn will be called periodically to copy Q network to - # target Q network - self.tau_value = config.get("tau") - self.tau = tf.placeholder(tf.float32, (), name="tau") - update_target_expr = [] - for var, var_target in zip( - sorted(q_func_vars, key=lambda v: v.name), - sorted(target_q_func_vars, key=lambda v: v.name)): - update_target_expr.append( - var_target.assign(self.tau * var + - (1.0 - self.tau) * var_target)) - for var, var_target in zip( - sorted(p_func_vars, key=lambda v: v.name), - 
sorted(target_p_func_vars, key=lambda v: v.name)): - update_target_expr.append( - var_target.assign(self.tau * var + - (1.0 - self.tau) * var_target)) - self.update_target_expr = tf.group(*update_target_expr) - - # support both hard and soft sync - def update_target(self, sess, tau=None): - return sess.run( - self.update_target_expr, - feed_dict={self.tau: tau or self.tau_value}) - - def act(self, sess, obs, eps, stochastic=True): - return sess.run( - self.output_actions, - feed_dict={ - self.cur_observations: obs, - self.stochastic: stochastic, - self.eps: eps - }) - - def compute_gradients(self, sess, obs_t, act_t, rew_t, obs_tp1, done_mask, - importance_weights): - td_err, grads = sess.run( - [self.td_error, self.grads], - feed_dict={ - self.obs_t: obs_t, - self.act_t: act_t, - self.rew_t: rew_t, - self.obs_tp1: obs_tp1, - self.done_mask: done_mask, - self.importance_weights: importance_weights - }) - return td_err, grads - - def compute_td_error(self, sess, obs_t, act_t, rew_t, obs_tp1, done_mask, - importance_weights): - td_err = sess.run( - self.td_error, - feed_dict={ - self.obs_t: [np.array(ob) for ob in obs_t], - self.act_t: act_t, - self.rew_t: rew_t, - self.obs_tp1: [np.array(ob) for ob in obs_tp1], - self.done_mask: done_mask, - self.importance_weights: importance_weights - }) - return td_err - - def apply_gradients(self, sess, grads): - assert len(grads) == len(self.grads_and_vars) - feed_dict = {ph: g for (g, ph) in zip(grads, self.grads)} - sess.run( - [self.critic_train_expr, self.actor_train_expr], - feed_dict=feed_dict) - - def compute_apply(self, sess, obs_t, act_t, rew_t, obs_tp1, done_mask, - importance_weights): - td_err, _, _ = sess.run( - [self.td_error, self.critic_train_expr, self.actor_train_expr], - feed_dict={ - self.obs_t: obs_t, - self.act_t: act_t, - self.rew_t: rew_t, - self.obs_tp1: obs_tp1, - self.done_mask: done_mask, - self.importance_weights: importance_weights - }) - return td_err - - def reset_noise(self, sess): - sess.run(self.reset_noise_op) diff --git a/python/ray/rllib/ddpg2/ddpg_evaluator.py b/python/ray/rllib/ddpg2/ddpg_evaluator.py index 8a5ab5ed3f3a..5ba71028ce89 100644 --- a/python/ray/rllib/ddpg2/ddpg_evaluator.py +++ b/python/ray/rllib/ddpg2/ddpg_evaluator.py @@ -9,7 +9,7 @@ from ray.rllib.models.catalog import ModelCatalog from ray.rllib.optimizers import PolicyEvaluator from ray.rllib.utils.filter import NoFilter -from ray.rllib.utils.process_rollout import process_rollout +from ray.rllib.utils.process_rollout import compute_advantages from ray.rllib.utils.sampler import SyncSampler @@ -34,9 +34,7 @@ def sample(self): # since each sample is one step, no discounting needs to be applied; # this does not involve config["gamma"] - samples = process_rollout( - rollout, NoFilter(), - gamma=1.0, use_gae=False) + samples = compute_advantages(rollout, 0.0, gamma=1.0, use_gae=False) return samples diff --git a/python/ray/rllib/ddpg2/models.py b/python/ray/rllib/ddpg2/models.py index e785f518f541..855964ffb19c 100644 --- a/python/ray/rllib/ddpg2/models.py +++ b/python/ray/rllib/ddpg2/models.py @@ -227,7 +227,7 @@ def set_weights(self, weights): self.critic_vars.set_weights(critic_weights) self.actor_vars.set_weights(actor_weights) - def compute(self, ob): + def compute_single_action(self, ob, h, is_training): """Returns action, given state.""" flattened_ob = np.reshape(ob, [-1, np.prod(ob.shape)]) action = self.sess.run(self.output_action, {self.obs: flattened_ob}) @@ -235,7 +235,10 @@ def compute(self, ob): action += self.epsilon * 
self.rand_process.sample() if (self.epsilon > 0): self.epsilon -= self.config["noise_epsilon"] - return action[0], {} + return action[0], [], {} def value(self, *args): return 0 + + def get_initial_state(self): + return [] diff --git a/python/ray/rllib/dqn/apex.py b/python/ray/rllib/dqn/apex.py index 6de53203770c..34e6ecd912ef 100644 --- a/python/ray/rllib/dqn/apex.py +++ b/python/ray/rllib/dqn/apex.py @@ -9,26 +9,26 @@ APEX_DEFAULT_CONFIG = merge_dicts( DQN_CONFIG, { - 'optimizer_class': 'ApexOptimizer', - 'optimizer_config': + "optimizer_class": "ApexOptimizer", + "optimizer_config": merge_dicts( - DQN_CONFIG['optimizer_config'], { - 'max_weight_sync_delay': 400, - 'num_replay_buffer_shards': 4, - 'debug': False + DQN_CONFIG["optimizer_config"], { + "max_weight_sync_delay": 400, + "num_replay_buffer_shards": 4, + "debug": False }), - 'n_step': 3, - 'gpu': True, - 'num_workers': 32, - 'buffer_size': 2000000, - 'learning_starts': 50000, - 'train_batch_size': 512, - 'sample_batch_size': 50, - 'max_weight_sync_delay': 400, - 'target_network_update_freq': 500000, - 'timesteps_per_iteration': 25000, - 'per_worker_exploration': True, - 'worker_side_prioritization': True, + "n_step": 3, + "gpu": True, + "num_workers": 32, + "buffer_size": 2000000, + "learning_starts": 50000, + "train_batch_size": 512, + "sample_batch_size": 50, + "max_weight_sync_delay": 400, + "target_network_update_freq": 500000, + "timesteps_per_iteration": 25000, + "per_worker_exploration": True, + "worker_side_prioritization": True, }, ) diff --git a/python/ray/rllib/dqn/dqn.py b/python/ray/rllib/dqn/dqn.py index 0bf99cf1ff2a..9d2f698cf162 100644 --- a/python/ray/rllib/dqn/dqn.py +++ b/python/ray/rllib/dqn/dqn.py @@ -5,14 +5,13 @@ import pickle import os -import numpy as np -import tensorflow as tf - import ray from ray.rllib import optimizers -from ray.rllib.dqn.dqn_evaluator import DQNEvaluator +from ray.rllib.dqn.common.schedules import ConstantSchedule, LinearSchedule +from ray.rllib.dqn.dqn_policy_graph import DQNPolicyGraph +from ray.rllib.utils.common_policy_evaluator import CommonPolicyEvaluator, \ + collect_metrics from ray.rllib.agent import Agent -from ray.tune.result import TrainingResult from ray.tune.trial import Resources @@ -24,101 +23,84 @@ DEFAULT_CONFIG = { # === Model === # Whether to use dueling dqn - 'dueling': True, + "dueling": True, # Whether to use double dqn - 'double_q': True, + "double_q": True, # Hidden layer sizes of the state and action value networks - 'hiddens': [256], + "hiddens": [256], # N-step Q learning - 'n_step': 1, + "n_step": 1, # Config options to pass to the model constructor - 'model': {}, + "model": {}, # Discount factor for the MDP - 'gamma': 0.99, + "gamma": 0.99, # Arguments to pass to the env creator - 'env_config': {}, + "env_config": {}, # === Exploration === # Max num timesteps for annealing schedules. Exploration is annealed from # 1.0 to exploration_fraction over this number of timesteps scaled by # exploration_fraction - 'schedule_max_timesteps': 100000, + "schedule_max_timesteps": 100000, # Number of env steps to optimize for before returning - 'timesteps_per_iteration': 1000, + "timesteps_per_iteration": 1000, # Fraction of entire training period over which the exploration rate is # annealed - 'exploration_fraction': 0.1, + "exploration_fraction": 0.1, # Final value of random action probability - 'exploration_final_eps': 0.02, + "exploration_final_eps": 0.02, # Update the target network every `target_network_update_freq` steps. 
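Two target-network sync styles appear in this patch: DQN hard-copies the online Q weights into the target network every `target_network_update_freq` timesteps, while the DDPG graph blends them with a `tau` factor (soft update). A minimal NumPy sketch of both rules, using made-up weights rather than the actual TF variables:

import numpy as np

def sync_target(online, target, tau=1.0):
    # tau=1.0 reproduces the hard copy used by DQN's update_target();
    # 0 < tau < 1 gives the soft (Polyak) update used by DDPG.
    return [tau * w + (1.0 - tau) * wt for w, wt in zip(online, target)]

online_weights = [np.ones(3), np.full(3, 2.0)]
target_weights = [np.zeros(3), np.zeros(3)]
target_weights = sync_target(online_weights, target_weights, tau=0.001)  # soft
target_weights = sync_target(online_weights, target_weights, tau=1.0)    # hard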
- 'target_network_update_freq': 500, - # Whether to start with random actions instead of noops. - 'random_starts': True, + "target_network_update_freq": 500, # === Replay buffer === # Size of the replay buffer. Note that if async_updates is set, then # each worker will have a replay buffer of this size. - 'buffer_size': 50000, + "buffer_size": 50000, # If True prioritized replay buffer will be used. - 'prioritized_replay': True, + "prioritized_replay": True, # Alpha parameter for prioritized replay buffer. - 'prioritized_replay_alpha': 0.6, + "prioritized_replay_alpha": 0.6, # Beta parameter for sampling from prioritized replay buffer. - 'prioritized_replay_beta': 0.4, + "prioritized_replay_beta": 0.4, # Epsilon to add to the TD errors when updating priorities. - 'prioritized_replay_eps': 1e-6, + "prioritized_replay_eps": 1e-6, # Whether to clip rewards to [-1, 1] prior to adding to the replay buffer. - 'clip_rewards': True, + "clip_rewards": True, # === Optimization === # Learning rate for adam optimizer - 'lr': 5e-4, + "lr": 5e-4, # If not None, clip gradients during optimization at this value - 'grad_norm_clipping': 40, + "grad_norm_clipping": 40, # How many steps of the model to sample before learning starts. - 'learning_starts': 1000, + "learning_starts": 1000, # Update the replay buffer with this many samples at once. Note that # this setting applies per-worker if num_workers > 1. - 'sample_batch_size': 4, + "sample_batch_size": 4, # Size of a batched sampled from replay buffer for training. Note that # if async_updates is set, then each worker returns gradients for a # batch of this size. - 'train_batch_size': 32, - # Smooth the current average reward over this many previous episodes. - 'smoothing_num_episodes': 100, - - # === Tensorflow === - # Arguments to pass to tensorflow - 'tf_session_args': { - "device_count": {"CPU": 2}, - "log_device_placement": False, - "allow_soft_placement": True, - "gpu_options": { - "allow_growth": True - }, - "inter_op_parallelism_threads": 1, - "intra_op_parallelism_threads": 1, - }, + "train_batch_size": 32, # === Parallelism === # Whether to use a GPU for local optimization. - 'gpu': False, + "gpu": False, # Number of workers for collecting samples with. This only makes sense # to increase if your environment is particularly slow to sample, or if - # you're using the Async or Ape-X optimizers. - 'num_workers': 0, + # you"re using the Async or Ape-X optimizers. + "num_workers": 0, # Whether to allocate GPUs for workers (if > 0). - 'num_gpus_per_worker': 0, + "num_gpus_per_worker": 0, # Whether to allocate CPUs for workers (if > 0). - 'num_cpus_per_worker': 1, + "num_cpus_per_worker": 1, # Optimizer class to use. - 'optimizer_class': "LocalSyncReplayOptimizer", + "optimizer_class": "LocalSyncReplayOptimizer", # Config to pass to the optimizer. - 'optimizer_config': {}, + "optimizer_config": {}, # Whether to use a distribution of epsilons across workers for exploration. - 'per_worker_exploration': False, + "per_worker_exploration": False, # Whether to compute priorities on workers. 
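The prioritized-replay keys above follow the standard proportional-prioritization scheme: priorities are absolute TD errors plus `prioritized_replay_eps`, sampling probabilities are the `alpha`-exponentiated priorities, and the `weights` column fed to the loss is the `beta`-corrected, max-normalized importance-sampling factor. A small NumPy sketch of those formulas (the math the replay buffer implements, not the buffer itself):

import numpy as np

def per_sampling_terms(td_errors, alpha=0.6, beta=0.4, eps=1e-6):
    priorities = np.abs(td_errors) + eps           # proportional priorities
    probs = priorities ** alpha
    probs = probs / probs.sum()                    # sampling probabilities
    weights = (len(td_errors) * probs) ** (-beta)  # importance-sampling terms
    return probs, weights / weights.max()          # max-normalized weights

probs, weights = per_sampling_terms(np.array([0.5, 0.1, 2.0, 0.0]))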
- 'worker_side_prioritization': False + "worker_side_prioritization": False } @@ -127,6 +109,7 @@ class DQNAgent(Agent): _allow_unknown_subkeys = [ "model", "optimizer", "tf_session_args", "env_config"] _default_config = DEFAULT_CONFIG + _policy_graph = DQNPolicyGraph @classmethod def default_resource_request(cls, config): @@ -137,16 +120,31 @@ def default_resource_request(cls, config): extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"]) def _init(self): - self.local_evaluator = DQNEvaluator( - self.registry, self.env_creator, self.config, self.logdir, 0) - remote_cls = ray.remote( + adjusted_batch_size = ( + self.config["sample_batch_size"] + self.config["n_step"] - 1) + self.local_evaluator = CommonPolicyEvaluator( + self.env_creator, self._policy_graph, + batch_steps=adjusted_batch_size, + batch_mode="pack_episodes", preprocessor_pref="deepmind", + compress_observations=True, + registry=self.registry, env_config=self.config["env_config"], + model_config=self.config["model"], policy_config=self.config) + remote_cls = CommonPolicyEvaluator.as_remote( num_cpus=self.config["num_cpus_per_worker"], - num_gpus=self.config["num_gpus_per_worker"])( - DQNEvaluator) + num_gpus=self.config["num_gpus_per_worker"]) self.remote_evaluators = [ remote_cls.remote( - self.registry, self.env_creator, self.config, self.logdir, - i) + self.env_creator, self._policy_graph, + batch_steps=adjusted_batch_size, + batch_mode="pack_episodes", preprocessor_pref="deepmind", + compress_observations=True, + registry=self.registry, env_config=self.config["env_config"], + model_config=self.config["model"], policy_config=self.config) + for _ in range(self.config["num_workers"])] + + self.exploration0 = self._make_exploration_schedule(0) + self.explorations = [ + self._make_exploration_schedule(i) for i in range(self.config["num_workers"])] for k in OPTIMIZER_SHARED_CONFIGS: @@ -157,10 +155,25 @@ def _init(self): self.config["optimizer_config"], self.local_evaluator, self.remote_evaluators) - self.saver = tf.train.Saver(max_to_keep=None) self.last_target_update_ts = 0 self.num_target_updates = 0 + def _make_exploration_schedule(self, worker_index): + # Use either a different `eps` per worker, or a linear schedule. 
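The `adjusted_batch_size` computed in `_init` above over-collects by `n_step - 1` steps because the n-step rewrite later drops that many trailing transitions from each fragment; with the Ape-X defaults from this diff:

sample_batch_size = 50                     # Ape-X default in this diff
n_step = 3                                 # Ape-X default in this diff
adjusted_batch_size = sample_batch_size + n_step - 1   # 52 env steps collected
# the n-step rewrite then drops the trailing n_step - 1 rows:
assert adjusted_batch_size - (n_step - 1) == sample_batch_size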
+ if self.config["per_worker_exploration"]: + assert self.config["num_workers"] > 1, \ + "This requires multiple workers" + return ConstantSchedule( + 0.4 ** ( + 1 + worker_index / float( + self.config["num_workers"] - 1) * 7)) + return LinearSchedule( + schedule_timesteps=int( + self.config["exploration_fraction"] * + self.config["schedule_max_timesteps"]), + initial_p=1.0, + final_p=self.config["exploration_final_eps"]) + @property def global_timestep(self): return self.optimizer.num_steps_sampled @@ -168,7 +181,7 @@ def global_timestep(self): def update_target_if_needed(self): if self.global_timestep - self.last_target_update_ts > \ self.config["target_network_update_freq"]: - self.local_evaluator.update_target() + self.local_evaluator.for_policy(lambda p: p.update_target()) self.last_target_update_ts = self.global_timestep self.num_target_updates += 1 @@ -177,58 +190,25 @@ def _train(self): while (self.global_timestep - start_timestep < self.config["timesteps_per_iteration"]): - self.optimizer.step() self.update_target_if_needed() - self.local_evaluator.set_global_timestep(self.global_timestep) - for e in self.remote_evaluators: - e.set_global_timestep.remote(self.global_timestep) - - return self._train_stats(start_timestep) - - def _train_stats(self, start_timestep): - if self.remote_evaluators: - stats = ray.get([ - e.stats.remote() for e in self.remote_evaluators]) - else: - stats = self.local_evaluator.stats() - if not isinstance(stats, list): - stats = [stats] - - mean_100ep_reward = 0.0 - mean_100ep_length = 0.0 - num_episodes = 0 - explorations = [] - - if self.config["per_worker_exploration"]: - # Return stats from workers with the lowest 20% of exploration - test_stats = stats[-int(max(1, len(stats)*0.2)):] - else: - test_stats = stats - - for s in test_stats: - mean_100ep_reward += s["mean_100ep_reward"] / len(test_stats) - mean_100ep_length += s["mean_100ep_length"] / len(test_stats) - - for s in stats: - num_episodes += s["num_episodes"] - explorations.append(s["exploration"]) - - opt_stats = self.optimizer.stats() - - result = TrainingResult( - episode_reward_mean=mean_100ep_reward, - episode_len_mean=mean_100ep_length, - episodes_total=num_episodes, - timesteps_this_iter=self.global_timestep - start_timestep, + exp_vals = [self.exploration0.value(self.global_timestep)] + self.local_evaluator.for_policy( + lambda p: p.set_epsilon(exp_vals[0])) + for i, e in enumerate(self.remote_evaluators): + exp_val = self.explorations[i].value(self.global_timestep) + e.for_policy.remote(lambda p: p.set_epsilon(exp_val)) + exp_vals.append(exp_val) + + result = collect_metrics( + self.local_evaluator, self.remote_evaluators) + return result._replace( info=dict({ - "min_exploration": min(explorations), - "max_exploration": max(explorations), + "min_exploration": min(exp_vals), + "max_exploration": max(exp_vals), "num_target_updates": self.num_target_updates, - }, **opt_stats)) - - return result + }, **self.optimizer.stats())) def _stop(self): # workaround for https://github.com/ray-project/ray/issues/1516 @@ -236,10 +216,8 @@ def _stop(self): ev.__ray_terminate__.remote() def _save(self, checkpoint_dir): - checkpoint_path = self.saver.save( - self.local_evaluator.sess, - os.path.join(checkpoint_dir, "checkpoint"), - global_step=self.iteration) + checkpoint_path = os.path.join( + checkpoint_dir, "checkpoint-{}".format(self.iteration)) extra_data = [ self.local_evaluator.save(), ray.get([e.save.remote() for e in self.remote_evaluators]), @@ -250,7 +228,6 @@ def _save(self, checkpoint_dir): 
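A plain-Python sketch of the two schedules `_make_exploration_schedule` chooses between, using the default values from this diff (`exploration_fraction` 0.1, `schedule_max_timesteps` 100000, final epsilon 0.02, and the Ape-X `num_workers` of 32); the helper names are illustrative:

def per_worker_eps(worker_index, num_workers):
    # ConstantSchedule branch: epsilons spread geometrically from 0.4 down
    # to 0.4 ** 8 (~0.00066) across the workers.
    return 0.4 ** (1 + worker_index / float(num_workers - 1) * 7)

def linear_eps(t, exploration_fraction=0.1, schedule_max_timesteps=100000,
               final_eps=0.02):
    # LinearSchedule branch: anneal from 1.0 to final_eps, then hold.
    horizon = exploration_fraction * schedule_max_timesteps
    frac = min(t / horizon, 1.0)
    return 1.0 + frac * (final_eps - 1.0)

eps_per_worker = [per_worker_eps(i, 32) for i in range(32)]  # 0.4 ... 0.00066
eps_at_5000 = linear_eps(5000)  # 0.51, halfway through the 10k-step anneal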
return checkpoint_path def _restore(self, checkpoint_path): - self.saver.restore(self.local_evaluator.sess, checkpoint_path) extra_data = pickle.load(open(checkpoint_path + ".extra_data", "rb")) self.local_evaluator.restore(extra_data[0]) ray.get([ @@ -260,6 +237,9 @@ def _restore(self, checkpoint_path): self.num_target_updates = extra_data[3] self.last_target_update_ts = extra_data[4] - def compute_action(self, observation): - return self.local_evaluator.dqn_graph.act( - self.local_evaluator.sess, np.array(observation)[None], 0.0)[0] + def compute_action(self, observation, state=None): + if state is None: + state = [] + return self.local_evaluator.for_policy( + lambda p: p.compute_single_action( + observation, state, is_training=False)[0]) diff --git a/python/ray/rllib/dqn/dqn_evaluator.py b/python/ray/rllib/dqn/dqn_evaluator.py deleted file mode 100644 index 758dc5f819d4..000000000000 --- a/python/ray/rllib/dqn/dqn_evaluator.py +++ /dev/null @@ -1,207 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from gym.spaces import Discrete -import numpy as np -import tensorflow as tf - -import ray -from ray.rllib.utils.error import UnsupportedSpaceException -from ray.rllib.dqn import models -from ray.rllib.dqn.common.wrappers import wrap_dqn -from ray.rllib.dqn.common.schedules import ConstantSchedule, LinearSchedule -from ray.rllib.optimizers import SampleBatch, PolicyEvaluator -from ray.rllib.utils.compression import pack - - -def adjust_nstep(n_step, gamma, obs, actions, rewards, new_obs, dones): - """Rewrites the given trajectory fragments to encode n-step rewards. - - reward[i] = ( - reward[i] * gamma**0 + - reward[i+1] * gamma**1 + - ... + - reward[i+n_step-1] * gamma**(n_step-1)) - - The ith new_obs is also adjusted to point to the (i+n_step-1)'th new obs. - - If the episode finishes, the reward will be truncated. After this rewrite, - all the arrays will be shortened by (n_step - 1). - """ - for i in range(len(rewards) - n_step + 1): - if dones[i]: - continue # episode end - for j in range(1, n_step): - new_obs[i] = new_obs[i + j] - rewards[i] += gamma ** j * rewards[i + j] - if dones[i + j]: - break # episode end - # truncate ends of the trajectory - new_len = len(obs) - n_step + 1 - for arr in [obs, actions, rewards, new_obs, dones]: - del arr[new_len:] - - -class DQNEvaluator(PolicyEvaluator): - """The DQN Evaluator. - - TODO(rliaw): Support observation/reward filters?""" - - def __init__(self, registry, env_creator, config, logdir, worker_index): - env = env_creator(config["env_config"]) - env = wrap_dqn(registry, env, config["model"], config["random_starts"]) - self.env = env - self.config = config - - if not isinstance(env.action_space, Discrete): - raise UnsupportedSpaceException( - "Action space {} is not supported for DQN.".format( - env.action_space)) - - tf_config = tf.ConfigProto(**config["tf_session_args"]) - self.sess = tf.Session(config=tf_config) - self.dqn_graph = models.DQNGraph(registry, env, config, logdir) - - # Use either a different `eps` per worker, or a linear schedule. 
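Across this patch, `compute_action` on the agents now delegates to the policy's `compute_single_action`, which returns an `(action, rnn_state_out, info)` tuple; the agent keeps only the action. A minimal stub illustrating that convention (`RandomPolicy` is purely illustrative, not an RLlib class):

import random

class RandomPolicy(object):
    """Illustrative stub, not an RLlib class."""

    def get_initial_state(self):
        return []  # no RNN state

    def compute_single_action(self, obs, state, is_training=False):
        # (action, rnn_state_out, extra_info), matching the new convention
        return random.randint(0, 1), [], {}

policy = RandomPolicy()
state = policy.get_initial_state()
action = policy.compute_single_action([0.0, 0.0], state, is_training=False)[0]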
- if config["per_worker_exploration"]: - assert config["num_workers"] > 1, "This requires multiple workers" - self.exploration = ConstantSchedule( - 0.4 ** ( - 1 + worker_index / float(config["num_workers"] - 1) * 7)) - else: - self.exploration = LinearSchedule( - schedule_timesteps=int( - config["exploration_fraction"] * - config["schedule_max_timesteps"]), - initial_p=1.0, - final_p=config["exploration_final_eps"]) - - # Initialize the parameters and copy them to the target network. - self.sess.run(tf.global_variables_initializer()) - self.dqn_graph.update_target(self.sess) - self.global_timestep = 0 - self.local_timestep = 0 - - # Note that this encompasses both the Q and target network - self.variables = ray.experimental.TensorFlowVariables( - tf.group(self.dqn_graph.q_t, self.dqn_graph.q_tp1), self.sess) - - self.episode_rewards = [0.0] - self.episode_lengths = [0.0] - self.saved_mean_reward = None - - self.obs = self.env.reset() - - def set_global_timestep(self, global_timestep): - self.global_timestep = global_timestep - - def update_target(self): - self.dqn_graph.update_target(self.sess) - - def sample(self): - obs, actions, rewards, new_obs, dones = [], [], [], [], [] - for _ in range( - self.config["sample_batch_size"] + self.config["n_step"] - 1): - ob, act, rew, ob1, done = self._step(self.global_timestep) - obs.append(ob) - actions.append(act) - rewards.append(rew) - new_obs.append(ob1) - dones.append(done) - - # N-step Q adjustments - if self.config["n_step"] > 1: - # Adjust for steps lost from truncation - self.local_timestep -= (self.config["n_step"] - 1) - adjust_nstep( - self.config["n_step"], self.config["gamma"], - obs, actions, rewards, new_obs, dones) - - batch = SampleBatch({ - "obs": [pack(np.array(o)) for o in obs], "actions": actions, - "rewards": rewards, - "new_obs": [pack(np.array(o)) for o in new_obs], "dones": dones, - "weights": np.ones_like(rewards)}) - assert (batch.count == self.config["sample_batch_size"]) - - # Prioritize on the worker side - if self.config["worker_side_prioritization"]: - td_errors = self.dqn_graph.compute_td_error( - self.sess, obs, batch["actions"], batch["rewards"], - new_obs, batch["dones"], batch["weights"]) - new_priorities = ( - np.abs(td_errors) + self.config["prioritized_replay_eps"]) - batch.data["weights"] = new_priorities - - return batch - - def compute_gradients(self, samples): - td_err, grads = self.dqn_graph.compute_gradients( - self.sess, samples["obs"], samples["actions"], samples["rewards"], - samples["new_obs"], samples["dones"], samples["weights"]) - return grads, {"td_error": td_err} - - def apply_gradients(self, grads): - self.dqn_graph.apply_gradients(self.sess, grads) - - def compute_apply(self, samples): - td_error = self.dqn_graph.compute_apply( - self.sess, samples["obs"], samples["actions"], samples["rewards"], - samples["new_obs"], samples["dones"], samples["weights"]) - return {"td_error": td_error} - - def get_weights(self): - return self.variables.get_weights() - - def set_weights(self, weights): - self.variables.set_weights(weights) - - def _step(self, global_timestep): - """Takes a single step, and returns the result of the step.""" - action = self.dqn_graph.act( - self.sess, np.array(self.obs)[None], - self.exploration.value(global_timestep))[0] - new_obs, rew, done, _ = self.env.step(action) - ret = (self.obs, action, rew, new_obs, float(done)) - self.obs = new_obs - self.episode_rewards[-1] += rew - self.episode_lengths[-1] += 1 - if done: - self.obs = self.env.reset() - 
self.episode_rewards.append(0.0) - self.episode_lengths.append(0.0) - self.local_timestep += 1 - return ret - - def stats(self): - n = self.config["smoothing_num_episodes"] + 1 - mean_100ep_reward = round(np.mean(self.episode_rewards[-n:-1]), 5) - mean_100ep_length = round(np.mean(self.episode_lengths[-n:-1]), 5) - exploration = self.exploration.value(self.global_timestep) - return { - "mean_100ep_reward": mean_100ep_reward, - "mean_100ep_length": mean_100ep_length, - "num_episodes": len(self.episode_rewards), - "exploration": exploration, - "local_timestep": self.local_timestep, - } - - def save(self): - return [ - self.exploration, - self.episode_rewards, - self.episode_lengths, - self.saved_mean_reward, - self.obs, - self.global_timestep, - self.local_timestep] - - def restore(self, data): - self.exploration = data[0] - self.episode_rewards = data[1] - self.episode_lengths = data[2] - self.saved_mean_reward = data[3] - self.obs = data[4] - self.global_timestep = data[5] - self.local_timestep = data[6] diff --git a/python/ray/rllib/dqn/models.py b/python/ray/rllib/dqn/dqn_policy_graph.py similarity index 51% rename from python/ray/rllib/dqn/models.py rename to python/ray/rllib/dqn/dqn_policy_graph.py index 6629b6126acf..75c1d06f0e4e 100644 --- a/python/ray/rllib/dqn/models.py +++ b/python/ray/rllib/dqn/dqn_policy_graph.py @@ -2,13 +2,240 @@ from __future__ import division from __future__ import print_function +from gym.spaces import Discrete import numpy as np - import tensorflow as tf import tensorflow.contrib.layers as layers from ray.rllib.models import ModelCatalog -from ray.rllib.optimizers.multi_gpu_impl import TOWER_SCOPE_NAME +from ray.rllib.optimizers.sample_batch import SampleBatch +from ray.rllib.utils.error import UnsupportedSpaceException +from ray.rllib.utils.tf_policy_graph import TFPolicyGraph + + +Q_SCOPE = "q_func" +Q_TARGET_SCOPE = "target_q_func" + + +def adjust_nstep(n_step, gamma, obs, actions, rewards, new_obs, dones): + """Rewrites the given trajectory fragments to encode n-step rewards. + + reward[i] = ( + reward[i] * gamma**0 + + reward[i+1] * gamma**1 + + ... + + reward[i+n_step-1] * gamma**(n_step-1)) + + The ith new_obs is also adjusted to point to the (i+n_step-1)'th new obs. + + If the episode finishes, the reward will be truncated. After this rewrite, + all the arrays will be shortened by (n_step - 1). 
+ """ + for i in range(len(rewards) - n_step + 1): + if dones[i]: + continue # episode end + for j in range(1, n_step): + new_obs[i] = new_obs[i + j] + rewards[i] += gamma ** j * rewards[i + j] + if dones[i + j]: + break # episode end + # truncate ends of the trajectory + new_len = len(obs) - n_step + 1 + for arr in [obs, actions, rewards, new_obs, dones]: + del arr[new_len:] + + +class DQNPolicyGraph(TFPolicyGraph): + def __init__(self, observation_space, action_space, registry, config): + if not isinstance(action_space, Discrete): + raise UnsupportedSpaceException( + "Action space {} is not supported for DQN.".format( + action_space)) + + self.config = config + self.cur_epsilon = 1.0 + num_actions = action_space.n + + # Action inputs + self.stochastic = tf.placeholder(tf.bool, (), name="stochastic") + self.eps = tf.placeholder(tf.float32, (), name="eps") + self.cur_observations = tf.placeholder( + tf.float32, shape=(None,) + observation_space.shape) + + # Action Q network + with tf.variable_scope(Q_SCOPE) as scope: + q_values = _build_q_network( + registry, self.cur_observations, num_actions, config) + self.q_func_vars = _scope_vars(scope.name) + + # Action outputs + self.output_actions = _build_action_network( + q_values, + self.cur_observations, + num_actions, + self.stochastic, + self.eps) + + # Replay inputs + self.obs_t = tf.placeholder( + tf.float32, shape=(None,) + observation_space.shape) + self.act_t = tf.placeholder(tf.int32, [None], name="action") + self.rew_t = tf.placeholder(tf.float32, [None], name="reward") + self.obs_tp1 = tf.placeholder( + tf.float32, shape=(None,) + observation_space.shape) + self.done_mask = tf.placeholder(tf.float32, [None], name="done") + self.importance_weights = tf.placeholder( + tf.float32, [None], name="weight") + + # q network evaluation + with tf.variable_scope(Q_SCOPE, reuse=True): + q_t = _build_q_network( + registry, self.obs_t, num_actions, config) + + # target q network evalution + with tf.variable_scope(Q_TARGET_SCOPE) as scope: + q_tp1 = _build_q_network( + registry, self.obs_tp1, num_actions, config) + self.target_q_func_vars = _scope_vars(scope.name) + + # q scores for actions which we know were selected in the given state. 
+ q_t_selected = tf.reduce_sum( + q_t * tf.one_hot(self.act_t, num_actions), 1) + + # compute estimate of best possible value starting from state at t + 1 + if config["double_q"]: + with tf.variable_scope(Q_SCOPE, reuse=True): + q_tp1_using_online_net = _build_q_network( + registry, self.obs_tp1, num_actions, config) + q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1) + q_tp1_best = tf.reduce_sum( + q_tp1 * tf.one_hot( + q_tp1_best_using_online_net, num_actions), 1) + else: + q_tp1_best = tf.reduce_max(q_tp1, 1) + q_tp1_best_masked = (1.0 - self.done_mask) * q_tp1_best + + # compute RHS of bellman equation + q_t_selected_target = ( + self.rew_t + + config["gamma"] ** config["n_step"] * q_tp1_best_masked) + + # compute the error (potentially clipped) + self.td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) + self.loss = tf.reduce_mean( + self.importance_weights * _huber_loss(self.td_error)) + + # update_target_fn will be called periodically to copy Q network to + # target Q network + update_target_expr = [] + for var, var_target in zip( + sorted(self.q_func_vars, key=lambda v: v.name), + sorted(self.target_q_func_vars, key=lambda v: v.name)): + update_target_expr.append(var_target.assign(var)) + self.update_target_expr = tf.group(*update_target_expr) + + # initialize TFPolicyGraph + self.sess = tf.get_default_session() + self.loss_inputs = [ + ("obs", self.obs_t), + ("actions", self.act_t), + ("rewards", self.rew_t), + ("new_obs", self.obs_tp1), + ("dones", self.done_mask), + ("weights", self.importance_weights), + ] + self.is_training = tf.placeholder_with_default(True, ()) + TFPolicyGraph.__init__( + self, self.sess, obs_input=self.cur_observations, + action_sampler=self.output_actions, loss=self.loss, + loss_inputs=self.loss_inputs, is_training=self.is_training) + self.sess.run(tf.global_variables_initializer()) + + def optimizer(self): + return tf.train.AdamOptimizer(learning_rate=self.config["lr"]) + + def gradients(self, optimizer): + if self.config["grad_norm_clipping"] is not None: + grads_and_vars = _minimize_and_clip( + optimizer, self.loss, var_list=self.q_func_vars, + clip_val=self.config["grad_norm_clipping"]) + else: + grads_and_vars = optimizer.compute_gradients( + self.loss, var_list=self.q_func_vars) + grads_and_vars = [ + (g, v) for (g, v) in grads_and_vars if g is not None] + return grads_and_vars + + def extra_compute_action_feed_dict(self): + return { + self.stochastic: True, + self.eps: self.cur_epsilon, + } + + def extra_compute_grad_fetches(self): + return { + "td_error": self.td_error, + } + + def postprocess_trajectory(self, sample_batch, other_agent_batches=None): + return _postprocess_dqn(self, sample_batch) + + def compute_td_error( + self, obs_t, act_t, rew_t, obs_tp1, done_mask, importance_weights): + td_err = self.sess.run( + self.td_error, + feed_dict={ + self.obs_t: [np.array(ob) for ob in obs_t], + self.act_t: act_t, + self.rew_t: rew_t, + self.obs_tp1: [np.array(ob) for ob in obs_tp1], + self.done_mask: done_mask, + self.importance_weights: importance_weights + }) + return td_err + + def update_target(self): + return self.sess.run(self.update_target_expr) + + def set_epsilon(self, epsilon): + self.cur_epsilon = epsilon + + def get_state(self): + return [TFPolicyGraph.get_state(self), self.cur_epsilon] + + def set_state(self, state): + TFPolicyGraph.set_state(self, state[0]) + self.set_epsilon(state[1]) + + +def _postprocess_dqn(policy_graph, sample_batch): + obs, actions, rewards, new_obs, dones = [ + list(x) for x in 
sample_batch.columns( + ["obs", "actions", "rewards", "new_obs", "dones"])] + + # N-step Q adjustments + if policy_graph.config["n_step"] > 1: + adjust_nstep( + policy_graph.config["n_step"], policy_graph.config["gamma"], + obs, actions, rewards, new_obs, dones) + + batch = SampleBatch({ + "obs": obs, "actions": actions, "rewards": rewards, + "new_obs": new_obs, "dones": dones, + "weights": np.ones_like(rewards)}) + assert batch.count == policy_graph.config["sample_batch_size"], \ + (batch.count, policy_graph.config["sample_batch_size"]) + + # Prioritize on the worker side + if policy_graph.config["worker_side_prioritization"]: + td_errors = policy_graph.compute_td_error( + batch["obs"], batch["actions"], batch["rewards"], + batch["new_obs"], batch["dones"], batch["weights"]) + new_priorities = ( + np.abs(td_errors) + policy_graph.config["prioritized_replay_eps"]) + batch.data["weights"] = new_priorities + + return batch def _build_q_network(registry, inputs, num_actions, config): @@ -98,205 +325,3 @@ def _scope_vars(scope, trainable_only=False): tf.GraphKeys.TRAINABLE_VARIABLES if trainable_only else tf.GraphKeys.VARIABLES, scope=scope if isinstance(scope, str) else scope.name) - - -class ModelAndLoss(object): - """Holds the model and loss function. - - Both graphs are necessary in order for the multi-gpu SGD implementation - to create towers on each device. - """ - - def __init__( - self, registry, num_actions, config, - obs_t, act_t, rew_t, obs_tp1, done_mask, importance_weights): - # q network evaluation - with tf.variable_scope("q_func", reuse=True): - self.q_t = _build_q_network(registry, obs_t, num_actions, config) - - # target q network evalution - with tf.variable_scope("target_q_func") as scope: - self.q_tp1 = _build_q_network( - registry, obs_tp1, num_actions, config) - self.target_q_func_vars = _scope_vars(scope.name) - - # q scores for actions which we know were selected in the given state. 
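The `double_q` branch above decouples action selection (online network) from action evaluation (target network) when forming the Bellman target. A NumPy sketch of that computation next to the single-estimator variant, with made-up Q values:

import numpy as np

def bellman_target(rew, done, q_tp1_target, q_tp1_online=None,
                   gamma=0.99, n_step=1):
    if q_tp1_online is not None:
        # Double DQN: the online net picks the action, the target net scores it.
        best_act = np.argmax(q_tp1_online, axis=1)
        q_best = q_tp1_target[np.arange(len(rew)), best_act]
    else:
        # Vanilla DQN: the target net both picks and scores the action.
        q_best = np.max(q_tp1_target, axis=1)
    return rew + gamma ** n_step * (1.0 - done) * q_best

rew = np.array([1.0, 0.0])
done = np.array([0.0, 1.0])
q_target = np.array([[0.5, 2.0], [1.0, 3.0]])
q_online = np.array([[2.0, 0.1], [0.0, 1.0]])
double_target = bellman_target(rew, done, q_target, q_online)  # [1.495, 0.0]
single_target = bellman_target(rew, done, q_target)            # [2.98, 0.0]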
- q_t_selected = tf.reduce_sum( - self.q_t * tf.one_hot(act_t, num_actions), 1) - - # compute estimate of best possible value starting from state at t + 1 - if config["double_q"]: - with tf.variable_scope("q_func", reuse=True): - q_tp1_using_online_net = _build_q_network( - registry, obs_tp1, num_actions, config) - q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1) - q_tp1_best = tf.reduce_sum( - self.q_tp1 * tf.one_hot( - q_tp1_best_using_online_net, num_actions), 1) - else: - q_tp1_best = tf.reduce_max(self.q_tp1, 1) - q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best - - # compute RHS of bellman equation - q_t_selected_target = ( - rew_t + config["gamma"] ** config["n_step"] * q_tp1_best_masked) - - # compute the error (potentially clipped) - self.td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) - errors = _huber_loss(self.td_error) - - weighted_error = tf.reduce_mean(importance_weights * errors) - - self.loss = weighted_error - - -class DQNGraph(object): - def __init__(self, registry, env, config, logdir): - self.env = env - num_actions = env.action_space.n - optimizer = tf.train.AdamOptimizer(learning_rate=config["lr"]) - - # Action inputs - self.stochastic = tf.placeholder(tf.bool, (), name="stochastic") - self.eps = tf.placeholder(tf.float32, (), name="eps") - self.cur_observations = tf.placeholder( - tf.float32, shape=(None,) + env.observation_space.shape) - - # Action Q network - q_scope_name = TOWER_SCOPE_NAME + "/q_func" - with tf.variable_scope(q_scope_name) as scope: - q_values = _build_q_network( - registry, self.cur_observations, num_actions, config) - q_func_vars = _scope_vars(scope.name) - - # Action outputs - self.output_actions = _build_action_network( - q_values, - self.cur_observations, - num_actions, - self.stochastic, - self.eps) - - # Replay inputs - self.obs_t = tf.placeholder( - tf.float32, shape=(None,) + env.observation_space.shape) - self.act_t = tf.placeholder(tf.int32, [None], name="action") - self.rew_t = tf.placeholder(tf.float32, [None], name="reward") - self.obs_tp1 = tf.placeholder( - tf.float32, shape=(None,) + env.observation_space.shape) - self.done_mask = tf.placeholder(tf.float32, [None], name="done") - self.importance_weights = tf.placeholder( - tf.float32, [None], name="weight") - - def build_loss( - obs_t, act_t, rew_t, obs_tp1, done_mask, importance_weights): - return ModelAndLoss( - registry, - num_actions, config, - obs_t, act_t, rew_t, obs_tp1, done_mask, importance_weights) - - self.loss_inputs = [ - ("obs", self.obs_t), - ("actions", self.act_t), - ("rewards", self.rew_t), - ("new_obs", self.obs_tp1), - ("dones", self.done_mask), - ("weights", self.importance_weights), - ] - - with tf.variable_scope(TOWER_SCOPE_NAME): - loss_obj = build_loss( - self.obs_t, self.act_t, self.rew_t, self.obs_tp1, - self.done_mask, self.importance_weights) - - self.build_loss = build_loss - - weighted_error = loss_obj.loss - target_q_func_vars = loss_obj.target_q_func_vars - self.q_t = loss_obj.q_t - self.q_tp1 = loss_obj.q_tp1 - self.td_error = loss_obj.td_error - - # compute optimization op (potentially with gradient clipping) - if config["grad_norm_clipping"] is not None: - self.grads_and_vars = _minimize_and_clip( - optimizer, weighted_error, var_list=q_func_vars, - clip_val=config["grad_norm_clipping"]) - else: - self.grads_and_vars = optimizer.compute_gradients( - weighted_error, var_list=q_func_vars) - self.grads_and_vars = [ - (g, v) for (g, v) in self.grads_and_vars if g is not None] - self.grads = [g for (g, v) in 
self.grads_and_vars] - self.train_expr = optimizer.apply_gradients(self.grads_and_vars) - - # update_target_fn will be called periodically to copy Q network to - # target Q network - update_target_expr = [] - for var, var_target in zip( - sorted(q_func_vars, key=lambda v: v.name), - sorted(target_q_func_vars, key=lambda v: v.name)): - update_target_expr.append(var_target.assign(var)) - self.update_target_expr = tf.group(*update_target_expr) - - def update_target(self, sess): - return sess.run(self.update_target_expr) - - def act(self, sess, obs, eps, stochastic=True): - return sess.run( - self.output_actions, - feed_dict={ - self.cur_observations: obs, - self.stochastic: stochastic, - self.eps: eps, - }) - - def compute_gradients( - self, sess, obs_t, act_t, rew_t, obs_tp1, done_mask, - importance_weights): - td_err, grads = sess.run( - [self.td_error, self.grads], - feed_dict={ - self.obs_t: obs_t, - self.act_t: act_t, - self.rew_t: rew_t, - self.obs_tp1: obs_tp1, - self.done_mask: done_mask, - self.importance_weights: importance_weights - }) - return td_err, grads - - def compute_td_error( - self, sess, obs_t, act_t, rew_t, obs_tp1, done_mask, - importance_weights): - td_err = sess.run( - self.td_error, - feed_dict={ - self.obs_t: [np.array(ob) for ob in obs_t], - self.act_t: act_t, - self.rew_t: rew_t, - self.obs_tp1: [np.array(ob) for ob in obs_tp1], - self.done_mask: done_mask, - self.importance_weights: importance_weights - }) - return td_err - - def apply_gradients(self, sess, grads): - assert len(grads) == len(self.grads_and_vars) - feed_dict = {ph: g for (g, ph) in zip(grads, self.grads)} - sess.run(self.train_expr, feed_dict=feed_dict) - - def compute_apply( - self, sess, obs_t, act_t, rew_t, obs_tp1, done_mask, - importance_weights): - td_err, _ = sess.run( - [self.td_error, self.train_expr], - feed_dict={ - self.obs_t: obs_t, - self.act_t: act_t, - self.rew_t: rew_t, - self.obs_tp1: obs_tp1, - self.done_mask: done_mask, - self.importance_weights: importance_weights - }) - return td_err diff --git a/python/ray/rllib/models/lstm.py b/python/ray/rllib/models/lstm.py index 1d950506b0b7..14d8a9371090 100644 --- a/python/ray/rllib/models/lstm.py +++ b/python/ray/rllib/models/lstm.py @@ -35,8 +35,8 @@ def _init(self, inputs, num_outputs, options): lstm = rnn.rnn_cell.BasicLSTMCell(size, state_is_tuple=True) step_size = tf.shape(self.x)[:1] - c_init = np.zeros((1, lstm.state_size.c), np.float32) - h_init = np.zeros((1, lstm.state_size.h), np.float32) + c_init = np.zeros(lstm.state_size.c, np.float32) + h_init = np.zeros(lstm.state_size.h, np.float32) self.state_init = [c_init, h_init] c_in = tf.placeholder(tf.float32, [1, lstm.state_size.c]) h_in = tf.placeholder(tf.float32, [1, lstm.state_size.h]) diff --git a/python/ray/rllib/models/pytorch/misc.py b/python/ray/rllib/models/pytorch/misc.py index dc725265cf87..cd54fc04b178 100644 --- a/python/ray/rllib/models/pytorch/misc.py +++ b/python/ray/rllib/models/pytorch/misc.py @@ -7,18 +7,14 @@ import torch -def convert_batch(trajectory, has_features=False): +def convert_batch(trajectory): """Convert trajectory from numpy to PT variable""" states = torch.from_numpy(trajectory["obs"]).float() acs = torch.from_numpy(trajectory["actions"]) advs = torch.from_numpy( trajectory["advantages"].copy()).float().reshape(-1) rs = torch.from_numpy(trajectory["rewards"]).float().reshape(-1) - if has_features: - features = [torch.from_numpy(f) for f in trajectory["features"]] - else: - features = trajectory["features"] - return states, acs, advs, rs, 
features + return states, acs, advs, rs def var_to_np(var): diff --git a/python/ray/rllib/optimizers/multi_gpu_impl.py b/python/ray/rllib/optimizers/multi_gpu_impl.py index 1ff6bff3f05f..a7703eb46bd5 100644 --- a/python/ray/rllib/optimizers/multi_gpu_impl.py +++ b/python/ray/rllib/optimizers/multi_gpu_impl.py @@ -43,7 +43,7 @@ class LocalSyncParallelOptimizer(object): processed. build_loss: Function that takes the specified inputs and returns an object with a 'loss' property that is a scalar Tensor. For example, - ray.rllib.ppo.ProximalPolicyLoss. + ray.rllib.ppo.ProximalPolicyGraph. logdir: Directory to place debugging output in. grad_norm_clipping: None or int stdev to clip grad norms by """ diff --git a/python/ray/rllib/optimizers/policy_optimizer.py b/python/ray/rllib/optimizers/policy_optimizer.py index 1e31edc66ea1..9f705af98394 100644 --- a/python/ray/rllib/optimizers/policy_optimizer.py +++ b/python/ray/rllib/optimizers/policy_optimizer.py @@ -38,18 +38,24 @@ def make( Args: evaluator_cls (class): Python class of the evaluators to create. - evaluator_args (list): List of constructor args for the evaluators. + evaluator_args (list|dict): Constructor args for the evaluators. num_workers (int): Number of remote evaluators to create in addition to a local evaluator. This can be zero or greater. optimizer_config (dict): Keyword arguments to pass to the optimizer class constructor. """ - local_evaluator = evaluator_cls(*evaluator_args) remote_cls = ray.remote(**evaluator_resources)(evaluator_cls) - remote_evaluators = [ - remote_cls.remote(*evaluator_args) - for _ in range(num_workers)] + if isinstance(evaluator_args, list): + local_evaluator = evaluator_cls(*evaluator_args) + remote_evaluators = [ + remote_cls.remote(*evaluator_args) + for _ in range(num_workers)] + else: + local_evaluator = evaluator_cls(**evaluator_args) + remote_evaluators = [ + remote_cls.remote(**evaluator_args) + for _ in range(num_workers)] return cls(optimizer_config, local_evaluator, remote_evaluators) def __init__(self, config, local_evaluator, remote_evaluators): diff --git a/python/ray/rllib/optimizers/sample_batch.py b/python/ray/rllib/optimizers/sample_batch.py index 5e5e1e95b0b3..35f8eec254d4 100644 --- a/python/ray/rllib/optimizers/sample_batch.py +++ b/python/ray/rllib/optimizers/sample_batch.py @@ -2,17 +2,22 @@ from __future__ import division from __future__ import print_function +import collections import numpy as np -def arrayify(s): - if type(s) in [int, float, str, np.ndarray]: - return s - elif type(s) is list: - # recursive call to convert LazyFrames to arrays - return np.array([arrayify(x) for x in s]) - else: - return np.array(s) +class SampleBatchBuilder(object): + """Util to build a SampleBatch incrementally.""" + + def __init__(self): + self.buffers = collections.defaultdict(list) + + def add_values(self, **values): + for k, v in values.items(): + self.buffers[k].append(v) + + def build(self): + return SampleBatch({k: np.array(v) for k, v in self.buffers.items()}) class SampleBatch(object): diff --git a/python/ray/rllib/pg/pg.py b/python/ray/rllib/pg/pg.py index c3726f89f504..0e8912ff036b 100644 --- a/python/ray/rllib/pg/pg.py +++ b/python/ray/rllib/pg/pg.py @@ -2,13 +2,11 @@ from __future__ import division from __future__ import print_function -import numpy as np - -import ray -from ray.rllib.optimizers import LocalSyncOptimizer -from ray.rllib.pg.pg_evaluator import PGEvaluator from ray.rllib.agent import Agent -from ray.tune.result import TrainingResult +from ray.rllib.optimizers import 
LocalSyncOptimizer +from ray.rllib.pg.pg_policy_graph import PGPolicyGraph +from ray.rllib.utils.common_policy_evaluator import CommonPolicyEvaluator, \ + collect_metrics from ray.tune.trial import Resources @@ -33,7 +31,6 @@ class PGAgent(Agent): - """Simple policy gradient agent. This is an example agent to show how to implement algorithms in RLlib. @@ -50,34 +47,28 @@ def default_resource_request(cls, config): def _init(self): self.optimizer = LocalSyncOptimizer.make( - evaluator_cls=PGEvaluator, - evaluator_args=[self.registry, self.env_creator, self.config], + evaluator_cls=CommonPolicyEvaluator, + evaluator_args={ + "env_creator": self.env_creator, + "policy_graph": PGPolicyGraph, + "batch_steps": self.config["batch_size"], + "batch_mode": "truncate_episodes", + "registry": self.registry, + "model_config": self.config["model"], + "env_config": self.config["env_config"], + "policy_config": self.config, + }, num_workers=self.config["num_workers"], optimizer_config=self.config["optimizer"]) def _train(self): self.optimizer.step() - - episode_rewards = [] - episode_lengths = [] - metric_lists = [a.get_completed_rollout_metrics.remote() - for a in self.optimizer.remote_evaluators] - for metrics in metric_lists: - for episode in ray.get(metrics): - episode_lengths.append(episode.episode_length) - episode_rewards.append(episode.episode_reward) - avg_reward = np.mean(episode_rewards) - avg_length = np.mean(episode_lengths) - timesteps = np.sum(episode_lengths) - - result = TrainingResult( - episode_reward_mean=avg_reward, - episode_len_mean=avg_length, - timesteps_this_iter=timesteps, - info={}) - - return result - - def compute_action(self, obs): - action, info = self.optimizer.local_evaluator.policy.compute(obs) - return action + return collect_metrics( + self.optimizer.local_evaluator, self.optimizer.remote_evaluators) + + def compute_action(self, observation, state=None): + if state is None: + state = [] + return self.local_evaluator.for_policy( + lambda p: p.compute_single_action( + observation, state, is_training=False)[0]) diff --git a/python/ray/rllib/pg/pg_evaluator.py b/python/ray/rllib/pg/pg_evaluator.py deleted file mode 100644 index 1f217ba02855..000000000000 --- a/python/ray/rllib/pg/pg_evaluator.py +++ /dev/null @@ -1,56 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from ray.rllib.models.catalog import ModelCatalog -from ray.rllib.optimizers import PolicyEvaluator -from ray.rllib.pg.policy import PGPolicy -from ray.rllib.utils.filter import NoFilter -from ray.rllib.utils.process_rollout import process_rollout -from ray.rllib.utils.sampler import SyncSampler - - -class PGEvaluator(PolicyEvaluator): - """Evaluator for simple policy gradient.""" - - def __init__(self, registry, env_creator, config): - self.env = ModelCatalog.get_preprocessor_as_wrapper( - registry, env_creator(config["env_config"]), config["model"]) - self.config = config - - self.policy = PGPolicy(registry, self.env.observation_space, - self.env.action_space, config) - self.sampler = SyncSampler( - self.env, self.policy, NoFilter(), - config["batch_size"], horizon=config["horizon"]) - - def sample(self): - rollout = self.sampler.get_data() - samples = process_rollout( - rollout, NoFilter(), - gamma=self.config["gamma"], use_gae=False) - return samples - - def get_completed_rollout_metrics(self): - """Returns metrics on previously completed rollouts. - - Calling this clears the queue of completed rollout metrics. 
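A short usage sketch for the `SampleBatchBuilder` added in `optimizers/sample_batch.py` above: rows are appended one step at a time and `build()` stacks each column into a NumPy array (the values here are schematic):

import numpy as np
from ray.rllib.optimizers.sample_batch import SampleBatchBuilder

builder = SampleBatchBuilder()
for t in range(3):
    builder.add_values(
        obs=np.zeros(4), actions=t % 2, rewards=1.0, dones=(t == 2))
batch = builder.build()
# each column becomes a NumPy array, e.g. batch["obs"].shape == (3, 4)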
- """ - return self.sampler.get_metrics() - - def compute_gradients(self, samples): - """ Returns gradient w.r.t. samples.""" - gradient, info = self.policy.compute_gradients(samples) - return gradient, {} - - def apply_gradients(self, grads): - """Applies gradients to evaluator weights.""" - self.policy.apply_gradients(grads) - - def get_weights(self): - """Returns model weights.""" - return self.policy.get_weights() - - def set_weights(self, weights): - """Sets model weights.""" - return self.policy.set_weights(weights) diff --git a/python/ray/rllib/pg/pg_policy_graph.py b/python/ray/rllib/pg/pg_policy_graph.py new file mode 100644 index 000000000000..b605a513f39c --- /dev/null +++ b/python/ray/rllib/pg/pg_policy_graph.py @@ -0,0 +1,45 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +from ray.rllib.models.catalog import ModelCatalog +from ray.rllib.utils.process_rollout import compute_advantages +from ray.rllib.utils.tf_policy_graph import TFPolicyGraph + + +class PGPolicyGraph(TFPolicyGraph): + + def __init__(self, obs_space, action_space, registry, config): + self.config = config + + # setup policy + self.x = tf.placeholder(tf.float32, shape=[None]+list(obs_space.shape)) + dist_class, self.logit_dim = ModelCatalog.get_action_dist(action_space) + self.model = ModelCatalog.get_model( + registry, self.x, self.logit_dim, options=self.config["model"]) + self.dist = dist_class(self.model.outputs) # logit for each action + + # setup policy loss + self.ac = ModelCatalog.get_action_placeholder(action_space) + self.adv = tf.placeholder(tf.float32, [None], name="adv") + self.loss = -tf.reduce_mean(self.dist.logp(self.ac) * self.adv) + + # initialize TFPolicyGraph + self.sess = tf.get_default_session() + self.loss_in = [ + ("obs", self.x), + ("actions", self.ac), + ("advantages", self.adv), + ] + self.is_training = tf.placeholder_with_default(True, ()) + TFPolicyGraph.__init__( + self, self.sess, obs_input=self.x, + action_sampler=self.dist.sample(), loss=self.loss, + loss_inputs=self.loss_in, is_training=self.is_training) + self.sess.run(tf.global_variables_initializer()) + + def postprocess_trajectory(self, sample_batch, other_agent_batches=None): + return compute_advantages( + sample_batch, 0.0, self.config["gamma"], use_gae=False) diff --git a/python/ray/rllib/pg/policy.py b/python/ray/rllib/pg/policy.py deleted file mode 100644 index cc53eebcbd84..000000000000 --- a/python/ray/rllib/pg/policy.py +++ /dev/null @@ -1,82 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf - -import ray -from ray.rllib.models.catalog import ModelCatalog - - -class PGPolicy(): - - other_output = [] - is_recurrent = False - - def __init__(self, registry, ob_space, ac_space, config): - self.config = config - self.registry = registry - with tf.variable_scope("local"): - self._setup_graph(ob_space, ac_space) - print("Setting up loss") - self._setup_loss(ac_space) - self._setup_gradients() - self.initialize() - - def _setup_graph(self, ob_space, ac_space): - self.x = tf.placeholder(tf.float32, shape=[None]+list(ob_space.shape)) - dist_class, self.logit_dim = ModelCatalog.get_action_dist(ac_space) - self.model = ModelCatalog.get_model( - self.registry, self.x, self.logit_dim, - options=self.config["model"]) - self.action_logits = self.model.outputs # logit for each action - self.dist = dist_class(self.action_logits) - self.sample = 
self.dist.sample() - self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, - tf.get_variable_scope().name) - - def _setup_loss(self, action_space): - self.ac = ModelCatalog.get_action_placeholder(action_space) - self.adv = tf.placeholder(tf.float32, [None], name="adv") - - log_prob = self.dist.logp(self.ac) - - # policy loss - self.loss = -tf.reduce_mean(log_prob * self.adv) - - def _setup_gradients(self): - self.grads = tf.gradients(self.loss, self.var_list) - grads_and_vars = list(zip(self.grads, self.var_list)) - opt = tf.train.AdamOptimizer(self.config["lr"]) - self._apply_gradients = opt.apply_gradients(grads_and_vars) - - def initialize(self): - self.sess = tf.Session() - self.variables = ray.experimental.TensorFlowVariables( - self.loss, self.sess) - self.sess.run(tf.global_variables_initializer()) - - def compute_gradients(self, samples): - info = {} - feed_dict = { - self.x: samples["obs"], - self.ac: samples["actions"], - self.adv: samples["advantages"], - } - self.grads = [g for g in self.grads if g is not None] - grad = self.sess.run(self.grads, feed_dict=feed_dict) - return grad, info - - def apply_gradients(self, grads): - feed_dict = dict(zip(self.grads, grads)) - self.sess.run(self._apply_gradients, feed_dict=feed_dict) - - def get_weights(self): - return self.variables.get_weights() - - def set_weights(self, weights): - self.variables.set_weights(weights) - - def compute(self, ob, *args): - action = self.sess.run(self.sample, {self.x: [ob]}) - return action[0], {} diff --git a/python/ray/rllib/ppo/loss.py b/python/ray/rllib/ppo/loss.py index 3f69ff711692..f57502d18b40 100644 --- a/python/ray/rllib/ppo/loss.py +++ b/python/ray/rllib/ppo/loss.py @@ -7,7 +7,7 @@ from ray.rllib.models import ModelCatalog -class ProximalPolicyLoss(object): +class ProximalPolicyGraph(object): other_output = ["vf_preds", "logprobs"] is_recurrent = False @@ -82,11 +82,14 @@ def __init__( self.policy_results = [ self.sampler, self.curr_logits, tf.constant("NA")] - def compute(self, observation): + def compute_single_action(self, observation, features, is_training=False): action, logprobs, vf = self.sess.run( self.policy_results, feed_dict={self.observations: [observation]}) - return action[0], {"vf_preds": vf[0], "logprobs": logprobs[0]} + return action[0], [], {"vf_preds": vf[0], "logprobs": logprobs[0]} + + def get_initial_state(self): + return [] def loss(self): return self.loss diff --git a/python/ray/rllib/ppo/ppo.py b/python/ray/rllib/ppo/ppo.py index a8c695033e9a..2e82b5086535 100644 --- a/python/ray/rllib/ppo/ppo.py +++ b/python/ray/rllib/ppo/ppo.py @@ -172,7 +172,7 @@ def standardized(value): batch_index = 0 num_batches = ( int(tuples_per_device) // int(model.per_device_batch_size)) - loss, policy_loss, vf_loss, kl, entropy = [], [], [], [], [] + loss, policy_graph, vf_loss, kl, entropy = [], [], [], [], [] permutation = np.random.permutation(num_batches) # Prepare to drop into the debugger if self.iteration == config["tf_debug_iteration"]: @@ -181,26 +181,26 @@ def standardized(value): full_trace = ( i == 0 and self.iteration == 0 and batch_index == config["full_trace_nth_sgd_batch"]) - batch_loss, batch_policy_loss, batch_vf_loss, batch_kl, \ + batch_loss, batch_policy_graph, batch_vf_loss, batch_kl, \ batch_entropy = model.run_sgd_minibatch( permutation[batch_index] * model.per_device_batch_size, self.kl_coeff, full_trace, self.file_writer) loss.append(batch_loss) - policy_loss.append(batch_policy_loss) + policy_graph.append(batch_policy_graph) 
vf_loss.append(batch_vf_loss) kl.append(batch_kl) entropy.append(batch_entropy) batch_index += 1 loss = np.mean(loss) - policy_loss = np.mean(policy_loss) + policy_graph = np.mean(policy_graph) vf_loss = np.mean(vf_loss) kl = np.mean(kl) entropy = np.mean(entropy) sgd_end = time.time() print( "{:>15}{:15.5e}{:15.5e}{:15.5e}{:15.5e}{:15.5e}".format( - i, loss, policy_loss, vf_loss, kl, entropy)) + i, loss, policy_graph, vf_loss, kl, entropy)) values = [] if i == config["num_sgd_iter"] - 1: @@ -299,4 +299,5 @@ def _restore(self, checkpoint_path): def compute_action(self, observation): observation = self.local_evaluator.obs_filter( observation, update=False) - return self.local_evaluator.common_policy.compute(observation)[0] + return self.local_evaluator.common_policy.compute_single_action( + observation, [], False)[0] diff --git a/python/ray/rllib/ppo/ppo_evaluator.py b/python/ray/rllib/ppo/ppo_evaluator.py index 434feb094d7e..a8ca6e54ca92 100644 --- a/python/ray/rllib/ppo/ppo_evaluator.py +++ b/python/ray/rllib/ppo/ppo_evaluator.py @@ -16,8 +16,8 @@ from ray.rllib.models import ModelCatalog from ray.rllib.utils.sampler import SyncSampler from ray.rllib.utils.filter import get_filter, MeanStdFilter -from ray.rllib.utils.process_rollout import process_rollout -from ray.rllib.ppo.loss import ProximalPolicyLoss +from ray.rllib.utils.process_rollout import compute_advantages +from ray.rllib.ppo.loss import ProximalPolicyGraph # TODO(rliaw): Move this onto LocalMultiGPUOptimizer @@ -86,7 +86,7 @@ def __init__(self, registry, env_creator, config, logdir, is_remote): self.per_device_batch_size = int(self.batch_size / len(devices)) def build_loss(obs, vtargets, advs, acts, plog, pvf_preds): - return ProximalPolicyLoss( + return ProximalPolicyGraph( self.env.observation_space, self.env.action_space, obs, vtargets, advs, acts, plog, pvf_preds, self.logit_dim, self.kl_coeff, self.distribution_class, self.config, @@ -190,8 +190,9 @@ def sample(self): while num_steps_so_far < self.config["min_steps_per_task"]: rollout = self.sampler.get_data() - samples = process_rollout( - rollout, self.rew_filter, self.config["gamma"], + last_r = 0.0 # note: not needed since we don't truncate rollouts + samples = compute_advantages( + rollout, last_r, self.config["gamma"], self.config["lambda"], use_gae=self.config["use_gae"]) num_steps_so_far += samples.count all_samples.append(samples) diff --git a/python/ray/rllib/test/test_checkpoint_restore.py b/python/ray/rllib/test/test_checkpoint_restore.py index 9e583c877bb9..68eeb27ea19e 100644 --- a/python/ray/rllib/test/test_checkpoint_restore.py +++ b/python/ray/rllib/test/test_checkpoint_restore.py @@ -17,18 +17,19 @@ def get_mean_action(alg, obs): return np.mean(out) -ray.init() +ray.init(num_cpus=10) CONFIGS = { - "ES": {"episodes_per_batch": 10, "timesteps_per_batch": 100}, + "ES": {"episodes_per_batch": 10, "timesteps_per_batch": 100, + "num_workers": 2}, "DQN": {}, - "DDPG": {"noise_scale": 0.0}, - "PPO": {"num_sgd_iter": 5, "timesteps_per_batch": 1000}, - "A3C": {"use_lstm": False}, + "DDPG": {"noise_scale": 0.0, "timesteps_per_iteration": 100}, + "PPO": {"num_sgd_iter": 5, "timesteps_per_batch": 1000, "num_workers": 2}, + "A3C": {"use_lstm": False, "num_workers": 1}, } -def test(use_object_store, alg_name): +def test(use_object_store, alg_name, failures): cls = get_agent_class(alg_name) if alg_name == "DDPG": alg1 = cls(config=CONFIGS[name], env="Pendulum-v0") @@ -55,12 +56,15 @@ def test(use_object_store, alg_name): a1 = get_mean_action(alg1, obs) a2 = 
get_mean_action(alg2, obs) print("Checking computed actions", alg1, obs, a1, a2) - assert abs(a1 - a2) < .1, (a1, a2) + if abs(a1 - a2) > .1: + failures.append((alg_name, [a1, a2])) if __name__ == "__main__": + failures = [] for use_object_store in [False, True]: for name in ["ES", "DQN", "DDPG", "PPO", "A3C"]: - test(use_object_store, name) + test(use_object_store, name, failures) + assert not failures, failures print("All checkpoint restore tests passed!") diff --git a/python/ray/rllib/test/test_common_policy_evaluator.py b/python/ray/rllib/test/test_common_policy_evaluator.py new file mode 100644 index 000000000000..9e70d2f812ca --- /dev/null +++ b/python/ray/rllib/test/test_common_policy_evaluator.py @@ -0,0 +1,133 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import gym +import time +import unittest + +import ray +from ray.rllib.utils.common_policy_evaluator import CommonPolicyEvaluator +from ray.rllib.utils.policy_graph import PolicyGraph +from ray.rllib.utils.process_rollout import compute_advantages + + +class MockPolicyGraph(PolicyGraph): + def compute_actions(self, obs_batch, state_batches, is_training=False): + return [0] * len(obs_batch), [], {} + + def postprocess_trajectory(self, batch): + return compute_advantages(batch, 100.0, 0.9, use_gae=False) + + +class TestCommonPolicyEvaluator(unittest.TestCase): + def testBasic(self): + ev = CommonPolicyEvaluator( + env_creator=lambda _: gym.make("CartPole-v0"), + policy_graph=MockPolicyGraph) + batch = ev.sample() + for key in ["obs", "actions", "rewards", "dones", "advantages"]: + self.assertIn(key, batch) + self.assertGreater(batch["advantages"][0], 1) + + def testPackEpisodes(self): + for batch_size in [1, 10, 100, 1000]: + ev = CommonPolicyEvaluator( + env_creator=lambda _: gym.make("CartPole-v0"), + policy_graph=MockPolicyGraph, + batch_steps=batch_size, + batch_mode="pack_episodes") + batch = ev.sample() + self.assertEqual(batch.count, batch_size) + + def testTruncateEpisodes(self): + ev = CommonPolicyEvaluator( + env_creator=lambda _: gym.make("CartPole-v0"), + policy_graph=MockPolicyGraph, + batch_steps=2, + batch_mode="truncate_episodes") + batch = ev.sample() + self.assertEqual(batch.count, 2) + ev = CommonPolicyEvaluator( + env_creator=lambda _: gym.make("CartPole-v0"), + policy_graph=MockPolicyGraph, + batch_steps=1000, + batch_mode="truncate_episodes") + self.assertLess(batch.count, 200) + + def testCompleteEpisodes(self): + ev = CommonPolicyEvaluator( + env_creator=lambda _: gym.make("CartPole-v0"), + policy_graph=MockPolicyGraph, + batch_steps=2, + batch_mode="complete_episodes") + batch = ev.sample() + self.assertGreater(batch.count, 2) + self.assertTrue(batch["dones"][-1]) + batch = ev.sample() + self.assertGreater(batch.count, 2) + self.assertTrue(batch["dones"][-1]) + + def testFilterSync(self): + ev = CommonPolicyEvaluator( + env_creator=lambda _: gym.make("CartPole-v0"), + policy_graph=MockPolicyGraph, + sample_async=True, + observation_filter="ConcurrentMeanStdFilter") + time.sleep(2) + ev.sample() + filters = ev.get_filters(flush_after=True) + obs_f = filters["obs_filter"] + self.assertNotEqual(obs_f.rs.n, 0) + self.assertNotEqual(obs_f.buffer.n, 0) + + def testGetFilters(self): + ev = CommonPolicyEvaluator( + env_creator=lambda _: gym.make("CartPole-v0"), + policy_graph=MockPolicyGraph, + sample_async=True, + observation_filter="ConcurrentMeanStdFilter") + self.sample_and_flush(ev) + filters = ev.get_filters(flush_after=False) + 
time.sleep(2) + filters2 = ev.get_filters(flush_after=False) + obs_f = filters["obs_filter"] + obs_f2 = filters2["obs_filter"] + self.assertGreaterEqual(obs_f2.rs.n, obs_f.rs.n) + self.assertGreaterEqual(obs_f2.buffer.n, obs_f.buffer.n) + + def testSyncFilter(self): + ev = CommonPolicyEvaluator( + env_creator=lambda _: gym.make("CartPole-v0"), + policy_graph=MockPolicyGraph, + sample_async=True, + observation_filter="ConcurrentMeanStdFilter") + obs_f = self.sample_and_flush(ev) + + # Current State + filters = ev.get_filters(flush_after=False) + obs_f = filters["obs_filter"] + + self.assertLessEqual(obs_f.buffer.n, 20) + + new_obsf = obs_f.copy() + new_obsf.rs._n = 100 + ev.sync_filters({"obs_filter": new_obsf}) + filters = ev.get_filters(flush_after=False) + obs_f = filters["obs_filter"] + self.assertGreaterEqual(obs_f.rs.n, 100) + self.assertLessEqual(obs_f.buffer.n, 20) + + def sample_and_flush(self, ev): + time.sleep(2) + ev.sample() + filters = ev.get_filters(flush_after=True) + obs_f = filters["obs_filter"] + self.assertNotEqual(obs_f.rs.n, 0) + self.assertNotEqual(obs_f.buffer.n, 0) + return obs_f + + +if __name__ == '__main__': + ray.init() + unittest.main(verbosity=2) diff --git a/python/ray/rllib/test/test_evaluators.py b/python/ray/rllib/test/test_evaluators.py index 29c054a0d418..d2abf1e6d65f 100644 --- a/python/ray/rllib/test/test_evaluators.py +++ b/python/ray/rllib/test/test_evaluators.py @@ -3,19 +3,11 @@ from __future__ import print_function import unittest -import gym -import shutil -import tempfile -import time -import ray -from ray.rllib.a3c import DEFAULT_CONFIG -from ray.rllib.a3c.a3c_evaluator import A3CEvaluator -from ray.rllib.dqn.dqn_evaluator import adjust_nstep -from ray.tune.registry import get_registry +from ray.rllib.dqn.dqn_policy_graph import adjust_nstep -class DQNEvaluatorTest(unittest.TestCase): +class DQNTest(unittest.TestCase): def testNStep(self): obs = [1, 2, 3, 4, 5, 6, 7] actions = ["a", "b", "a", "a", "a", "b", "a"] @@ -30,70 +22,5 @@ def testNStep(self): self.assertEqual(dones, [1, 0, 0, 0, 0]) -class A3CEvaluatorTest(unittest.TestCase): - - def setUp(self): - ray.init(num_cpus=1) - config = DEFAULT_CONFIG.copy() - config["num_workers"] = 1 - config["observation_filter"] = "ConcurrentMeanStdFilter" - config["reward_filter"] = "MeanStdFilter" - config["batch_size"] = 2 - self._temp_dir = tempfile.mkdtemp("a3c_evaluator_test") - self.e = A3CEvaluator( - get_registry(), - lambda config: gym.make("CartPole-v0"), - config, - logdir=self._temp_dir) - - def tearDown(self): - ray.worker.cleanup() - shutil.rmtree(self._temp_dir) - - def sample_and_flush(self): - e = self.e - time.sleep(2) - self.e.sample() - filters = e.get_filters(flush_after=True) - obs_f = filters["obs_filter"] - rew_f = filters["rew_filter"] - self.assertNotEqual(obs_f.rs.n, 0) - self.assertNotEqual(obs_f.buffer.n, 0) - self.assertNotEqual(rew_f.rs.n, 0) - self.assertNotEqual(rew_f.buffer.n, 0) - return obs_f, rew_f - - def testGetFilters(self): - """Show `flush_after=False` provides does not affect the buffer.""" - e = self.e - self.sample_and_flush() - filters = e.get_filters(flush_after=False) - obs_f = filters["obs_filter"] - filters2 = e.get_filters(flush_after=False) - obs_f2 = filters2["obs_filter"] - self.assertGreaterEqual(obs_f2.rs.n, obs_f.rs.n) - self.assertGreaterEqual(obs_f2.buffer.n, obs_f.buffer.n) - - def testSyncFilter(self): - """Show that sync_filters rebases own buffer over input""" - e = self.e - obs_f, _ = self.sample_and_flush() - - # Current State - filters = 
e.get_filters(flush_after=False) - obs_f = filters["obs_filter"] - rew_f = filters["rew_filter"] - - self.assertLessEqual(obs_f.buffer.n, 20) - - new_obsf = obs_f.copy() - new_obsf.rs._n = 100 - e.sync_filters({"obs_filter": new_obsf, "rew_filter": rew_f}) - filters = e.get_filters(flush_after=False) - obs_f = filters["obs_filter"] - self.assertGreaterEqual(obs_f.rs.n, 100) - self.assertLessEqual(obs_f.buffer.n, 20) - - if __name__ == '__main__': unittest.main(verbosity=2) diff --git a/python/ray/rllib/test/test_supported_spaces.py b/python/ray/rllib/test/test_supported_spaces.py index 2e41c85a0233..bf3124002a9f 100644 --- a/python/ray/rllib/test/test_supported_spaces.py +++ b/python/ray/rllib/test/test_supported_spaces.py @@ -36,32 +36,6 @@ Box(0.0, 1.0, (5,), dtype=np.float32)]), } -# (alg, action_space, obs_space) -KNOWN_FAILURES = [ - # TODO(ekl) multiagent support for a3c - ("A3C", "implicit_tuple", "atari"), - ("A3C", "implicit_tuple", "atari_ram"), - ("A3C", "implicit_tuple", "discrete"), - ("A3C", "implicit_tuple", "image"), - ("A3C", "implicit_tuple", "mixed_tuple"), - ("A3C", "implicit_tuple", "simple_tuple"), - ("A3C", "implicit_tuple", "vector"), - ("A3C", "mixed_tuple", "atari"), - ("A3C", "mixed_tuple", "atari_ram"), - ("A3C", "mixed_tuple", "discrete"), - ("A3C", "mixed_tuple", "image"), - ("A3C", "mixed_tuple", "mixed_tuple"), - ("A3C", "mixed_tuple", "simple_tuple"), - ("A3C", "mixed_tuple", "vector"), - ("A3C", "simple_tuple", "atari"), - ("A3C", "simple_tuple", "atari_ram"), - ("A3C", "simple_tuple", "discrete"), - ("A3C", "simple_tuple", "image"), - ("A3C", "simple_tuple", "mixed_tuple"), - ("A3C", "simple_tuple", "simple_tuple"), - ("A3C", "simple_tuple", "vector"), -] - def make_stub_env(action_space, obs_space): class StubEnv(gym.Env): @@ -135,19 +109,13 @@ def testAll(self): {"num_workers": 1, "optimizer": {}}, stats) num_unexpected_errors = 0 - num_unexpected_success = 0 for (alg, a_name, o_name), stat in sorted(stats.items()): - if stat in ["ok", "unsupported"]: - if (alg, a_name, o_name) in KNOWN_FAILURES: - num_unexpected_success += 1 - else: - if (alg, a_name, o_name) not in KNOWN_FAILURES: - num_unexpected_errors += 1 + if stat not in ["ok", "unsupported"]: + num_unexpected_errors += 1 print( alg, "action_space", a_name, "obs_space", o_name, "result", stat) self.assertEqual(num_unexpected_errors, 0) - self.assertEqual(num_unexpected_success, 0) if __name__ == "__main__": diff --git a/python/ray/rllib/tuned_examples/mountaincarcontinuous-ddpg.yaml b/python/ray/rllib/tuned_examples/mountaincarcontinuous-ddpg.yaml index 0a330bb5b57b..7c0f660fd8a6 100644 --- a/python/ray/rllib/tuned_examples/mountaincarcontinuous-ddpg.yaml +++ b/python/ray/rllib/tuned_examples/mountaincarcontinuous-ddpg.yaml @@ -13,7 +13,6 @@ mountaincarcontinuous-ddpg: tau: 0.01 l2_reg: 0.00001 buffer_size: 50000 - random_starts: False clip_rewards: False learning_starts: 1000 #model: diff --git a/python/ray/rllib/tuned_examples/pendulum-ddpg.yaml b/python/ray/rllib/tuned_examples/pendulum-ddpg.yaml index 2166989d8080..baccb42b8b87 100644 --- a/python/ray/rllib/tuned_examples/pendulum-ddpg.yaml +++ b/python/ray/rllib/tuned_examples/pendulum-ddpg.yaml @@ -6,6 +6,5 @@ pendulum-ddpg: episode_reward_mean: -160 config: use_huber: True - random_starts: False clip_rewards: False exploration_fraction: 0.1 diff --git a/python/ray/rllib/tuned_examples/regression_tests/cartpole-a3c-pytorch.yaml b/python/ray/rllib/tuned_examples/regression_tests/cartpole-a3c-pytorch.yaml new file mode 100644 index 
000000000000..a25da3c7769a --- /dev/null +++ b/python/ray/rllib/tuned_examples/regression_tests/cartpole-a3c-pytorch.yaml @@ -0,0 +1,10 @@ +cartpole-a3c: + env: CartPole-v0 + run: A3C + stop: + episode_reward_mean: 200 + time_total_s: 600 + config: + num_workers: 1 + gamma: 0.95 + use_pytorch: true diff --git a/python/ray/rllib/tuned_examples/regression_tests/cartpole-a3c.yaml b/python/ray/rllib/tuned_examples/regression_tests/cartpole-a3c.yaml index 6850a665e4f2..f20ea73c3b68 100644 --- a/python/ray/rllib/tuned_examples/regression_tests/cartpole-a3c.yaml +++ b/python/ray/rllib/tuned_examples/regression_tests/cartpole-a3c.yaml @@ -5,5 +5,5 @@ cartpole-a3c: episode_reward_mean: 200 time_total_s: 600 config: - num_workers: 4 + num_workers: 1 gamma: 0.95 diff --git a/python/ray/rllib/tuned_examples/regression_tests/cartpole-dqn.yaml b/python/ray/rllib/tuned_examples/regression_tests/cartpole-dqn.yaml index 7efde08f8f2f..04aa2dc6edcc 100644 --- a/python/ray/rllib/tuned_examples/regression_tests/cartpole-dqn.yaml +++ b/python/ray/rllib/tuned_examples/regression_tests/cartpole-dqn.yaml @@ -7,4 +7,3 @@ cartpole-dqn: config: n_step: 3 gamma: 0.95 - smoothing_num_episodes: 10 diff --git a/python/ray/rllib/tuned_examples/regression_tests/cartpole-pg.yaml b/python/ray/rllib/tuned_examples/regression_tests/cartpole-pg.yaml new file mode 100644 index 000000000000..2bf9e7548b86 --- /dev/null +++ b/python/ray/rllib/tuned_examples/regression_tests/cartpole-pg.yaml @@ -0,0 +1,8 @@ +cartpole-pg: + env: CartPole-v0 + run: PG + stop: + episode_reward_mean: 200 + time_total_s: 300 + config: + num_workers: 1 diff --git a/python/ray/rllib/tuned_examples/regression_tests/pendulum-ddpg.yaml b/python/ray/rllib/tuned_examples/regression_tests/pendulum-ddpg.yaml index 840f6d963c4e..124f756ecc1c 100644 --- a/python/ray/rllib/tuned_examples/regression_tests/pendulum-ddpg.yaml +++ b/python/ray/rllib/tuned_examples/regression_tests/pendulum-ddpg.yaml @@ -6,7 +6,5 @@ pendulum-ddpg: time_total_s: 900 config: use_huber: True - random_starts: False clip_rewards: False exploration_fraction: 0.1 - smoothing_num_episodes: 10 diff --git a/python/ray/rllib/utils/common_policy_evaluator.py b/python/ray/rllib/utils/common_policy_evaluator.py new file mode 100644 index 000000000000..d86c508cde4f --- /dev/null +++ b/python/ray/rllib/utils/common_policy_evaluator.py @@ -0,0 +1,278 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import pickle +import numpy as np +import tensorflow as tf + +import ray +from ray.rllib.models import ModelCatalog +from ray.rllib.optimizers.policy_evaluator import PolicyEvaluator +from ray.rllib.utils.atari_wrappers import wrap_deepmind +from ray.rllib.utils.compression import pack +from ray.rllib.utils.filter import get_filter +from ray.rllib.utils.sampler import AsyncSampler, SyncSampler +from ray.rllib.utils.tf_policy_graph import TFPolicyGraph +from ray.tune.registry import get_registry +from ray.tune.result import TrainingResult + + +def collect_metrics(local_evaluator, remote_evaluators): + """Gathers episode metrics from CommonPolicyEvaluator instances.""" + + episode_rewards = [] + episode_lengths = [] + metric_lists = ray.get( + [a.apply.remote(lambda ev: ev.sampler.get_metrics()) + for a in remote_evaluators]) + metric_lists.append(local_evaluator.sampler.get_metrics()) + for metrics in metric_lists: + for episode in metrics: + episode_lengths.append(episode.episode_length) + episode_rewards.append(episode.episode_reward) + if 
episode_rewards: + min_reward = min(episode_rewards) + max_reward = max(episode_rewards) + else: + min_reward = float('nan') + max_reward = float('nan') + avg_reward = np.mean(episode_rewards) + avg_length = np.mean(episode_lengths) + timesteps = np.sum(episode_lengths) + + return TrainingResult( + episode_reward_max=max_reward, + episode_reward_min=min_reward, + episode_reward_mean=avg_reward, + episode_len_mean=avg_length, + episodes_total=len(episode_lengths), + timesteps_this_iter=timesteps) + + +class CommonPolicyEvaluator(PolicyEvaluator): + """Policy evaluator implementation that operates on a rllib.PolicyGraph. + + TODO: vector env + TODO: multi-agent + TODO: consumer buffering for multi-agent + TODO: complete episode batch mode + + Examples: + # Create a policy evaluator and using it to collect experiences. + >>> evaluator = CommonPolicyEvaluator( + env_creator=lambda _: gym.make("CartPole-v0"), + policy_graph=PGPolicyGraph) + >>> print(evaluator.sample().keys()) + {"obs": [[...]], "actions": [[...]], "rewards": [[...]], + "dones": [[...]], "new_obs": [[...]]} + + # Creating policy evaluators using optimizer_cls.make(). + >>> optimizer = LocalSyncOptimizer.make( + evaluator_cls=CommonPolicyEvaluator, + evaluator_args={ + "env_creator": lambda _: gym.make("CartPole-v0"), + "policy_graph": PGPolicyGraph, + }, + num_workers=10) + >>> for _ in range(10): optimizer.step() + """ + + @classmethod + def as_remote(cls, num_cpus=None, num_gpus=None): + return ray.remote(num_cpus=num_cpus, num_gpus=num_gpus)(cls) + + def __init__( + self, + env_creator, + policy_graph, + tf_session_creator=None, + batch_steps=100, + batch_mode="truncate_episodes", + preprocessor_pref="rllib", + sample_async=False, + compress_observations=False, + observation_filter="NoFilter", + registry=None, + env_config=None, + model_config=None, + policy_config=None): + """Initialize a policy evaluator. + + Arguments: + env_creator (func): Function that returns a gym.Env given an + env config dict. + policy_graph (class): A class implementing rllib.PolicyGraph or + rllib.TFPolicyGraph. + tf_session_creator (func): A function that returns a TF session. + This is optional and only useful with TFPolicyGraph. + batch_steps (int): The target number of env transitions to include + in each sample batch returned from this evaluator. + batch_mode (str): One of the following choices: + complete_episodes: each batch will be at least batch_steps + in size, and will include one or more complete episodes. + truncate_episodes: each batch will be around batch_steps + in size, and include transitions from one episode only. + pack_episodes: each batch will be exactly batch_steps in + size, and may include transitions from multiple episodes. + preprocessor_pref (str): Whether to prefer RLlib preprocessors + ("rllib") or deepmind ("deepmind") when applicable. + sample_async (bool): Whether to compute samples asynchronously in + the background, which improves throughput but can cause samples + to be slightly off-policy. + compress_observations (bool): If true, compress the observations + returned. + observation_filter (str): Name of observation filter to use. + registry (tune.Registry): User-registered objects. Pass in the + value from tune.registry.get_registry() if you're having + trouble resolving things like custom envs. + env_config (dict): Config to pass to the env creator. + model_config (dict): Config to use when creating the policy model. + policy_config (dict): Config to pass to the policy. 
+ """ + + registry = registry or get_registry() + env_config = env_config or {} + policy_config = policy_config or {} + model_config = model_config or {} + + assert batch_mode in [ + "complete_episodes", "truncate_episodes", "pack_episodes"] + self.env_creator = env_creator + self.policy_graph = policy_graph + self.batch_steps = batch_steps + self.batch_mode = batch_mode + self.compress_observations = compress_observations + + self.env = env_creator(env_config) + is_atari = hasattr(self.env.unwrapped, "ale") + if is_atari and "custom_preprocessor" not in model_config and \ + preprocessor_pref == "deepmind": + self.env = wrap_deepmind(self.env, dim=model_config.get("dim", 80)) + else: + self.env = ModelCatalog.get_preprocessor_as_wrapper( + registry, self.env, model_config) + + self.vectorized = hasattr(self.env, "vector_reset") + self.policy_map = {} + + if issubclass(policy_graph, TFPolicyGraph): + with tf.Graph().as_default(): + if tf_session_creator: + self.sess = tf_session_creator() + else: + self.sess = tf.Session(config=tf.ConfigProto( + gpu_options=tf.GPUOptions(allow_growth=True))) + with self.sess.as_default(): + policy = policy_graph( + self.env.observation_space, self.env.action_space, + registry, policy_config) + else: + policy = policy_graph( + self.env.observation_space, self.env.action_space, + registry, policy_config) + self.policy_map = { + "default": policy + } + + self.obs_filter = get_filter( + observation_filter, self.env.observation_space.shape) + self.filters = {"obs_filter": self.obs_filter} + + if self.vectorized: + raise NotImplementedError("Vector envs not yet supported") + else: + if batch_mode not in [ + "pack_episodes", "truncate_episodes", "complete_episodes"]: + raise NotImplementedError("Batch mode not yet supported") + pack = batch_mode == "pack_episodes" + if batch_mode == "complete_episodes": + batch_steps = 999999 + if sample_async: + self.sampler = AsyncSampler( + self.env, self.policy_map["default"], self.obs_filter, + batch_steps, pack=pack) + self.sampler.start() + else: + self.sampler = SyncSampler( + self.env, self.policy_map["default"], self.obs_filter, + batch_steps, pack=pack) + + def sample(self): + """Evaluate the current policies and return a batch of experiences. + + Return: + SampleBatch from evaluating the current policies. + """ + + batch = self.policy_map["default"].postprocess_trajectory( + self.sampler.get_data()) + + if self.compress_observations: + batch["obs"] = [pack(o) for o in batch["obs"]] + batch["new_obs"] = [pack(o) for o in batch["new_obs"]] + + return batch + + def apply(self, func): + """Apply the given function to this evaluator instance.""" + + return func(self) + + def for_policy(self, func): + """Apply the given function to this evaluator's default policy.""" + + return func(self.policy_map["default"]) + + def sync_filters(self, new_filters): + """Changes self's filter to given and rebases any accumulated delta. + + Args: + new_filters (dict): Filters with new state to update local copy. + """ + assert all(k in new_filters for k in self.filters) + for k in self.filters: + self.filters[k].sync(new_filters[k]) + + def get_filters(self, flush_after=False): + """Returns a snapshot of filters. + + Args: + flush_after (bool): Clears the filter buffer state. 
+ + Returns: + return_filters (dict): Dict for serializable filters + """ + return_filters = {} + for k, f in self.filters.items(): + return_filters[k] = f.as_serializable() + if flush_after: + f.clear_buffer() + return return_filters + + def get_weights(self): + return self.policy_map["default"].get_weights() + + def set_weights(self, weights): + return self.policy_map["default"].set_weights(weights) + + def compute_gradients(self, samples): + return self.policy_map["default"].compute_gradients(samples) + + def apply_gradients(self, grads): + return self.policy_map["default"].apply_gradients(grads) + + def compute_apply(self, samples): + grad_fetch, apply_fetch = self.policy_map["default"].compute_apply( + samples) + return grad_fetch + + def save(self): + filters = self.get_filters(flush_after=True) + state = self.policy_map["default"].get_state() + return pickle.dumps({"filters": filters, "state": state}) + + def restore(self, objs): + objs = pickle.loads(objs) + self.sync_filters(objs["filters"]) + self.policy_map["default"].set_state(objs["state"]) diff --git a/python/ray/rllib/utils/policy_graph.py b/python/ray/rllib/utils/policy_graph.py new file mode 100644 index 000000000000..ec78e1e5b7f8 --- /dev/null +++ b/python/ray/rllib/utils/policy_graph.py @@ -0,0 +1,132 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + + +class PolicyGraph(object): + """An agent policy and loss, i.e., a TFPolicyGraph or other subclass. + + This object defines how to act in the environment, and also losses used to + improve the policy based on its experiences. Note that both policy and + loss are defined together for convenience, though the policy itself is + logically separate. + + All policies can directly extend PolicyGraph, however TensorFlow users may + find TFPolicyGraph simpler to implement. TFPolicyGraph also enables RLlib + to apply TensorFlow-specific optimizations such as fusing multiple policy + graphs and multi-GPU support. + """ + + def __init__(self, registry, observation_space, action_space, config): + """Initialize the graph. + + Args: + registry (obj): Object registry for user-defined envs, models, etc. + observation_space (gym.Space): Observation space of the env. + action_space (gym.Space): Action space of the env. + config (dict): Policy-specific configuration data. + """ + pass + + def compute_actions(self, obs_batch, state_batches, is_training=False): + """Compute actions for the current policy. + + Arguments: + obs_batch (np.ndarray): batch of observations + state_batches (list): list of RNN state input batches, if any + is_training (bool): whether we are training the policy + + Returns: + actions (np.ndarray): batch of output actions, with shape like + [BATCH_SIZE, ACTION_SHAPE]. + state_outs (list): list of RNN state output batches, if any, with + shape like [STATE_SIZE, BATCH_SIZE]. + info (dict): dictionary of extra feature batches, if any, with + shape like {"f1": [BATCH_SIZE, ...], "f2": [BATCH_SIZE, ...]}. + """ + raise NotImplementedError + + def compute_single_action(self, obs, state, is_training=False): + """Unbatched version of compute_actions. 
+ + Arguments: + obs (obj): single observation + state_batches (list): list of RNN state inputs, if any + is_training (bool): whether we are training the policy + + Returns: + actions (obj): single action + state_outs (list): list of RNN state outputs, if any + info (dict): dictionary of extra features, if any + """ + + [action], state_out, info = self.compute_actions( + [obs], [[s] for s in state], is_training) + return action, [s[0] for s in state_out], \ + {k: v[0] for k, v in info.items()} + + def postprocess_trajectory(self, sample_batch, other_agent_batches=None): + """Implements algorithm-specific trajectory postprocessing. + + Arguments: + sample_batch (SampleBatch): batch of experiences for the policy + other_agent_batches (dict): In a multi-agent env, this contains the + experience batches seen by other agents. + + Returns: + SampleBatch: postprocessed sample batch. + """ + return sample_batch + + def compute_gradients(self, postprocessed_batch): + """Computes gradients against a batch of experiences. + + Returns: + grads (list): List of gradient output values + info (dict): Extra policy-specific values + """ + raise NotImplementedError + + def apply_gradients(self, gradients): + """Applies previously computed gradients. + + Returns: + info (dict): Extra policy-specific values + """ + raise NotImplementedError + + def get_weights(self): + """Returns model weights. + + Returns: + weights (obj): Serializable copy or view of model weights + """ + raise NotImplementedError + + def set_weights(self, weights): + """Sets model weights. + + Arguments: + weights (obj): Serializable copy or view of model weights + """ + raise NotImplementedError + + def get_initial_state(self): + """Returns initial RNN state for the current policy.""" + return [] + + def get_state(self): + """Saves all local state. + + Returns: + state (obj): Serialized local state. + """ + return self.get_weights() + + def set_state(self, state): + """Restores all local state. + + Arguments: + state (obj): Serialized local state. + """ + self.set_weights(state) diff --git a/python/ray/rllib/utils/process_rollout.py b/python/ray/rllib/utils/process_rollout.py index b2d52fddabb3..ed7088bb18d8 100644 --- a/python/ray/rllib/utils/process_rollout.py +++ b/python/ray/rllib/utils/process_rollout.py @@ -11,12 +11,12 @@ def discount(x, gamma): return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1] -def process_rollout(rollout, reward_filter, gamma, lambda_=1.0, use_gae=True): +def compute_advantages(rollout, last_r, gamma, lambda_=1.0, use_gae=True): """Given a rollout, compute its value targets and the advantage. Args: rollout (PartialRollout): Partial Rollout Object - reward_filter (Filter): Filter for processing advantanges + last_r (float): Value estimation for last observation gamma (float): Parameter for GAE lambda_ (float): Parameter for GAE use_gae (bool): Using Generalized Advantage Estamation @@ -32,21 +32,17 @@ def process_rollout(rollout, reward_filter, gamma, lambda_=1.0, use_gae=True): if use_gae: assert "vf_preds" in rollout, "Values not found!" 
- vpred_t = np.stack(rollout["vf_preds"] + - [np.array(rollout.last_r)]).squeeze() + vpred_t = np.concatenate([rollout["vf_preds"], np.array([last_r])]) delta_t = traj["rewards"] + gamma * vpred_t[1:] - vpred_t[:-1] # This formula for the advantage comes # "Generalized Advantage Estimation": https://arxiv.org/abs/1506.02438 traj["advantages"] = discount(delta_t, gamma * lambda_) traj["value_targets"] = traj["advantages"] + traj["vf_preds"] else: - rewards_plus_v = np.stack(rollout["rewards"] + - [np.array(rollout.last_r)]).squeeze() + rewards_plus_v = np.concatenate( + [rollout["rewards"], np.array([last_r])]) traj["advantages"] = discount(rewards_plus_v, gamma)[:-1] - for i in range(traj["advantages"].shape[0]): - traj["advantages"][i] = reward_filter(traj["advantages"][i]) - traj["advantages"] = traj["advantages"].copy() assert all(val.shape[0] == trajsize for val in traj.values()), \ diff --git a/python/ray/rllib/utils/sampler.py b/python/ray/rllib/utils/sampler.py index 242464dbfe07..4b233adaf298 100644 --- a/python/ray/rllib/utils/sampler.py +++ b/python/ray/rllib/utils/sampler.py @@ -2,80 +2,12 @@ from __future__ import division from __future__ import print_function -import six.moves.queue as queue -import threading from collections import namedtuple import numpy as np +import six.moves.queue as queue +import threading - -class PartialRollout(object): - """A piece of a complete rollout. - - We run our agent, and process its experience once it has processed enough - steps. - - Attributes: - data (dict): Stores rollout data. All numpy arrays other than - `observations` and `features` will be squeezed. - last_r (float): Value of next state. Used for bootstrapping. - """ - - fields = ["obs", "actions", "rewards", "new_obs", "dones", "features"] - - def __init__(self, extra_fields=None): - """Initializers internals. Maintains a `last_r` field - in support of partial rollouts, used in bootstrapping advantage - estimation. - - Args: - extra_fields: Optional field for object to keep track. - """ - if extra_fields: - self.fields.extend(extra_fields) - self.data = {k: [] for k in self.fields} - self.last_r = 0.0 - - def add(self, **kwargs): - for k, v in kwargs.items(): - self.data[k] += [v] - - def extend(self, other_rollout): - """Extends internal data structure. Assumes other_rollout contains - data that occured afterwards.""" - - assert not self.is_terminal() - assert all(k in other_rollout.fields for k in self.fields) - for k, v in other_rollout.data.items(): - self.data[k].extend(v) - self.last_r = other_rollout.last_r - - def is_terminal(self): - """Check if terminal. 
- - Returns: - terminal (bool): if rollout has terminated.""" - return self.data["dones"][-1] - - def __getitem__(self, key): - return self.data[key] - - def __setitem__(self, key, item): - self.data[key] = item - - def keys(self): - return self.data.keys() - - def items(self): - return self.data.items() - - def __iter__(self): - return self.data.__iter__() - - def __next__(self): - return self.data.__next__() - - def __contains__(self, x): - return x in self.data +from ray.rllib.optimizers.sample_batch import SampleBatchBuilder CompletedRollout = namedtuple("CompletedRollout", @@ -92,7 +24,9 @@ class SyncSampler(object): thread.""" _async = False - def __init__(self, env, policy, obs_filter, num_local_steps, horizon=None): + def __init__( + self, env, policy, obs_filter, num_local_steps, horizon=None, + pack=False): self.num_local_steps = num_local_steps self.horizon = horizon self.env = env @@ -100,7 +34,7 @@ def __init__(self, env, policy, obs_filter, num_local_steps, horizon=None): self._obs_filter = obs_filter self.rollout_provider = _env_runner(self.env, self.policy, self.num_local_steps, self.horizon, - self._obs_filter) + self._obs_filter, pack) self.metrics_queue = queue.Queue() def get_data(self): @@ -128,7 +62,9 @@ class AsyncSampler(threading.Thread): accumulate and the gradient can be calculated on up to 5 batches.""" _async = True - def __init__(self, env, policy, obs_filter, num_local_steps, horizon=None): + def __init__( + self, env, policy, obs_filter, num_local_steps, horizon=None, + pack=False): assert getattr( obs_filter, "is_concurrent", False), ("Observation Filter must support concurrent updates.") @@ -142,6 +78,7 @@ def __init__(self, env, policy, obs_filter, num_local_steps, horizon=None): self._obs_filter = obs_filter self.started = False self.daemon = True + self.pack = pack def run(self): self.started = True @@ -154,7 +91,7 @@ def run(self): def _run(self): rollout_provider = _env_runner(self.env, self.policy, self.num_local_steps, self.horizon, - self._obs_filter) + self._obs_filter, self.pack) while True: # The timeout variable exists because apparently, if one worker # dies, the other workers won't die with it, unless the timeout is @@ -169,18 +106,18 @@ def get_data(self): """Gets currently accumulated data. Returns: - rollout (PartialRollout): trajectory data (unprocessed) + rollout (SampleBatch): trajectory data (unprocessed) """ assert self.started, "Sampler never started running!" rollout = self.queue.get(timeout=600.0) if isinstance(rollout, BaseException): raise rollout - while not rollout.is_terminal(): + while not rollout["dones"][-1]: try: part = self.queue.get_nowait() if isinstance(part, BaseException): raise rollout - rollout.extend(part) + rollout = rollout.concat(part) except queue.Empty: break return rollout @@ -195,7 +132,7 @@ def get_metrics(self): return completed -def _env_runner(env, policy, num_local_steps, horizon, obs_filter): +def _env_runner(env, policy, num_local_steps, horizon, obs_filter, pack): """This implements the logic of the thread runner. It continually runs the policy, and as long as the rollout exceeds a @@ -206,12 +143,16 @@ def _env_runner(env, policy, num_local_steps, horizon, obs_filter): Args: env: Environment generated by env_creator policy: Policy used to interact with environment. Also sets fields - to be included in `PartialRollout` - num_local_steps: Number of steps before `PartialRollout` is yielded. + to be included in `SampleBatch` + num_local_steps: Number of steps before `SampleBatch` is yielded. 
Set + to infinity to yield complete episodes. + horizon: Horizon of the episode. obs_filter: Filter used to process observations. + pack: Whether to pack multiple episodes into each batch. This + guarantees batches will be exactly `num_local_steps` in size. Yields: - rollout (PartialRollout): Object containing state, action, reward, + rollout (SampleBatch): Object containing state, action, reward, terminal condition, and other fields as dictated by `policy`. """ last_observation = obs_filter(env.reset()) @@ -221,24 +162,23 @@ def _env_runner(env, policy, num_local_steps, horizon, obs_filter): print("Warning, no horizon specified, assuming infinite") if not horizon: horizon = 999999 - if hasattr(policy, "get_initial_features"): - last_features = policy.get_initial_features() - else: - last_features = [] + last_features = policy.get_initial_state() features = last_features length = 0 rewards = 0 rollout_number = 0 while True: - terminal_end = False - rollout = PartialRollout(extra_fields=policy.other_output) + batch_builder = SampleBatchBuilder() for _ in range(num_local_steps): - action, pi_info = policy.compute(last_observation, *last_features) - if policy.is_recurrent: - features = pi_info["features"] - del pi_info["features"] + # Assume batch size one for now + action, features, pi_info = policy.compute_single_action( + last_observation, last_features, is_training=True) + for i, state_value in enumerate(last_features): + pi_info["state_in_{}".format(i)] = state_value + for i, state_value in enumerate(features): + pi_info["state_out_{}".format(i)] = state_value observation, reward, terminal, info = env.step(action) observation = obs_filter(observation) @@ -252,12 +192,11 @@ def _env_runner(env, policy, num_local_steps, horizon, obs_filter): action = np.concatenate(action, axis=0).flatten() # Collect the experience. - rollout.add( + batch_builder.add_values( obs=last_observation, actions=action, rewards=reward, dones=terminal, - features=last_features, new_obs=observation, **pi_info) @@ -265,24 +204,18 @@ def _env_runner(env, policy, num_local_steps, horizon, obs_filter): last_features = features if terminal: - terminal_end = True yield CompletedRollout(length, rewards) - if (length >= horizon - or not env.metadata.get("semantics.autoreset")): + if (length >= horizon or + not env.metadata.get("semantics.autoreset")): last_observation = obs_filter(env.reset()) - if hasattr(policy, "get_initial_features"): - last_features = policy.get_initial_features() - else: - last_features = [] + last_features = policy.get_initial_state() rollout_number += 1 length = 0 rewards = 0 - break - - if not terminal_end: - rollout.last_r = policy.value(last_observation, *last_features) + if not pack: + break # Once we have enough experience, yield it, and have the ThreadRunner # place it on a queue. - yield rollout + yield batch_builder.build() diff --git a/python/ray/rllib/utils/tf_policy_graph.py b/python/ray/rllib/utils/tf_policy_graph.py new file mode 100644 index 000000000000..6588060bfe29 --- /dev/null +++ b/python/ray/rllib/utils/tf_policy_graph.py @@ -0,0 +1,152 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +import ray +from ray.rllib.utils.policy_graph import PolicyGraph + + +class TFPolicyGraph(PolicyGraph): + """An agent policy and loss implemented in TensorFlow. 
+ + Extending this class enables RLlib to perform TensorFlow specific + optimizations on the policy graph, e.g., parallelization across gpus or + fusing multiple graphs together in the multi-agent setting. + + All input and output tensors are of shape [BATCH_DIM, ...]. + + Examples: + >>> policy = TFPolicyGraphSubclass( + sess, obs_input, action_sampler, loss, loss_inputs, is_training) + + >>> print(policy.compute_actions([1, 0, 2])) + (array([0, 1, 1]), [], {}) + + >>> print(policy.postprocess_trajectory(SampleBatch({...}))) + SampleBatch({"action": ..., "advantages": ..., ...}) + """ + + def __init__( + self, sess, obs_input, action_sampler, loss, loss_inputs, + is_training, state_inputs=None, state_outputs=None): + """Initialize the policy. + + Arguments: + obs_input (Tensor): input placeholder for observations. + action_sampler (Tensor): Tensor for sampling an action. + loss (Tensor): scalar policy loss output tensor. + loss_inputs (list): a (name, placeholder) tuple for each loss + input argument. Each placeholder name must correspond to a + SampleBatch column key returned by postprocess_trajectory(). + is_training (Tensor): input placeholder for whether we are + currently training the policy. + state_inputs (list): list of RNN state output Tensors. + state_outputs (list): list of initial state values. + """ + + self._sess = sess + self._obs_input = obs_input + self._sampler = action_sampler + self._loss = loss + self._loss_inputs = loss_inputs + self._is_training = is_training + self._state_inputs = state_inputs or [] + self._state_outputs = state_outputs or [] + self._optimizer = self.optimizer() + self._grads_and_vars = self.gradients(self._optimizer) + self._grads = [g for (g, v) in self._grads_and_vars] + self._apply_op = self._optimizer.apply_gradients(self._grads_and_vars) + self._variables = ray.experimental.TensorFlowVariables( + self._loss, self._sess) + + assert len(self._state_inputs) == len(self._state_outputs) == \ + len(self.get_initial_state()) + + def compute_actions( + self, obs_batch, state_batches=None, is_training=False): + state_batches = state_batches or [] + assert len(self._state_inputs) == len(state_batches), \ + (self._state_inputs, state_batches) + feed_dict = self.extra_compute_action_feed_dict() + feed_dict[self._obs_input] = obs_batch + feed_dict[self._is_training] = is_training + for ph, value in zip(self._state_inputs, state_batches): + feed_dict[ph] = value + fetches = self._sess.run( + ([self._sampler] + self._state_outputs + + [self.extra_compute_action_fetches()]), feed_dict=feed_dict) + return fetches[0], fetches[1:-1], fetches[-1] + + def _get_loss_inputs_dict(self, postprocessed_batch): + feed_dict = {} + for key, ph in self._loss_inputs: + # TODO(ekl) fix up handling of RNN inputs so that we can batch + # across multiple rollouts + if key.startswith("state_in_"): + feed_dict[ph] = postprocessed_batch[key][:1] # in state only + else: + feed_dict[ph] = postprocessed_batch[key] + return feed_dict + + def compute_gradients(self, postprocessed_batch): + feed_dict = self.extra_compute_grad_feed_dict() + feed_dict[self._is_training] = True + feed_dict.update(self._get_loss_inputs_dict(postprocessed_batch)) + fetches = self._sess.run( + [self._grads, self.extra_compute_grad_fetches()], + feed_dict=feed_dict) + return fetches[0], fetches[1] + + def apply_gradients(self, gradients): + assert len(gradients) == len(self._grads), (gradients, self._grads) + feed_dict = self.extra_apply_grad_feed_dict() + feed_dict[self._is_training] = True + for ph, value in 
zip(self._grads, gradients): + feed_dict[ph] = value + fetches = self.sess.run( + [self._apply_op, self.extra_apply_grad_fetches()], + feed_dict=feed_dict) + return fetches[1] + + def compute_apply(self, postprocessed_batch): + feed_dict = self.extra_compute_grad_feed_dict() + feed_dict.update(self.extra_apply_grad_feed_dict()) + feed_dict.update(self._get_loss_inputs_dict(postprocessed_batch)) + feed_dict[self._is_training] = True + fetches = self._sess.run( + [self._apply_op, self.extra_compute_grad_fetches(), + self.extra_apply_grad_fetches()], + feed_dict=feed_dict) + return fetches[1], fetches[2] + + def get_weights(self): + return self._variables.get_flat() + + def set_weights(self, weights): + return self._variables.set_flat(weights) + + def extra_compute_action_feed_dict(self): + return {} + + def extra_compute_action_fetches(self): + return {} # e.g, value function + + def extra_compute_grad_feed_dict(self): + return {} # e.g, kl_coeff + + def extra_compute_grad_fetches(self): + return {} # e.g, td error + + def extra_apply_grad_feed_dict(self): + return {} + + def extra_apply_grad_fetches(self): + return {} # e.g., batch norm updates + + def optimizer(self): + return tf.train.AdamOptimizer() + + def gradients(self, optimizer): + return optimizer.compute_gradients(self._loss) diff --git a/python/ray/tune/result.py b/python/ray/tune/result.py index 74ea2bcb9838..261ca6e90ff9 100644 --- a/python/ray/tune/result.py +++ b/python/ray/tune/result.py @@ -31,6 +31,12 @@ # (Optional) The mean episode reward if applicable. "episode_reward_mean", + # (Optional) The min episode reward if applicable. + "episode_reward_min", + + # (Optional) The max episode reward if applicable. + "episode_reward_max", + # (Optional) The mean episode length if applicable. "episode_len_mean", diff --git a/test/jenkins_tests/run_multi_node_tests.sh b/test/jenkins_tests/run_multi_node_tests.sh index 8bd010c3c810..e18b5a6ef47c 100755 --- a/test/jenkins_tests/run_multi_node_tests.sh +++ b/test/jenkins_tests/run_multi_node_tests.sh @@ -208,6 +208,9 @@ docker run --rm --shm-size=10G --memory=10G $DOCKER_SHA \ docker run --rm --shm-size=10G --memory=10G $DOCKER_SHA \ python /ray/python/ray/rllib/test/test_checkpoint_restore.py +docker run --rm --shm-size=10G --memory=10G $DOCKER_SHA \ + python /ray/python/ray/rllib/test/test_common_policy_evaluator.py + docker run --rm --shm-size=10G --memory=10G $DOCKER_SHA \ python /ray/python/ray/rllib/test/test_supported_spaces.py
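
For reference, the docstrings in this patch (the Examples section of CommonPolicyEvaluator and the new dict-style evaluator_args path of PolicyOptimizer.make) suggest the following end-to-end usage of the new evaluator API. This is a minimal sketch, assuming the modules are importable at the paths introduced above; the CartPole environment, the worker count, and the mostly-empty config dicts are illustrative assumptions, not part of the change itself.

# Minimal end-to-end sketch of the evaluator API added in this patch.
# The CartPole env, worker count, and the near-empty config dicts are
# illustrative assumptions, not something this diff prescribes.
import gym
import ray

from ray.rllib.optimizers import LocalSyncOptimizer
from ray.rllib.pg.pg_policy_graph import PGPolicyGraph
from ray.rllib.utils.common_policy_evaluator import (
    CommonPolicyEvaluator, collect_metrics)

ray.init()

# A standalone evaluator: one env/policy pair that returns SampleBatches.
# PGPolicyGraph reads "model" and "gamma" from its policy_config.
ev = CommonPolicyEvaluator(
    env_creator=lambda _: gym.make("CartPole-v0"),
    policy_graph=PGPolicyGraph,
    batch_steps=100,
    batch_mode="truncate_episodes",
    policy_config={"model": {}, "gamma": 0.99})
print(ev.sample().count)

# The same evaluator class driven through an optimizer, mirroring pg.py:
# make() builds one local and `num_workers` remote evaluators from the
# keyword args, and collect_metrics() aggregates their episode stats.
optimizer = LocalSyncOptimizer.make(
    evaluator_cls=CommonPolicyEvaluator,
    evaluator_args={
        "env_creator": lambda _: gym.make("CartPole-v0"),
        "policy_graph": PGPolicyGraph,
        "batch_steps": 100,
        "batch_mode": "truncate_episodes",
        "policy_config": {"model": {}, "gamma": 0.99},
    },
    num_workers=2,
    optimizer_config={})

for _ in range(5):
    optimizer.step()

print(collect_metrics(optimizer.local_evaluator, optimizer.remote_evaluators))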