Merged

Commits (43):
d0fec3e  dynamic graph (ericl, May 15, 2019)
b742efe  wip (ericl, May 15, 2019)
d8a722a  clean up (ericl, May 16, 2019)
169493c  fix (ericl, May 16, 2019)
fee8ec5  document trainer (ericl, May 16, 2019)
03b6021  wip (ericl, May 16, 2019)
48bdcf4  initialize the graph using a fake batch (ericl, May 16, 2019)
18b290b  clean up dynamic init (ericl, May 16, 2019)
71c91cd  wip (ericl, May 16, 2019)
2636703  wip (ericl, May 16, 2019)
d239a79  spelling (ericl, May 16, 2019)
3ff0d08  use builder for ppo pol graph (ericl, May 16, 2019)
e218d2c  add ppo graph (ericl, May 16, 2019)
9d9fd97  fix naming (ericl, May 16, 2019)
3093391  order (ericl, May 16, 2019)
e670abd  docs (ericl, May 16, 2019)
0aba2f3  set class name correctly (ericl, May 16, 2019)
298fcd0  add torch builder (ericl, May 16, 2019)
5269fe0  add custom model support in builder (ericl, May 16, 2019)
ac108dd  cleanup (ericl, May 16, 2019)
707acf8  remove underscores (ericl, May 16, 2019)
a4a9260  fix py2 compat (ericl, May 17, 2019)
a2281da  Update dynamic_tf_policy_graph.py (ericl, May 17, 2019)
817a1f9  Update tracking_dict.py (ericl, May 17, 2019)
a7229ce  wip (ericl, May 17, 2019)
a4c7779  Merge branch 'dynamic-graphs' of github.com:ericl/ray into dynamic-gr… (ericl, May 17, 2019)
4b9eb6d  rename (ericl, May 17, 2019)
182a3a5  Merge remote-tracking branch 'upstream/master' into dynamic-graphs (ericl, May 17, 2019)
6a1011e  debug level (ericl, May 17, 2019)
b1cecbe  rename policy_graph -> policy in new classes (ericl, May 17, 2019)
c857285  fix test (ericl, May 17, 2019)
64b267e  rename ppo tf policy (ericl, May 17, 2019)
52f06e9  port appo too (ericl, May 17, 2019)
3f64d4f  forgot grads (ericl, May 17, 2019)
0c6a22c  default policy optimizer (ericl, May 18, 2019)
c7e0320  make default config optional (ericl, May 18, 2019)
46523a0  add config to optimizer (ericl, May 18, 2019)
8a48029  use lr by default in optimizer (ericl, May 18, 2019)
65db45e  update (ericl, May 18, 2019)
4830ab6  comments (ericl, May 18, 2019)
816d590  remove optimizer (ericl, May 18, 2019)
65173a5  fix tuple actions support in dynamic tf graph (ericl, May 18, 2019)
db2a859  Merge remote-tracking branch 'upstream/master' into dynamic-graphs (ericl, May 18, 2019)

4 changes: 2 additions & 2 deletions python/ray/rllib/agents/a3c/a3c.py
@@ -49,8 +49,8 @@ class A3CTrainer(Trainer):
     def _init(self, config, env_creator):
         if config["use_pytorch"]:
             from ray.rllib.agents.a3c.a3c_torch_policy_graph import \
-                A3CTorchPolicyGraph
-            policy_cls = A3CTorchPolicyGraph
+                A3CTorchPolicy
+            policy_cls = A3CTorchPolicy
         else:
             policy_cls = self._policy_graph

173 changes: 74 additions & 99 deletions python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py
@@ -7,109 +7,84 @@
from torch import nn

import ray
from ray.rllib.models.catalog import ModelCatalog
from ray.rllib.evaluation.postprocessing import compute_advantages, \
Postprocessing
from ray.rllib.evaluation.policy_graph import PolicyGraph
from ray.rllib.evaluation.sample_batch import SampleBatch
from ray.rllib.evaluation.torch_policy_graph import TorchPolicyGraph
from ray.rllib.utils.annotations import override


class A3CLoss(nn.Module):
def __init__(self, dist_class, vf_loss_coeff=0.5, entropy_coeff=0.01):
nn.Module.__init__(self)
self.dist_class = dist_class
self.vf_loss_coeff = vf_loss_coeff
self.entropy_coeff = entropy_coeff

def forward(self, policy_model, observations, actions, advantages,
value_targets):
logits, _, values, _ = policy_model({
SampleBatch.CUR_OBS: observations
}, [])
dist = self.dist_class(logits)
log_probs = dist.logp(actions)
self.entropy = dist.entropy().mean()
self.pi_err = -advantages.dot(log_probs.reshape(-1))
self.value_err = F.mse_loss(values.reshape(-1), value_targets)
overall_err = sum([
self.pi_err,
self.vf_loss_coeff * self.value_err,
-self.entropy_coeff * self.entropy,
])

return overall_err


class A3CPostprocessing(object):
"""Adds the VF preds and advantages fields to the trajectory."""

@override(TorchPolicyGraph)
def extra_action_out(self, model_out):
return {SampleBatch.VF_PREDS: model_out[2].cpu().numpy()}

@override(PolicyGraph)
def postprocess_trajectory(self,
sample_batch,
other_agent_batches=None,
episode=None):
completed = sample_batch[SampleBatch.DONES][-1]
if completed:
last_r = 0.0
else:
last_r = self._value(sample_batch[SampleBatch.NEXT_OBS][-1])
return compute_advantages(sample_batch, last_r, self.config["gamma"],
self.config["lambda"])


class A3CTorchPolicyGraph(A3CPostprocessing, TorchPolicyGraph):
"""A simple, non-recurrent PyTorch policy example."""

def __init__(self, obs_space, action_space, config):
config = dict(ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, **config)
self.config = config
dist_class, self.logit_dim = ModelCatalog.get_action_dist(
action_space, self.config["model"], torch=True)
model = ModelCatalog.get_torch_model(obs_space, self.logit_dim,
self.config["model"])
loss = A3CLoss(dist_class, self.config["vf_loss_coeff"],
self.config["entropy_coeff"])
TorchPolicyGraph.__init__(
self,
obs_space,
action_space,
model,
loss,
loss_inputs=[
SampleBatch.CUR_OBS, SampleBatch.ACTIONS,
Postprocessing.ADVANTAGES, Postprocessing.VALUE_TARGETS
],
action_distribution_cls=dist_class)

@override(TorchPolicyGraph)
def optimizer(self):
return torch.optim.Adam(self._model.parameters(), lr=self.config["lr"])

@override(TorchPolicyGraph)
def extra_grad_process(self):
info = {}
if self.config["grad_clip"]:
total_norm = nn.utils.clip_grad_norm_(self._model.parameters(),
self.config["grad_clip"])
info["grad_gnorm"] = total_norm
return info

@override(TorchPolicyGraph)
def extra_grad_info(self):
return {
"policy_entropy": self._loss.entropy.item(),
"policy_loss": self._loss.pi_err.item(),
"vf_loss": self._loss.value_err.item()
}

from ray.rllib.evaluation.torch_policy_template import build_torch_policy


def actor_critic_loss(policy, batch_tensors):
logits, _, values, _ = policy.model({
SampleBatch.CUR_OBS: batch_tensors[SampleBatch.CUR_OBS]
}, [])
dist = policy.dist_class(logits)
log_probs = dist.logp(batch_tensors[SampleBatch.ACTIONS])
policy.entropy = dist.entropy().mean()
policy.pi_err = -batch_tensors[Postprocessing.ADVANTAGES].dot(
log_probs.reshape(-1))
policy.value_err = F.mse_loss(
values.reshape(-1), batch_tensors[Postprocessing.VALUE_TARGETS])
overall_err = sum([
policy.pi_err,
policy.config["vf_loss_coeff"] * policy.value_err,
-policy.config["entropy_coeff"] * policy.entropy,
])
return overall_err


def loss_and_entropy_stats(policy, batch_tensors):
return {
"policy_entropy": policy.entropy.item(),
"policy_loss": policy.pi_err.item(),
"vf_loss": policy.value_err.item(),
}


def add_advantages(policy,
sample_batch,
other_agent_batches=None,
episode=None):
completed = sample_batch[SampleBatch.DONES][-1]
if completed:
last_r = 0.0
else:
last_r = policy._value(sample_batch[SampleBatch.NEXT_OBS][-1])
return compute_advantages(sample_batch, last_r, policy.config["gamma"],
policy.config["lambda"])


def model_value_predictions(policy, model_out):
return {SampleBatch.VF_PREDS: model_out[2].cpu().numpy()}


def apply_grad_clipping(policy):
info = {}
if policy.config["grad_clip"]:
total_norm = nn.utils.clip_grad_norm_(policy.model.parameters(),
policy.config["grad_clip"])
info["grad_gnorm"] = total_norm
return info


def torch_optimizer(policy, config):
return torch.optim.Adam(policy.model.parameters(), lr=config["lr"])


class ValueNetworkMixin(object):
def _value(self, obs):
with self.lock:
obs = torch.from_numpy(obs).float().unsqueeze(0).to(self.device)
_, _, vf, _ = self._model({"obs": obs}, [])
_, _, vf, _ = self.model({"obs": obs}, [])
return vf.detach().cpu().numpy().squeeze()


A3CTorchPolicy = build_torch_policy(
name="A3CTorchPolicy",
get_default_config=lambda: ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG,
loss_fn=actor_critic_loss,
stats_fn=loss_and_entropy_stats,
postprocess_fn=add_advantages,
extra_action_out_fn=model_value_predictions,
extra_grad_process_fn=apply_grad_clipping,
optimizer_fn=torch_optimizer,
mixins=[ValueNetworkMixin])
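
The build_torch_policy call above assembles the module-level functions into a policy class. A minimal sketch of the same pattern, assuming only hooks that appear in this diff (name, get_default_config, loss_fn) and assuming the rewards column is available to the loss as a tensor; MyTorchPolicy and reinforce_loss are illustrative names, not part of the PR:

# Minimal sketch of the build_torch_policy pattern shown above.
# MyTorchPolicy and reinforce_loss are illustrative, not part of this PR.
from ray.rllib.agents.a3c.a3c import DEFAULT_CONFIG
from ray.rllib.evaluation.sample_batch import SampleBatch
from ray.rllib.evaluation.torch_policy_template import build_torch_policy


def reinforce_loss(policy, batch_tensors):
    # Re-run the model on the batched observations, as actor_critic_loss does,
    # and apply a plain REINFORCE objective weighted by the raw rewards
    # (assumes SampleBatch.REWARDS is present in batch_tensors).
    logits, _, _, _ = policy.model({
        SampleBatch.CUR_OBS: batch_tensors[SampleBatch.CUR_OBS]
    }, [])
    dist = policy.dist_class(logits)
    log_probs = dist.logp(batch_tensors[SampleBatch.ACTIONS])
    return -(log_probs.reshape(-1) * batch_tensors[SampleBatch.REWARDS]).mean()


MyTorchPolicy = build_torch_policy(
    name="MyTorchPolicy",
    get_default_config=lambda: DEFAULT_CONFIG,
    loss_fn=reinforce_loss)
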
54 changes: 14 additions & 40 deletions python/ray/rllib/agents/pg/pg.py
@@ -2,11 +2,9 @@
from __future__ import division
from __future__ import print_function

from ray.rllib.agents.trainer import Trainer, with_common_config
from ray.rllib.agents.pg.pg_policy_graph import PGPolicyGraph

from ray.rllib.optimizers import SyncSamplesOptimizer
from ray.rllib.utils.annotations import override
from ray.rllib.agents.trainer import with_common_config
from ray.rllib.agents.trainer_template import build_trainer
from ray.rllib.agents.pg.pg_policy_graph import PGTFPolicy

# yapf: disable
# __sphinx_doc_begin__
@@ -22,40 +20,16 @@
# yapf: enable


class PGTrainer(Trainer):
"""Simple policy gradient agent.

This is an example agent to show how to implement algorithms in RLlib.
In most cases, you will probably want to use the PPO agent instead.
"""

_name = "PG"
_default_config = DEFAULT_CONFIG
_policy_graph = PGPolicyGraph
def get_policy_class(config):
if config["use_pytorch"]:
from ray.rllib.agents.pg.torch_pg_policy_graph import PGTorchPolicy
return PGTorchPolicy
else:
return PGTFPolicy

@override(Trainer)
def _init(self, config, env_creator):
if config["use_pytorch"]:
from ray.rllib.agents.pg.torch_pg_policy_graph import \
PGTorchPolicyGraph
policy_cls = PGTorchPolicyGraph
else:
policy_cls = self._policy_graph
self.local_evaluator = self.make_local_evaluator(
env_creator, policy_cls)
self.remote_evaluators = self.make_remote_evaluators(
env_creator, policy_cls, config["num_workers"])
optimizer_config = dict(
config["optimizer"],
**{"train_batch_size": config["train_batch_size"]})
self.optimizer = SyncSamplesOptimizer(
self.local_evaluator, self.remote_evaluators, **optimizer_config)

@override(Trainer)
def _train(self):
prev_steps = self.optimizer.num_steps_sampled
self.optimizer.step()
result = self.collect_metrics()
result.update(timesteps_this_iter=self.optimizer.num_steps_sampled -
prev_steps)
return result
PGTrainer = build_trainer(
name="PG",
default_config=DEFAULT_CONFIG,
default_policy=PGTFPolicy,
get_policy_class=get_policy_class)
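
The class returned by build_trainer is meant to be driven like any hand-written Trainer of this era. A hedged usage sketch, assuming the pg package continues to re-export PGTrainer; "CartPole-v0" and the config values are illustrative, not taken from this PR:

# Hedged usage sketch: run the generated PGTrainer like any other Trainer.
import ray
from ray.rllib.agents.pg import PGTrainer

ray.init()
trainer = PGTrainer(env="CartPole-v0", config={"num_workers": 1})
for i in range(3):
    result = trainer.train()
    print(i, result["episode_reward_mean"])
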
105 changes: 18 additions & 87 deletions python/ray/rllib/agents/pg/pg_policy_graph.py
@@ -3,102 +3,33 @@
from __future__ import print_function

import ray
from ray.rllib.models.catalog import ModelCatalog
from ray.rllib.evaluation.postprocessing import compute_advantages, \
Postprocessing
from ray.rllib.evaluation.policy_graph import PolicyGraph
from ray.rllib.evaluation.tf_policy_template import build_tf_policy
from ray.rllib.evaluation.sample_batch import SampleBatch
from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph
from ray.rllib.utils.annotations import override
from ray.rllib.utils import try_import_tf

tf = try_import_tf()


class PGLoss(object):
"""The basic policy gradient loss."""
# The basic policy gradients loss
def policy_gradient_loss(policy, batch_tensors):
actions = batch_tensors[SampleBatch.ACTIONS]
advantages = batch_tensors[Postprocessing.ADVANTAGES]
return -tf.reduce_mean(policy.action_dist.logp(actions) * advantages)

def __init__(self, action_dist, actions, advantages):
self.loss = -tf.reduce_mean(action_dist.logp(actions) * advantages)

# This adds the "advantages" column to the sample batch.
def postprocess_advantages(policy,
sample_batch,
other_agent_batches=None,
episode=None):
return compute_advantages(
sample_batch, 0.0, policy.config["gamma"], use_gae=False)

class PGPostprocessing(object):
"""Adds the advantages field to the trajectory."""

@override(PolicyGraph)
def postprocess_trajectory(self,
sample_batch,
other_agent_batches=None,
episode=None):
# This adds the "advantages" column to the sample batch
return compute_advantages(
sample_batch, 0.0, self.config["gamma"], use_gae=False)


class PGPolicyGraph(PGPostprocessing, TFPolicyGraph):
"""Simple policy gradient example of defining a policy graph."""

def __init__(self, obs_space, action_space, config):
config = dict(ray.rllib.agents.pg.pg.DEFAULT_CONFIG, **config)
self.config = config

# Setup placeholders
obs = tf.placeholder(tf.float32, shape=[None] + list(obs_space.shape))
dist_class, self.logit_dim = ModelCatalog.get_action_dist(
action_space, self.config["model"])
prev_actions = ModelCatalog.get_action_placeholder(action_space)
prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward")

# Create the model network and action outputs
self.model = ModelCatalog.get_model({
"obs": obs,
"prev_actions": prev_actions,
"prev_rewards": prev_rewards,
"is_training": self._get_is_training_placeholder(),
}, obs_space, action_space, self.logit_dim, self.config["model"])
action_dist = dist_class(self.model.outputs) # logit for each action

# Setup policy loss
actions = ModelCatalog.get_action_placeholder(action_space)
advantages = tf.placeholder(tf.float32, [None], name="adv")
loss = PGLoss(action_dist, actions, advantages).loss

# Mapping from sample batch keys to placeholders. These keys will be
# read from postprocessed sample batches and fed into the specified
# placeholders during loss computation.
loss_in = [
(SampleBatch.CUR_OBS, obs),
(SampleBatch.ACTIONS, actions),
(SampleBatch.PREV_ACTIONS, prev_actions),
(SampleBatch.PREV_REWARDS, prev_rewards),
(Postprocessing.ADVANTAGES, advantages),
]

# Initialize TFPolicyGraph
sess = tf.get_default_session()
TFPolicyGraph.__init__(
self,
obs_space,
action_space,
sess,
obs_input=obs,
action_sampler=action_dist.sample(),
action_prob=action_dist.sampled_action_prob(),
loss=loss,
loss_inputs=loss_in,
model=self.model,
state_inputs=self.model.state_in,
state_outputs=self.model.state_out,
prev_action_input=prev_actions,
prev_reward_input=prev_rewards,
seq_lens=self.model.seq_lens,
max_seq_len=config["model"]["max_seq_len"])
sess.run(tf.global_variables_initializer())

@override(PolicyGraph)
def get_initial_state(self):
return self.model.state_init

@override(TFPolicyGraph)
def optimizer(self):
return tf.train.AdamOptimizer(learning_rate=self.config["lr"])
PGTFPolicy = build_tf_policy(
name="PGTFPolicy",
get_default_config=lambda: ray.rllib.agents.pg.pg.DEFAULT_CONFIG,
postprocess_fn=postprocess_advantages,
loss_fn=policy_gradient_loss)
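
Because the loss and postprocessing hooks are now plain module-level functions, they can be reused to assemble policy variants. A sketch under that assumption, reusing postprocess_advantages from this file and assuming the action distribution exposes entropy() as RLlib's built-in distributions do; EntropyPGPolicy and the 0.01 coefficient are illustrative, not part of the PR:

# Sketch: a PG policy variant with an entropy bonus, reusing the hooks above.
from ray.rllib.agents.pg.pg import DEFAULT_CONFIG
from ray.rllib.agents.pg.pg_policy_graph import postprocess_advantages
from ray.rllib.evaluation.postprocessing import Postprocessing
from ray.rllib.evaluation.sample_batch import SampleBatch
from ray.rllib.evaluation.tf_policy_template import build_tf_policy
from ray.rllib.utils import try_import_tf

tf = try_import_tf()


def entropy_regularized_pg_loss(policy, batch_tensors):
    actions = batch_tensors[SampleBatch.ACTIONS]
    advantages = batch_tensors[Postprocessing.ADVANTAGES]
    pg_loss = -tf.reduce_mean(policy.action_dist.logp(actions) * advantages)
    # Encourage exploration with a small entropy bonus (illustrative value).
    return pg_loss - 0.01 * tf.reduce_mean(policy.action_dist.entropy())


EntropyPGPolicy = build_tf_policy(
    name="EntropyPGPolicy",
    get_default_config=lambda: DEFAULT_CONFIG,
    postprocess_fn=postprocess_advantages,
    loss_fn=entropy_regularized_pg_loss)
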