8 changes: 4 additions & 4 deletions doc/source/using-ray-with-tensorflow.rst
@@ -41,7 +41,7 @@ TensorFlow documentation).
b = tf.Variable(tf.zeros([1]))
y = w * x_data + b

loss = tf.reduce_mean(tf.square(y - y_data))
loss = tf.reduce_mean((y - y_data)**2)
optimizer = tf.train.GradientDescentOptimizer(0.5)
grads = optimizer.compute_gradients(loss)
train = optimizer.apply_gradients(grads)
@@ -115,7 +115,7 @@ complex Python objects.
b = tf.Variable(tf.zeros([1]))
y = w * self.x_data + b
# Define the loss.
self.loss = tf.reduce_mean(tf.square(y - self.y_data))
self.loss = tf.reduce_mean((y - self.y_data)**2)
optimizer = tf.train.GradientDescentOptimizer(0.5)
self.grads = optimizer.compute_gradients(self.loss)
self.train = optimizer.apply_gradients(self.grads)
@@ -246,7 +246,7 @@ For reference, the full code is below:
b = tf.Variable(tf.zeros([1]))
y = w * x_data + b
# Define the loss.
self.loss = tf.reduce_mean(tf.square(y - y_data))
self.loss = tf.reduce_mean((y - y_data)**2)
optimizer = tf.train.GradientDescentOptimizer(0.5)
self.grads = optimizer.compute_gradients(self.loss)
self.train = optimizer.apply_gradients(self.grads)
@@ -342,7 +342,7 @@ class definition ``Network`` with a ``TensorFlowVariables`` instance:
def __init__(self):
a = tf.Variable(1)
b = tf.Variable(1)
c = tf.add(a, b)
c = a + b
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
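The rewrites in this file lean on tf.Tensor operator overloading: `(y - y_data)**2` and `a + b` build the same graph ops as `tf.square(y - y_data)` and `tf.add(a, b)`. A minimal sketch (assuming the TF 1.x graph-mode API used throughout this doc) that checks the equivalence:

    import numpy as np
    import tensorflow as tf

    x = tf.constant([1.0, -2.0, 3.0])
    # Python operators on tf.Tensor dispatch to the corresponding TF ops.
    with tf.Session() as sess:
        squared_fn, squared_op = sess.run([tf.square(x), x**2])
        summed_fn, summed_op = sess.run([tf.add(x, x), x + x])
    assert np.allclose(squared_fn, squared_op)
    assert np.allclose(summed_fn, summed_op)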
2 changes: 1 addition & 1 deletion examples/resnet/resnet_model.py
@@ -263,7 +263,7 @@ def _decay(self):
if var.op.name.find(r'DW') > 0:
costs.append(tf.nn.l2_loss(var))

return tf.multiply(self.hps.weight_decay_rate, tf.add_n(costs))
return self.hps.weight_decay_rate * tf.add_n(costs)

def _conv(self, name, x, filter_size, in_filters, out_filters, strides):
"""Convolution."""
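For reference, `tf.nn.l2_loss(v)` computes `sum(v**2) / 2`, so the decay term above is the weight-decay rate times half the sum of squared 'DW' weights. A small sketch (TF 1.x assumed, values made up) checking that identity:

    import numpy as np
    import tensorflow as tf

    v = tf.constant([[1.0, 2.0], [3.0, 4.0]])
    with tf.Session() as sess:
        l2, manual = sess.run([tf.nn.l2_loss(v), tf.reduce_sum(v**2) / 2])
    assert np.isclose(l2, manual)  # both equal 15.0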
23 changes: 19 additions & 4 deletions python/ray/rllib/__init__.py
@@ -13,15 +13,30 @@


def _register_all():
for key in ["PPO", "ES", "DQN", "APEX", "A3C", "BC", "PG", "DDPG",
"DDPG2", "APEX_DDPG", "__fake", "__sigmoid_fake_data",
"__parameter_tuning"]:
for key in [
"PPO",
"ES",
"DQN",
"APEX",
"A3C",
"BC",
"PG",
"DDPG",
"DDPG2",
"APEX_DDPG",
"__fake",
"__sigmoid_fake_data",
"__parameter_tuning",
]:
from ray.rllib.agent import get_agent_class
register_trainable(key, get_agent_class(key))


_register_all()

__all__ = [
"PolicyGraph", "TFPolicyGraph", "CommonPolicyEvaluator", "SampleBatch"
"PolicyGraph",
"TFPolicyGraph",
"CommonPolicyEvaluator",
"SampleBatch",
]
8 changes: 4 additions & 4 deletions python/ray/rllib/a3c/a3c.py
@@ -147,10 +147,10 @@ def _restore(self, checkpoint_path):
])
self.local_evaluator.restore(extra_data["local_state"])

def compute_action(self, observation, state=None):
if state is None:
state = []
def compute_action(self, observation, hidden_state=None):
if hidden_state is None:
hidden_state = []
obs = self.local_evaluator.obs_filter(observation, update=False)
return self.local_evaluator.for_policy(
lambda p: p.compute_single_action(
obs, state, is_training=False)[0])
obs, hidden_state, is_training=False)[0])
2 changes: 1 addition & 1 deletion python/ray/rllib/a3c/a3c_tf_policy.py
@@ -69,7 +69,7 @@ def setup_loss(self, action_space):
self.pi_loss = - tf.reduce_sum(log_prob * self.adv)

delta = self.vf - self.r
self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta))
self.vf_loss = tf.reduce_sum(delta**2) / 2
self.entropy = tf.reduce_sum(self.action_dist.entropy())
self.loss = (self.pi_loss +
self.vf_loss * self.config["vf_loss_coeff"] +
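A NumPy restatement of the two terms visible above, with made-up numbers, just to pin down shapes and signs: the policy term is the negative sum of log-probabilities weighted by advantages, and the value term is half the summed squared error between value predictions and returns.

    import numpy as np

    log_prob = np.log(np.array([0.6, 0.1, 0.9]))   # log pi(a_t | s_t)
    adv = np.array([1.0, -0.5, 0.2])               # advantages
    vf = np.array([1.2, 0.3, -0.4])                # value predictions
    returns = np.array([1.0, 0.0, -0.5])           # discounted returns

    pi_loss = -np.sum(log_prob * adv)
    vf_loss = np.sum((vf - returns)**2) / 2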
4 changes: 2 additions & 2 deletions python/ray/rllib/a3c/a3c_torch_policy.py
@@ -33,8 +33,8 @@ def setup_graph(self, obs_space, action_space):
self.optimizer = torch.optim.Adam(
self._model.parameters(), lr=self.config["lr"])

def compute_single_action(self, obs, state, is_training=False):
assert not state, "RNN not supported"
def compute_single_action(self, obs, hidden_state, is_training=False):
assert not hidden_state, "RNN not supported"
with self.lock:
ob = torch.from_numpy(obs).float().unsqueeze(0)
logits, values = self._model(ob)
10 changes: 4 additions & 6 deletions python/ray/rllib/ddpg/ddpg_policy_graph.py
@@ -14,7 +14,6 @@
from ray.rllib.utils.error import UnsupportedSpaceException
from ray.rllib.utils.tf_policy_graph import TFPolicyGraph


A_SCOPE = "a_func"
P_SCOPE = "p_func"
P_TARGET_SCOPE = "target_p_func"
@@ -189,22 +188,21 @@ def __init__(self, observation_space, action_space, registry, config):
if config.get("use_huber"):
errors = _huber_loss(self.td_error, config.get("huber_threshold"))
else:
errors = 0.5 * tf.square(self.td_error)
errors = self.td_error**2 / 2

self.loss = tf.reduce_mean(self.importance_weights * errors)

# for policy gradient
self.actor_loss = -1.0 * tf.reduce_mean(q_tp0)
self.actor_loss = -tf.reduce_mean(q_tp0)

if config["l2_reg"] is not None:
for var in self.p_func_vars:
if "bias" not in var.name:
self.actor_loss += (
config["l2_reg"] * 0.5 * tf.nn.l2_loss(var))
config["l2_reg"] * tf.nn.l2_loss(var)) / 2
for var in self.q_func_vars:
if "bias" not in var.name:
self.loss += config["l2_reg"] * 0.5 * tf.nn.l2_loss(
var)
self.loss += config["l2_reg"] * tf.nn.l2_loss(var) / 2

# update_target_fn will be called periodically to copy Q network to
# target Q network
4 changes: 2 additions & 2 deletions python/ray/rllib/ddpg2/models.py
@@ -196,8 +196,8 @@ def _setup_critic_loss(self, action_space):
self.reward = tf.placeholder(tf.float32, [None], name="reward")
self.critic_target = tf.expand_dims(self.reward, 1) + \
self.config['gamma'] * self.target_Q
self.critic_loss = tf.reduce_mean(tf.square(
self.critic_target - self.critic_eval))
self.critic_loss = tf.reduce_mean(
(self.critic_target - self.critic_eval)**2)

def _setup_critic_network(self, obs_space, ac_space):
"""Sets up Q network."""
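The critic objective above in NumPy, with made-up numbers: the target is the reward plus the discounted target-network Q-value, and the loss is the mean squared error against the current critic estimate.

    import numpy as np

    gamma = 0.99
    reward = np.array([1.0, 0.0])
    target_q = np.array([[2.0], [1.5]])      # target network Q(s', a')
    critic_eval = np.array([[2.5], [1.0]])   # current critic Q(s, a)

    critic_target = reward[:, None] + gamma * target_q
    critic_loss = np.mean((critic_target - critic_eval)**2)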
22 changes: 11 additions & 11 deletions python/ray/rllib/dqn/dqn.py
@@ -193,20 +193,20 @@ def _train(self):
self.optimizer.step()
self.update_target_if_needed()

exp_vals = [self.exploration0.value(self.global_timestep)]
exploration_vals = [self.exploration0.value(self.global_timestep)]
self.local_evaluator.for_policy(
lambda p: p.set_epsilon(exp_vals[0]))
lambda p: p.set_epsilon(exploration_vals[0]))
for i, e in enumerate(self.remote_evaluators):
exp_val = self.explorations[i].value(self.global_timestep)
e.for_policy.remote(lambda p: p.set_epsilon(exp_val))
exp_vals.append(exp_val)
exploration_val = self.explorations[i].value(self.global_timestep)
e.for_policy.remote(lambda p: p.set_epsilon(exploration_val))
exploration_vals.append(exploration_val)

result = collect_metrics(
self.local_evaluator, self.remote_evaluators)
return result._replace(
info=dict({
"min_exploration": min(exp_vals),
"max_exploration": max(exp_vals),
"min_exploration": min(exploration_vals),
"max_exploration": max(exploration_vals),
"num_target_updates": self.num_target_updates,
}, **self.optimizer.stats()))

@@ -237,9 +237,9 @@ def _restore(self, checkpoint_path):
self.num_target_updates = extra_data[3]
self.last_target_update_ts = extra_data[4]

def compute_action(self, observation, state=None):
if state is None:
state = []
def compute_action(self, observation, hidden_state=None):
if hidden_state is None:
hidden_state = []
return self.local_evaluator.for_policy(
lambda p: p.compute_single_action(
observation, state, is_training=False)[0])
observation, hidden_state, is_training=False)[0])
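The `exploration0.value(timestep)` calls above evaluate an annealed exploration schedule. A hypothetical helper, not the actual rllib schedule class, sketching the usual linear decay:

    def linear_epsilon(t, schedule_timesteps=10000, initial_eps=1.0,
                       final_eps=0.02):
        # Fraction of the schedule completed, clamped to [0, 1].
        frac = min(float(t) / schedule_timesteps, 1.0)
        return initial_eps + frac * (final_eps - initial_eps)

    print(linear_epsilon(0), linear_epsilon(5000), linear_epsilon(20000))
    # roughly: 1.0  0.51  0.02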
7 changes: 3 additions & 4 deletions python/ray/rllib/dqn/dqn_policy_graph.py
@@ -12,7 +12,6 @@
from ray.rllib.utils.error import UnsupportedSpaceException
from ray.rllib.utils.tf_policy_graph import TFPolicyGraph


Q_SCOPE = "q_func"
Q_TARGET_SCOPE = "target_q_func"

@@ -286,9 +285,9 @@ def _build_action_network(
def _huber_loss(x, delta=1.0):
"""Reference: https://en.wikipedia.org/wiki/Huber_loss"""
return tf.where(
tf.abs(x) < delta,
tf.square(x) * 0.5,
delta * (tf.abs(x) - 0.5 * delta))
abs(x) < delta,
x**2 / 2,
delta * (abs(x) - delta / 2))


def _minimize_and_clip(optimizer, objective, var_list, clip_val=10):
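Built-in `abs()` on a tf.Tensor dispatches to `tf.abs` via `Tensor.__abs__`, so the rewritten `_huber_loss` still matches the piecewise definition: `x**2 / 2` when `|x| < delta`, otherwise `delta * (|x| - delta / 2)`. A sketch (TF 1.x assumed) checking it against a NumPy reference:

    import numpy as np
    import tensorflow as tf

    def huber_np(x, delta=1.0):
        return np.where(np.abs(x) < delta,
                        x**2 / 2,
                        delta * (np.abs(x) - delta / 2))

    vals = np.array([-2.0, -0.5, 0.0, 0.3, 3.0], dtype=np.float32)
    x = tf.constant(vals)
    loss = tf.where(abs(x) < 1.0, x**2 / 2, 1.0 * (abs(x) - 0.5))
    with tf.Session() as sess:
        assert np.allclose(sess.run(loss), huber_np(vals))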
2 changes: 1 addition & 1 deletion python/ray/rllib/examples/multiagent_mountaincar_env.py
@@ -29,7 +29,7 @@ def __init__(self):
self.reset()

def step(self, action):
summed_act = 0.5 * np.sum(action)
summed_act = np.sum(action) / 2

position, velocity = self.state
velocity += (summed_act - 1) * 0.001
22 changes: 12 additions & 10 deletions python/ray/rllib/models/action_dist.py
@@ -6,6 +6,7 @@
import numpy as np
from ray.rllib.utils.reshaper import Reshaper

# TODO(alok): Use tf/torch Distributions to clean this up.

class ActionDistribution(object):
"""The policy action distribution of an agent.
@@ -81,22 +82,23 @@ def __init__(self, inputs):
self.std = tf.exp(log_std)

def logp(self, x):
return (-0.5 * tf.reduce_sum(tf.square((x - self.mean) / self.std),
reduction_indices=[1]) -
0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[1]) -
return (-tf.reduce_sum(((x - self.mean) / self.std)**2,
reduction_indices=[1]) / 2 -
np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[1]) / 2 -
tf.reduce_sum(self.log_std, reduction_indices=[1]))

def kl(self, other):
assert isinstance(other, DiagGaussian)
return tf.reduce_sum(other.log_std - self.log_std +
(tf.square(self.std) +
tf.square(self.mean - other.mean)) /
(2.0 * tf.square(other.std)) - 0.5,
reduction_indices=[1])
return tf.reduce_sum(
other.log_std - self.log_std +
(self.std**2 + (self.mean - other.mean)**2) /
(2.0 * (other.std)**2) - 0.5,
reduction_indices=[1])

def entropy(self):
return tf.reduce_sum(self.log_std + .5 * np.log(2.0 * np.pi * np.e),
reduction_indices=[1])
return tf.reduce_sum(
self.log_std + np.log(2.0 * np.pi * np.e) / 2,
reduction_indices=[1])

def sample(self):
return self.mean + self.std * tf.random_normal(tf.shape(self.mean))
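The `logp` above implements the closed-form diagonal-Gaussian log-density, log p(x) = -0.5 * sum(((x - mu) / sigma)**2) - (d / 2) * log(2 * pi) - sum(log sigma). A NumPy check against SciPy (assuming SciPy is available; values made up):

    import numpy as np
    from scipy.stats import multivariate_normal

    mu = np.array([0.5, -1.0])
    sigma = np.array([1.5, 0.3])
    x = np.array([0.0, 0.2])

    logp = (-0.5 * np.sum(((x - mu) / sigma)**2)
            - 0.5 * len(x) * np.log(2 * np.pi)
            - np.sum(np.log(sigma)))
    assert np.isclose(
        logp, multivariate_normal(mu, np.diag(sigma**2)).logpdf(x))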
4 changes: 2 additions & 2 deletions python/ray/rllib/models/ddpgnet.py
@@ -24,7 +24,7 @@ def _init(self, inputs, num_outputs, options):
out = slim.fully_connected(
net, num_outputs, activation_fn=tf.nn.tanh,
weights_initializer=w_init)
scaled_out = tf.multiply(out, ac_bound)
scaled_out = out * ac_bound
return scaled_out, net


@@ -42,7 +42,7 @@ def _init(self, inputs, num_outputs, options):
weights_initializer=w_normal)
t2 = slim.fully_connected(
action, 300, activation_fn=None, weights_initializer=w_normal)
net = tf.nn.relu(tf.add(t1, t2))
net = tf.nn.relu(t1 + t2)

out = slim.fully_connected(
net, 1, activation_fn=None, weights_initializer=w_init)
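The actor head above uses a tanh activation and then multiplies by the action bound, which rescales the squashed output from [-1, 1] into the environment's action range. A tiny NumPy illustration (bound value made up):

    import numpy as np

    ac_bound = 2.0
    raw = np.array([-3.0, 0.0, 1.2])
    scaled = np.tanh(raw) * ac_bound   # stays within [-ac_bound, ac_bound]
    assert np.all(np.abs(scaled) <= ac_bound)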
15 changes: 7 additions & 8 deletions python/ray/rllib/models/pytorch/misc.py
@@ -8,13 +8,13 @@


def convert_batch(trajectory):
"""Convert trajectory from numpy to PT variable"""
"""Convert trajectory from NumPy to PyTorch tensor"""
states = torch.from_numpy(trajectory["obs"]).float()
acs = torch.from_numpy(trajectory["actions"])
advs = torch.from_numpy(
trajectory["advantages"].copy()).float().reshape(-1)
rs = torch.from_numpy(trajectory["rewards"]).float().reshape(-1)
return states, acs, advs, rs
actions = torch.from_numpy(trajectory["actions"])
# `torch.tensor()` implicitly copies, unlike `torch.from_numpy`
advantages = torch.tensor(trajectory["advantages"]).float().reshape(-1)
rewards = torch.from_numpy(trajectory["rewards"]).float().reshape(-1)
return states, actions, advantages, rewards


def var_to_np(var):
@@ -24,8 +24,7 @@ def var_to_np(var):
def normc_initializer(std=1.0):
def initializer(tensor):
tensor.data.normal_(0, 1)
tensor.data *= std / torch.sqrt(
tensor.data.pow(2).sum(1, keepdim=True))
tensor.data *= std / (tensor.data**2).sum(1, keepdim=True).sqrt()

return initializer

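A minimal sketch of the copy-versus-share distinction noted in `convert_batch`: `torch.from_numpy` shares memory with the NumPy array, while `torch.tensor` copies, which is why the explicit `.copy()` on the advantages is no longer needed.

    import numpy as np
    import torch

    arr = np.zeros(3, dtype=np.float32)
    shared = torch.from_numpy(arr)   # view onto arr's buffer
    copied = torch.tensor(arr)       # independent copy

    arr[0] = 1.0
    assert shared[0].item() == 1.0   # sees the in-place change
    assert copied[0].item() == 0.0   # unaffected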
8 changes: 4 additions & 4 deletions python/ray/rllib/pg/pg.py
@@ -66,9 +66,9 @@ def _train(self):
return collect_metrics(
self.optimizer.local_evaluator, self.optimizer.remote_evaluators)

def compute_action(self, observation, state=None):
if state is None:
state = []
def compute_action(self, observation, hidden_state=None):
if hidden_state is None:
hidden_state = []
return self.local_evaluator.for_policy(
lambda p: p.compute_single_action(
observation, state, is_training=False)[0])
observation, hidden_state, is_training=False)[0])
6 changes: 4 additions & 2 deletions python/ray/rllib/ppo/loss.py
@@ -55,11 +55,11 @@ def __init__(
# We use a huber loss here to be more robust against outliers,
# which seem to occur when the rollouts get longer (the variance
# scales superlinearly with the length of the rollout)
self.vf_loss1 = tf.square(self.value_function - value_targets)
self.vf_loss1 = (self.value_function - value_targets)**2
vf_clipped = prev_vf_preds + tf.clip_by_value(
self.value_function - prev_vf_preds,
-config["clip_param"], config["clip_param"])
self.vf_loss2 = tf.square(vf_clipped - value_targets)
self.vf_loss2 = (vf_clipped - value_targets)**2
self.vf_loss = tf.minimum(self.vf_loss1, self.vf_loss2)
self.mean_vf_loss = tf.reduce_mean(self.vf_loss)
self.loss = tf.reduce_mean(
@@ -92,4 +92,6 @@ def get_initial_state(self):
return []

def loss(self):
# TODO(ericl): this method name (`loss`) conflicts with the `self.loss`
# attribute defined earlier in this file.
return self.loss
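The value-function objective above is a clipped squared error: the unclipped and clipped squared errors are both computed and the element-wise minimum is taken. A NumPy restatement with made-up numbers:

    import numpy as np

    clip_param = 0.3
    value_targets = np.array([1.0, 2.0])
    prev_vf_preds = np.array([0.8, 2.5])
    value_function = np.array([1.6, 1.0])

    vf_loss1 = (value_function - value_targets)**2
    vf_clipped = prev_vf_preds + np.clip(
        value_function - prev_vf_preds, -clip_param, clip_param)
    vf_loss2 = (vf_clipped - value_targets)**2
    vf_loss = np.minimum(vf_loss1, vf_loss2)   # element-wise minimum
    mean_vf_loss = np.mean(vf_loss)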
2 changes: 1 addition & 1 deletion python/ray/rllib/ppo/ppo.py
@@ -226,7 +226,7 @@ def standardized(value):
if kl > 2.0 * config["kl_target"]:
self.kl_coeff *= 1.5
elif kl < 0.5 * config["kl_target"]:
self.kl_coeff *= 0.5
self.kl_coeff /= 2

info = {
"kl_divergence": kl,
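The update above is PPO's adaptive KL-penalty rule: grow the coefficient when the measured KL overshoots the target, shrink it when it undershoots. A small self-contained sketch:

    def update_kl_coeff(kl_coeff, kl, kl_target):
        if kl > 2.0 * kl_target:
            return kl_coeff * 1.5   # penalize more
        elif kl < 0.5 * kl_target:
            return kl_coeff / 2     # penalize less
        return kl_coeff             # leave unchanged

    assert update_kl_coeff(1.0, kl=0.5, kl_target=0.1) == 1.5
    assert update_kl_coeff(1.0, kl=0.01, kl_target=0.1) == 0.5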
2 changes: 1 addition & 1 deletion python/ray/rllib/test/test_common_policy_evaluator.py
@@ -13,7 +13,7 @@


class MockPolicyGraph(PolicyGraph):
def compute_actions(self, obs_batch, state_batches, is_training=False):
def compute_actions(self, obs_batch, hidden_state_batches, is_training=False):
return [0] * len(obs_batch), [], {}

def postprocess_trajectory(self, batch):