8 changes: 4 additions & 4 deletions doc/source/using-ray-with-tensorflow.rst
@@ -41,7 +41,7 @@ TensorFlow documentation).
b = tf.Variable(tf.zeros([1]))
y = w * x_data + b

loss = tf.reduce_mean(tf.square(y - y_data))
loss = tf.reduce_mean((y - y_data)**2)
optimizer = tf.train.GradientDescentOptimizer(0.5)
grads = optimizer.compute_gradients(loss)
train = optimizer.apply_gradients(grads)
@@ -115,7 +115,7 @@ complex Python objects.
b = tf.Variable(tf.zeros([1]))
y = w * self.x_data + b
# Define the loss.
self.loss = tf.reduce_mean(tf.square(y - self.y_data))
self.loss = tf.reduce_mean((y - self.y_data)**2)
optimizer = tf.train.GradientDescentOptimizer(0.5)
self.grads = optimizer.compute_gradients(self.loss)
self.train = optimizer.apply_gradients(self.grads)
@@ -246,7 +246,7 @@ For reference, the full code is below:
b = tf.Variable(tf.zeros([1]))
y = w * x_data + b
# Define the loss.
self.loss = tf.reduce_mean(tf.square(y - y_data))
self.loss = tf.reduce_mean((y - y_data)**2)
optimizer = tf.train.GradientDescentOptimizer(0.5)
self.grads = optimizer.compute_gradients(self.loss)
self.train = optimizer.apply_gradients(self.grads)
@@ -342,7 +342,7 @@ class definition ``Network`` with a ``TensorFlowVariables`` instance:
def __init__(self):
a = tf.Variable(1)
b = tf.Variable(1)
c = tf.add(a, b)
c = a + b
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
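The rewrites in this file lean on tf.Tensor operator overloading: `(y - y_data)**2` and `a + b` build the same graph ops as `tf.square(y - y_data)` and `tf.add(a, b)`. A minimal sketch (assuming the TF 1.x graph-mode API used throughout this doc) that checks the equivalence:

    import numpy as np
    import tensorflow as tf

    x = tf.constant([1.0, -2.0, 3.0])
    # Python operators on tf.Tensor dispatch to the corresponding TF ops.
    with tf.Session() as sess:
        squared_fn, squared_op = sess.run([tf.square(x), x**2])
        summed_fn, summed_op = sess.run([tf.add(x, x), x + x])
    assert np.allclose(squared_fn, squared_op)
    assert np.allclose(summed_fn, summed_op)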
2 changes: 1 addition & 1 deletion examples/resnet/resnet_model.py
@@ -263,7 +263,7 @@ def _decay(self):
if var.op.name.find(r'DW') > 0:
costs.append(tf.nn.l2_loss(var))

return tf.multiply(self.hps.weight_decay_rate, tf.add_n(costs))
return self.hps.weight_decay_rate * tf.add_n(costs)

def _conv(self, name, x, filter_size, in_filters, out_filters, strides):
"""Convolution."""
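For reference, `tf.nn.l2_loss(v)` computes `sum(v**2) / 2`, so the decay term above is the weight-decay rate times half the sum of squared 'DW' weights. A small sketch (TF 1.x assumed, values made up) checking that identity:

    import numpy as np
    import tensorflow as tf

    v = tf.constant([[1.0, 2.0], [3.0, 4.0]])
    with tf.Session() as sess:
        l2, manual = sess.run([tf.nn.l2_loss(v), tf.reduce_sum(v**2) / 2])
    assert np.isclose(l2, manual)  # both equal 15.0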
23 changes: 19 additions & 4 deletions python/ray/rllib/__init__.py
@@ -13,15 +13,30 @@


def _register_all():
for key in ["PPO", "ES", "DQN", "APEX", "A3C", "BC", "PG", "DDPG",
"DDPG2", "APEX_DDPG", "__fake", "__sigmoid_fake_data",
"__parameter_tuning"]:
for key in [
"PPO",
"ES",
"DQN",
"APEX",
"A3C",
"BC",
"PG",
"DDPG",
"DDPG2",
"APEX_DDPG",
"__fake",
"__sigmoid_fake_data",
"__parameter_tuning",
]:
from ray.rllib.agent import get_agent_class
register_trainable(key, get_agent_class(key))


_register_all()

__all__ = [
"PolicyGraph", "TFPolicyGraph", "CommonPolicyEvaluator", "SampleBatch"
"PolicyGraph",
"TFPolicyGraph",
"CommonPolicyEvaluator",
"SampleBatch",
]
8 changes: 4 additions & 4 deletions python/ray/rllib/a3c/a3c.py
@@ -147,10 +147,10 @@ def _restore(self, checkpoint_path):
])
self.local_evaluator.restore(extra_data["local_state"])

def compute_action(self, observation, state=None):
if state is None:
state = []
def compute_action(self, observation, hidden_state=None):
if hidden_state is None:
hidden_state = []
obs = self.local_evaluator.obs_filter(observation, update=False)
return self.local_evaluator.for_policy(
lambda p: p.compute_single_action(
obs, state, is_training=False)[0])
obs, hidden_state, is_training=False)[0])
2 changes: 1 addition & 1 deletion python/ray/rllib/a3c/a3c_tf_policy.py
@@ -69,7 +69,7 @@ def setup_loss(self, action_space):
self.pi_loss = - tf.reduce_sum(log_prob * self.adv)

delta = self.vf - self.r
self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta))
self.vf_loss = tf.reduce_sum(delta**2) / 2
self.entropy = tf.reduce_sum(self.action_dist.entropy())
self.loss = (self.pi_loss +
self.vf_loss * self.config["vf_loss_coeff"] +
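A NumPy restatement of the two terms visible above, with made-up numbers, just to pin down shapes and signs: the policy term is the negative sum of log-probabilities weighted by advantages, and the value term is half the summed squared error between value predictions and returns.

    import numpy as np

    log_prob = np.log(np.array([0.6, 0.1, 0.9]))   # log pi(a_t | s_t)
    adv = np.array([1.0, -0.5, 0.2])               # advantages
    vf = np.array([1.2, 0.3, -0.4])                # value predictions
    returns = np.array([1.0, 0.0, -0.5])           # discounted returns

    pi_loss = -np.sum(log_prob * adv)
    vf_loss = np.sum((vf - returns)**2) / 2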
4 changes: 2 additions & 2 deletions python/ray/rllib/a3c/a3c_torch_policy.py
@@ -33,8 +33,8 @@ def setup_graph(self, obs_space, action_space):
self.optimizer = torch.optim.Adam(
self._model.parameters(), lr=self.config["lr"])

def compute_single_action(self, obs, state, is_training=False):
assert not state, "RNN not supported"
def compute_single_action(self, obs, hidden_state, is_training=False):
assert not hidden_state, "RNN not supported"
with self.lock:
ob = torch.from_numpy(obs).float().unsqueeze(0)
logits, values = self._model(ob)
10 changes: 4 additions & 6 deletions python/ray/rllib/ddpg/ddpg_policy_graph.py
@@ -14,7 +14,6 @@
from ray.rllib.utils.error import UnsupportedSpaceException
from ray.rllib.utils.tf_policy_graph import TFPolicyGraph


A_SCOPE = "a_func"
P_SCOPE = "p_func"
P_TARGET_SCOPE = "target_p_func"
@@ -189,22 +188,21 @@ def __init__(self, observation_space, action_space, registry, config):
if config.get("use_huber"):
errors = _huber_loss(self.td_error, config.get("huber_threshold"))
else:
errors = 0.5 * tf.square(self.td_error)
errors = self.td_error**2 / 2

self.loss = tf.reduce_mean(self.importance_weights * errors)

# for policy gradient
self.actor_loss = -1.0 * tf.reduce_mean(q_tp0)
self.actor_loss = -tf.reduce_mean(q_tp0)

if config["l2_reg"] is not None:
for var in self.p_func_vars:
if "bias" not in var.name:
self.actor_loss += (
config["l2_reg"] * 0.5 * tf.nn.l2_loss(var))
config["l2_reg"] * tf.nn.l2_loss(var)) / 2
for var in self.q_func_vars:
if "bias" not in var.name:
self.loss += config["l2_reg"] * 0.5 * tf.nn.l2_loss(
var)
self.loss += config["l2_reg"] * tf.nn.l2_loss(var) / 2

# update_target_fn will be called periodically to copy Q network to
# target Q network
4 changes: 2 additions & 2 deletions python/ray/rllib/ddpg2/models.py
@@ -196,8 +196,8 @@ def _setup_critic_loss(self, action_space):
self.reward = tf.placeholder(tf.float32, [None], name="reward")
self.critic_target = tf.expand_dims(self.reward, 1) + \
self.config['gamma'] * self.target_Q
self.critic_loss = tf.reduce_mean(tf.square(
self.critic_target - self.critic_eval))
self.critic_loss = tf.reduce_mean(
(self.critic_target - self.critic_eval)**2)

def _setup_critic_network(self, obs_space, ac_space):
"""Sets up Q network."""
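The critic objective above in NumPy, with made-up numbers: the target is the reward plus the discounted target-network Q-value, and the loss is the mean squared error against the current critic estimate.

    import numpy as np

    gamma = 0.99
    reward = np.array([1.0, 0.0])
    target_q = np.array([[2.0], [1.5]])      # target network Q(s', a')
    critic_eval = np.array([[2.5], [1.0]])   # current critic Q(s, a)

    critic_target = reward[:, None] + gamma * target_q
    critic_loss = np.mean((critic_target - critic_eval)**2)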
22 changes: 11 additions & 11 deletions python/ray/rllib/dqn/dqn.py
@@ -193,20 +193,20 @@ def _train(self):
self.optimizer.step()
self.update_target_if_needed()

exp_vals = [self.exploration0.value(self.global_timestep)]
exploration_vals = [self.exploration0.value(self.global_timestep)]
self.local_evaluator.for_policy(
lambda p: p.set_epsilon(exp_vals[0]))
lambda p: p.set_epsilon(exploration_vals[0]))
for i, e in enumerate(self.remote_evaluators):
exp_val = self.explorations[i].value(self.global_timestep)
e.for_policy.remote(lambda p: p.set_epsilon(exp_val))
exp_vals.append(exp_val)
exploration_val = self.explorations[i].value(self.global_timestep)
e.for_policy.remote(lambda p: p.set_epsilon(exploration_val))
exploration_vals.append(exploration_val)

result = collect_metrics(
self.local_evaluator, self.remote_evaluators)
return result._replace(
info=dict({
"min_exploration": min(exp_vals),
"max_exploration": max(exp_vals),
"min_exploration": min(exploration_vals),
"max_exploration": max(exploration_vals),
"num_target_updates": self.num_target_updates,
}, **self.optimizer.stats()))

@@ -237,9 +237,9 @@ def _restore(self, checkpoint_path):
self.num_target_updates = extra_data[3]
self.last_target_update_ts = extra_data[4]

def compute_action(self, observation, state=None):
if state is None:
state = []
def compute_action(self, observation, hidden_state=None):
if hidden_state is None:
hidden_state = []
return self.local_evaluator.for_policy(
lambda p: p.compute_single_action(
observation, state, is_training=False)[0])
observation, hidden_state, is_training=False)[0])
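The `exploration0.value(timestep)` calls above evaluate an annealed exploration schedule. A hypothetical helper, not the actual rllib schedule class, sketching the usual linear decay:

    def linear_epsilon(t, schedule_timesteps=10000, initial_eps=1.0,
                       final_eps=0.02):
        # Fraction of the schedule completed, clamped to [0, 1].
        frac = min(float(t) / schedule_timesteps, 1.0)
        return initial_eps + frac * (final_eps - initial_eps)

    print(linear_epsilon(0), linear_epsilon(5000), linear_epsilon(20000))
    # roughly: 1.0  0.51  0.02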
7 changes: 3 additions & 4 deletions python/ray/rllib/dqn/dqn_policy_graph.py
@@ -12,7 +12,6 @@
from ray.rllib.utils.error import UnsupportedSpaceException
from ray.rllib.utils.tf_policy_graph import TFPolicyGraph


Q_SCOPE = "q_func"
Q_TARGET_SCOPE = "target_q_func"

@@ -286,9 +285,9 @@ def _build_action_network(
def _huber_loss(x, delta=1.0):
"""Reference: https://en.wikipedia.org/wiki/Huber_loss"""
return tf.where(
tf.abs(x) < delta,
tf.square(x) * 0.5,
delta * (tf.abs(x) - 0.5 * delta))
abs(x) < delta,
x**2 / 2,
delta * (abs(x) - delta / 2))


def _minimize_and_clip(optimizer, objective, var_list, clip_val=10):
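Built-in `abs()` on a tf.Tensor dispatches to `tf.abs` via `Tensor.__abs__`, so the rewritten `_huber_loss` still matches the piecewise definition: `x**2 / 2` when `|x| < delta`, otherwise `delta * (|x| - delta / 2)`. A sketch (TF 1.x assumed) checking it against a NumPy reference:

    import numpy as np
    import tensorflow as tf

    def huber_np(x, delta=1.0):
        return np.where(np.abs(x) < delta,
                        x**2 / 2,
                        delta * (np.abs(x) - delta / 2))

    vals = np.array([-2.0, -0.5, 0.0, 0.3, 3.0], dtype=np.float32)
    x = tf.constant(vals)
    loss = tf.where(abs(x) < 1.0, x**2 / 2, 1.0 * (abs(x) - 0.5))
    with tf.Session() as sess:
        assert np.allclose(sess.run(loss), huber_np(vals))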
2 changes: 1 addition & 1 deletion python/ray/rllib/examples/multiagent_mountaincar_env.py
@@ -29,7 +29,7 @@ def __init__(self):
self.reset()

def step(self, action):
summed_act = 0.5 * np.sum(action)
summed_act = np.sum(action) / 2

position, velocity = self.state
velocity += (summed_act - 1) * 0.001
22 changes: 12 additions & 10 deletions python/ray/rllib/models/action_dist.py
@@ -6,6 +6,7 @@
import numpy as np
from ray.rllib.utils.reshaper import Reshaper

# TODO(alok): Use tf/torch Distributions to clean this up.

class ActionDistribution(object):
"""The policy action distribution of an agent.
@@ -81,22 +82,23 @@ def __init__(self, inputs):
self.std = tf.exp(log_std)

def logp(self, x):
return (-0.5 * tf.reduce_sum(tf.square((x - self.mean) / self.std),
reduction_indices=[1]) -
0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[1]) -
return (-tf.reduce_sum(((x - self.mean) / self.std)**2,
reduction_indices=[1]) / 2 -
np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[1]) / 2 -
tf.reduce_sum(self.log_std, reduction_indices=[1]))

def kl(self, other):
assert isinstance(other, DiagGaussian)
return tf.reduce_sum(other.log_std - self.log_std +
(tf.square(self.std) +
tf.square(self.mean - other.mean)) /
(2.0 * tf.square(other.std)) - 0.5,
reduction_indices=[1])
return tf.reduce_sum(
other.log_std - self.log_std +
(self.std**2 + (self.mean - other.mean)**2) /
(2.0 * (other.std)**2) - 0.5,
reduction_indices=[1])

def entropy(self):
return tf.reduce_sum(self.log_std + .5 * np.log(2.0 * np.pi * np.e),
reduction_indices=[1])
return tf.reduce_sum(
self.log_std + np.log(2.0 * np.pi * np.e) / 2,
reduction_indices=[1])

def sample(self):
return self.mean + self.std * tf.random_normal(tf.shape(self.mean))
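The `logp` above implements the closed-form diagonal-Gaussian log-density, log p(x) = -0.5 * sum(((x - mu) / sigma)**2) - (d / 2) * log(2 * pi) - sum(log sigma). A NumPy check against SciPy (assuming SciPy is available; values made up):

    import numpy as np
    from scipy.stats import multivariate_normal

    mu = np.array([0.5, -1.0])
    sigma = np.array([1.5, 0.3])
    x = np.array([0.0, 0.2])

    logp = (-0.5 * np.sum(((x - mu) / sigma)**2)
            - 0.5 * len(x) * np.log(2 * np.pi)
            - np.sum(np.log(sigma)))
    assert np.isclose(
        logp, multivariate_normal(mu, np.diag(sigma**2)).logpdf(x))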
4 changes: 2 additions & 2 deletions python/ray/rllib/models/ddpgnet.py
@@ -24,7 +24,7 @@ def _init(self, inputs, num_outputs, options):
out = slim.fully_connected(
net, num_outputs, activation_fn=tf.nn.tanh,
weights_initializer=w_init)
scaled_out = tf.multiply(out, ac_bound)
scaled_out = out * ac_bound
return scaled_out, net


@@ -42,7 +42,7 @@ def _init(self, inputs, num_outputs, options):
weights_initializer=w_normal)
t2 = slim.fully_connected(
action, 300, activation_fn=None, weights_initializer=w_normal)
net = tf.nn.relu(tf.add(t1, t2))
net = tf.nn.relu(t1 + t2)

out = slim.fully_connected(
net, 1, activation_fn=None, weights_initializer=w_init)
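The actor head above uses a tanh activation and then multiplies by the action bound, which rescales the squashed output from [-1, 1] into the environment's action range. A tiny NumPy illustration (bound value made up):

    import numpy as np

    ac_bound = 2.0
    raw = np.array([-3.0, 0.0, 1.2])
    scaled = np.tanh(raw) * ac_bound   # stays within [-ac_bound, ac_bound]
    assert np.all(np.abs(scaled) <= ac_bound)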
15 changes: 7 additions & 8 deletions python/ray/rllib/models/pytorch/misc.py
@@ -8,13 +8,13 @@


def convert_batch(trajectory):
"""Convert trajectory from numpy to PT variable"""
"""Convert trajectory from NumPy to PyTorch tensor"""
states = torch.from_numpy(trajectory["obs"]).float()
acs = torch.from_numpy(trajectory["actions"])
advs = torch.from_numpy(
trajectory["advantages"].copy()).float().reshape(-1)
rs = torch.from_numpy(trajectory["rewards"]).float().reshape(-1)
return states, acs, advs, rs
actions = torch.from_numpy(trajectory["actions"])
# `torch.tensor()` implicitly copies, unlike `torch.from_numpy`
advantages = torch.tensor(trajectory["advantages"]).float().reshape(-1)
rewards = torch.from_numpy(trajectory["rewards"]).float().reshape(-1)
return states, actions, advantages, rewards


def var_to_np(var):
@@ -24,8 +24,7 @@ def var_to_np(var):
def normc_initializer(std=1.0):
def initializer(tensor):
tensor.data.normal_(0, 1)
tensor.data *= std / torch.sqrt(
tensor.data.pow(2).sum(1, keepdim=True))
tensor.data *= std / (tensor.data**2).sum(1, keepdim=True).sqrt()

return initializer

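A minimal sketch of the copy-versus-share distinction noted in `convert_batch`: `torch.from_numpy` shares memory with the NumPy array, while `torch.tensor` copies, which is why the explicit `.copy()` on the advantages is no longer needed.

    import numpy as np
    import torch

    arr = np.zeros(3, dtype=np.float32)
    shared = torch.from_numpy(arr)   # view onto arr's buffer
    copied = torch.tensor(arr)       # independent copy

    arr[0] = 1.0
    assert shared[0].item() == 1.0   # sees the in-place change
    assert copied[0].item() == 0.0   # unaffected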
8 changes: 4 additions & 4 deletions python/ray/rllib/pg/pg.py
@@ -66,9 +66,9 @@ def _train(self):
return collect_metrics(
self.optimizer.local_evaluator, self.optimizer.remote_evaluators)

def compute_action(self, observation, state=None):
if state is None:
state = []
def compute_action(self, observation, hidden_state=None):
if hidden_state is None:
hidden_state = []
return self.local_evaluator.for_policy(
lambda p: p.compute_single_action(
observation, state, is_training=False)[0])
observation, hidden_state, is_training=False)[0])
6 changes: 4 additions & 2 deletions python/ray/rllib/ppo/loss.py
@@ -55,11 +55,11 @@ def __init__(
# We use a huber loss here to be more robust against outliers,
# which seem to occur when the rollouts get longer (the variance
# scales superlinearly with the length of the rollout)
self.vf_loss1 = tf.square(self.value_function - value_targets)
self.vf_loss1 = (self.value_function - value_targets)**2
vf_clipped = prev_vf_preds + tf.clip_by_value(
self.value_function - prev_vf_preds,
-config["clip_param"], config["clip_param"])
self.vf_loss2 = tf.square(vf_clipped - value_targets)
self.vf_loss2 = (vf_clipped - value_targets)**2
self.vf_loss = tf.minimum(self.vf_loss1, self.vf_loss2)
self.mean_vf_loss = tf.reduce_mean(self.vf_loss)
self.loss = tf.reduce_mean(
@@ -92,4 +92,6 @@ def get_initial_state(self):
return []

def loss(self):
# TODO(ericl): this method name (`loss`) conflicts with the `self.loss`
# attribute defined earlier in this file.
return self.loss
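The value-function objective above is a clipped squared error: the unclipped and clipped squared errors are both computed and the element-wise minimum is taken. A NumPy restatement with made-up numbers:

    import numpy as np

    clip_param = 0.3
    value_targets = np.array([1.0, 2.0])
    prev_vf_preds = np.array([0.8, 2.5])
    value_function = np.array([1.6, 1.0])

    vf_loss1 = (value_function - value_targets)**2
    vf_clipped = prev_vf_preds + np.clip(
        value_function - prev_vf_preds, -clip_param, clip_param)
    vf_loss2 = (vf_clipped - value_targets)**2
    vf_loss = np.minimum(vf_loss1, vf_loss2)   # element-wise minimum
    mean_vf_loss = np.mean(vf_loss)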
2 changes: 1 addition & 1 deletion python/ray/rllib/ppo/ppo.py
@@ -226,7 +226,7 @@ def standardized(value):
if kl > 2.0 * config["kl_target"]:
self.kl_coeff *= 1.5
elif kl < 0.5 * config["kl_target"]:
self.kl_coeff *= 0.5
self.kl_coeff /= 2

info = {
"kl_divergence": kl,
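The update above is PPO's adaptive KL-penalty rule: grow the coefficient when the measured KL overshoots the target, shrink it when it undershoots. A small self-contained sketch:

    def update_kl_coeff(kl_coeff, kl, kl_target):
        if kl > 2.0 * kl_target:
            return kl_coeff * 1.5   # penalize more
        elif kl < 0.5 * kl_target:
            return kl_coeff / 2     # penalize less
        return kl_coeff             # leave unchanged

    assert update_kl_coeff(1.0, kl=0.5, kl_target=0.1) == 1.5
    assert update_kl_coeff(1.0, kl=0.01, kl_target=0.1) == 0.5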
2 changes: 1 addition & 1 deletion python/ray/rllib/test/test_common_policy_evaluator.py
@@ -13,7 +13,7 @@


class MockPolicyGraph(PolicyGraph):
def compute_actions(self, obs_batch, state_batches, is_training=False):
def compute_actions(self, obs_batch, hidden_state_batches, is_training=False):
return [0] * len(obs_batch), [], {}

def postprocess_trajectory(self, batch):