From efe8de026e2431af8f91b164b1c3fd535c0722d0 Mon Sep 17 00:00:00 2001 From: sid-sundrani Date: Thu, 7 Jan 2021 12:50:49 +0530 Subject: [PATCH 01/18] add ppo rl lightning template --- .../domain_templates/reinforce_learn_ppo.py | 483 ++++++++++++++++++ 1 file changed, 483 insertions(+) create mode 100644 pl_examples/domain_templates/reinforce_learn_ppo.py diff --git a/pl_examples/domain_templates/reinforce_learn_ppo.py b/pl_examples/domain_templates/reinforce_learn_ppo.py new file mode 100644 index 0000000000000..c6b9650840026 --- /dev/null +++ b/pl_examples/domain_templates/reinforce_learn_ppo.py @@ -0,0 +1,483 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +PyTorch Lightning implementation of Proximal Policy Optimization (PPO) + +Paper authors: John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, Oleg Klimov + +The example implements PPO compatible to work with any continous or discrete action-space environments via OpenAI Gym. + +To run the template, just run: +`python reinforce_learn_ppo.py` + +References +---------- +[1] https://github.com/openai/baselines/blob/master/baselines/ppo2/ppo2.py +[2] https://github.com/openai/spinningup +""" +import argparse +from typing import List, Tuple, Callable, Iterable + +import pytorch_lightning as pl +from pl_examples import cli_lightning_logo + +import torch +from torch import nn +from torch.distributions import Categorical, Normal +from torch.utils.data import DataLoader, IterableDataset +import torch.optim as optim +from torch.optim.optimizer import Optimizer + +import numpy as np + +try: + import gym +except ModuleNotFoundError: + _GYM_AVAILABLE = False +else: + _GYM_AVAILABLE = True + + +def create_mlp(input_shape: Tuple[int], n_actions: int, hidden_size: int = 128): + """ + Simple Multi-Layer Perceptron network + """ + network = nn.Sequential( + nn.Linear(input_shape[0], hidden_size), + nn.ReLU(), + nn.Linear(hidden_size, hidden_size), + nn.ReLU(), + nn.Linear(hidden_size, n_actions) + ) + + return network + + +class ActorCategorical(nn.Module): + """ + Policy network, for discrete action spaces, which returns a distribution + and an action given an observation + """ + + def __init__(self, actor_net): + """ + Args: + input_shape: observation shape of the environment + n_actions: number of discrete actions available in the environment + """ + super().__init__() + + self.actor_net = actor_net + + def forward(self, states): + logits = self.actor_net(states) + pi = Categorical(logits=logits) + actions = pi.sample() + + return pi, actions + + def get_log_prob(self, pi: Categorical, actions: torch.Tensor): + """ + Takes in a distribution and actions and returns log prob of actions + under the distribution + Args: + pi: torch distribution + actions: actions taken by distribution + Returns: + log probability of the acition under pi + """ + return pi.log_prob(actions) + + +class ActorContinous(nn.Module): + """ + Policy network, for continous action spaces, which returns a distribution + and an 
action given an observation + """ + + def __init__(self, actor_net, act_dim): + """ + Args: + input_shape: observation shape of the environment + n_actions: number of discrete actions available in the environment + """ + super().__init__() + self.actor_net = actor_net + log_std = -0.5 * torch.ones(act_dim, dtype=torch.float) + self.log_std = nn.Parameter(log_std) + + def forward(self, states): + mu = self.actor_net(states) + std = torch.exp(self.log_std) + pi = Normal(loc=mu, scale=std) + actions = pi.sample() + + return pi, actions + + def get_log_prob(self, pi: Normal, actions: torch.Tensor): + """ + Takes in a distribution and actions and returns log prob of actions + under the distribution + Args: + pi: torch distribution + actions: actions taken by distribution + Returns: + log probability of the acition under pi + """ + return pi.log_prob(actions).sum(axis=-1) + + +class ExperienceSourceDataset(IterableDataset): + """ + Implementation from PyTorch Lightning Bolts: + https://github.com/PyTorchLightning/pytorch-lightning-bolts/blob/master/pl_bolts/datamodules/experience_source.py + + Basic experience source dataset. Takes a generate_batch function that returns an iterator. + The logic for the experience source and how the batch is generated is defined the Lightning model itself + """ + + def __init__(self, generate_batch: Callable): + self.generate_batch = generate_batch + + def __iter__(self) -> Iterable: + iterator = self.generate_batch() + return iterator + + +class PPOLightning(pl.LightningModule): + """ + PyTorch Lightning implementation of `PPO. + + Example: + model = PPOLightning("CartPole-v0") + Train: + trainer = Trainer() + trainer.fit(model) + """ + def __init__( + self, + env: str, + gamma: float = 0.99, + lam: float = 0.95, + lr_actor: float = 3e-4, + lr_critic: float = 1e-3, + max_episode_len: float = 200, + batch_size: int = 512, + steps_per_epoch: int = 2048, + nb_optim_iters: int = 4, + clip_ratio: float = 0.2, + ) -> None: + + """ + Args: + env: gym environment tag + gamma: discount factor + lam: advantage discount factor (lambda in the paper) + lr_actor: learning rate of actor network + lr_critic: learning rate of critic network + max_episode_len: maximum number interactions (actions) in an episode + batch_size: batch_size when training network- can simulate number of policy updates performed per epoch + steps_per_epoch: how many action-state pairs to rollout for trajectory collection per epoch + nb_optim_iters: how many steps of gradient descent to perform on each batch + clip_ratio: hyperparameter for clipping in the policy objective + """ + super().__init__() + + if not _GYM_AVAILABLE: + raise ModuleNotFoundError('This Module requires gym environment which is not installed yet.') + + # Hyperparameters + self.lr_actor = lr_actor + self.lr_critic = lr_critic + self.steps_per_epoch = steps_per_epoch + self.nb_optim_iters = nb_optim_iters + self.batch_size = batch_size + self.gamma = gamma + self.lam = lam + self.max_episode_len = max_episode_len + self.clip_ratio = clip_ratio + self.save_hyperparameters() + + self.env = gym.make(env) + # value network + self.critic = create_mlp(self.env.observation_space.shape, 1) + # policy network (agent) + if type(self.env.action_space) == gym.spaces.box.Box: + act_dim = self.env.action_space.shape[0] + actor_mlp = create_mlp(self.env.observation_space.shape, act_dim) + self.actor = ActorContinous(actor_mlp, act_dim) + elif type(self.env.action_space) == gym.spaces.discrete.Discrete: + actor_mlp = 
create_mlp(self.env.observation_space.shape, self.env.action_space.n) + self.actor = ActorCategorical(actor_mlp) + else: + raise NotImplementedError('Env action space should be of type Box (continous) or Discrete (categorical)' + 'Got type: ', type(self.env.action_space)) + + self.batch_states = [] + self.batch_actions = [] + self.batch_adv = [] + self.batch_qvals = [] + self.batch_logp = [] + + self.ep_rewards = [] + self.ep_values = [] + self.epoch_rewards = [] + + self.episode_step = 0 + self.avg_ep_reward = 0 + self.avg_ep_len = 0 + self.avg_reward = 0 + + self.state = torch.FloatTensor(self.env.reset()) + + def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Passes in a state x through the network and returns the policy and a sampled action + Args: + x: environment state + Returns: + Tuple of policy and action + """ + pi, action = self.actor(x) + value = self.critic(x) + + return pi, action, value + + def discount_rewards(self, rewards: List[float], discount: float) -> List[float]: + """Calculate the discounted rewards of all rewards in list + Args: + rewards: list of rewards/advantages + Returns: + list of discounted rewards/advantages + """ + assert isinstance(rewards[0], float) + + cumul_reward = [] + sum_r = 0.0 + + for r in reversed(rewards): + sum_r = (sum_r * discount) + r + cumul_reward.append(sum_r) + + return list(reversed(cumul_reward)) + + def calc_advantage(self, rewards: List[float], values: List[float], last_value: float) -> List[float]: + """Calculate the advantage given rewards, state values, and the last value of episode + Args: + rewards: list of episode rewards + values: list of state values from critic + last_value: value of last state of episode + Returns: + list of advantages + """ + rews = rewards + [last_value] + vals = values + [last_value] + # GAE + delta = [rews[i] + self.gamma * vals[i + 1] - vals[i] for i in range(len(rews) - 1)] + adv = self.discount_rewards(delta, self.gamma * self.lam) + + return adv + + def train_batch( + self, + ) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[torch.Tensor]]: + """ + Contains the logic for generating trajectory data to train policy and value network + Yield: + Tuple of Lists containing tensors for states, actions, log probs, qvals and advantage + """ + + for step in range(self.steps_per_epoch): + self.state = self.state.to(device=self.device) + + with torch.no_grad(): + pi, action, value = self(self.state) + log_prob = self.actor.get_log_prob(pi, action) + + next_state, reward, done, _ = self.env.step(action.cpu().numpy()) + + self.episode_step += 1 + + self.batch_states.append(self.state) + self.batch_actions.append(action) + self.batch_logp.append(log_prob) + + self.ep_rewards.append(reward) + self.ep_values.append(value.item()) + + self.state = torch.FloatTensor(next_state) + + epoch_end = step == (self.steps_per_epoch - 1) + terminal = len(self.ep_rewards) == self.max_episode_len + + if epoch_end or done or terminal: + # if trajectory ends abtruptly, boostrap value of next state + if (terminal or epoch_end) and not done: + self.state = self.state.to(device=self.device) + with torch.no_grad(): + _, _, value = self(self.state) + last_value = value.item() + steps_before_cutoff = self.episode_step + else: + last_value = 0 + steps_before_cutoff = 0 + + # discounted cumulative reward + self.batch_qvals += self.discount_rewards(self.ep_rewards + [last_value], self.gamma)[:-1] + # advantage + self.batch_adv += self.calc_advantage(self.ep_rewards, self.ep_values, 
last_value) + # logs + self.epoch_rewards.append(sum(self.ep_rewards)) + # reset params + self.ep_rewards = [] + self.ep_values = [] + self.episode_step = 0 + self.state = torch.FloatTensor(self.env.reset()) + + if epoch_end: + train_data = zip( + self.batch_states, self.batch_actions, self.batch_logp, + self.batch_qvals, self.batch_adv) + + for state, action, logp_old, qval, adv in train_data: + yield state, action, logp_old, qval, adv + + self.batch_states.clear() + self.batch_actions.clear() + self.batch_adv.clear() + self.batch_logp.clear() + self.batch_qvals.clear() + + # logging + self.avg_reward = sum(self.epoch_rewards) / self.steps_per_epoch + + # if epoch ended upbruptly, exlude last cut-short episode to prevent stats skewness + if not done: + total_epoch_reward = sum(self.epoch_rewards[:-1]) + nb_episodes = len(self.epoch_rewards) - 1 + else: + total_epoch_reward = sum(self.epoch_rewards) + nb_episodes = len(self.epoch_rewards) + + self.avg_ep_reward = total_epoch_reward / nb_episodes + self.avg_ep_len = (self.steps_per_epoch - steps_before_cutoff) / nb_episodes + + self.epoch_rewards.clear() + + def actor_loss(self, state, action, logp_old, qval, adv) -> torch.Tensor: + pi, _ = self.actor(state) + logp = self.actor.get_log_prob(pi, action) + ratio = torch.exp(logp - logp_old) + clip_adv = torch.clamp(ratio, 1 - self.clip_ratio, 1 + self.clip_ratio) * adv + loss_actor = -(torch.min(ratio * adv, clip_adv)).mean() + return loss_actor + + def critic_loss(self, state, action, logp_old, qval, adv) -> torch.Tensor: + value = self.critic(state) + loss_critic = (qval - value).pow(2).mean() + return loss_critic + + def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], batch_idx, optimizer_idx): + """ + Carries out a single update to actor and critic network from a batch of replay buffer. + + Args: + batch: batch of replay buffer/trajectory data + batch_idx: not used + optimizer_idx: idx that controls optimizing actor or critic network + Returns: + loss + """ + state, action, old_logp, qval, adv = batch + + # normalize advantages + adv = (adv - adv.mean()) / adv.std() + + self.log("avg_ep_len", self.avg_ep_len, prog_bar=True, on_step=False, on_epoch=True) + self.log("avg_ep_reward", self.avg_ep_reward, prog_bar=True, on_step=False, on_epoch=True) + self.log("avg_reward", self.avg_reward, prog_bar=True, on_step=False, on_epoch=True) + + if optimizer_idx == 0: + loss_actor = self.actor_loss(state, action, old_logp, qval, adv) + self.log('loss_actor', loss_actor, on_step=False, on_epoch=True, prog_bar=True, logger=True) + + return loss_actor + + elif optimizer_idx == 1: + loss_critic = self.critic_loss(state, action, old_logp, qval, adv) + self.log('loss_critic', loss_critic, on_step=False, on_epoch=True, prog_bar=False, logger=True) + + return loss_critic + + def configure_optimizers(self) -> List[Optimizer]: + """ Initialize Adam optimizer""" + optimizer_actor = optim.Adam(self.actor.parameters(), lr=self.lr_actor) + optimizer_critic = optim.Adam(self.critic.parameters(), lr=self.lr_critic) + + return optimizer_actor, optimizer_critic + + def optimizer_step(self, *args, **kwargs): + """ + Run 'nb_optim_iters' number of iterations of gradient descent on actor and critic + for each data sample. 
+ """ + for i in range(self.nb_optim_iters): + super().optimizer_step(*args, **kwargs) + + def _dataloader(self) -> DataLoader: + """Initialize the Replay Buffer dataset used for retrieving experiences""" + dataset = ExperienceSourceDataset(self.train_batch) + dataloader = DataLoader(dataset=dataset, batch_size=self.batch_size) + return dataloader + + def train_dataloader(self) -> DataLoader: + """Get train loader""" + return self._dataloader() + + @staticmethod + def add_model_specific_args(parent_parser): # pragma: no-cover + parser = argparse.ArgumentParser(parents=[parent_parser]) + parser.add_argument("--env", type=str, default="CartPole-v0") + parser.add_argument("--gamma", type=float, default=0.99, help="discount factor") + parser.add_argument("--lam", type=float, default=0.95, help="advantage discount factor") + parser.add_argument("--lr_actor", type=float, default=3e-4, help="learning rate of actor network") + parser.add_argument("--lr_critic", type=float, default=1e-3, help="learning rate of critic network") + parser.add_argument("--max_episode_len", type=int, default=1000, help="capacity of the replay buffer") + parser.add_argument("--batch_size", type=int, default=512, help="batch_size when training network") + parser.add_argument("--steps_per_epoch", type=int, default=2048, + help="how many action-state pairs to rollout for trajectory collection per epoch") + parser.add_argument("--nb_optim_iters", type=int, default=4, + help="how many steps of gradient descent to perform on each batch") + parser.add_argument("--clip_ratio", type=float, default=0.2, help="hyperparameter for clipping in the policy objective") + + return parser + + +def main(args) -> None: + model = PPOLightning(**vars(args)) + + trainer = pl.Trainer(max_epochs=70) + trainer.fit(model) + + +if __name__ == '__main__': + cli_lightning_logo() + torch.manual_seed(0) + np.random.seed(0) + + parser = argparse.ArgumentParser(add_help=False) + parser = PPOLightning.add_model_specific_args(parser) + args = parser.parse_args() + + main(args) From b72b19f073c53c57bdb1d6824fc8f9916cba7de8 Mon Sep 17 00:00:00 2001 From: sid-sundrani Date: Thu, 7 Jan 2021 14:27:58 +0530 Subject: [PATCH 02/18] flake --- pl_examples/domain_templates/reinforce_learn_ppo.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pl_examples/domain_templates/reinforce_learn_ppo.py b/pl_examples/domain_templates/reinforce_learn_ppo.py index c6b9650840026..a69cb0d70bed8 100644 --- a/pl_examples/domain_templates/reinforce_learn_ppo.py +++ b/pl_examples/domain_templates/reinforce_learn_ppo.py @@ -459,7 +459,8 @@ def add_model_specific_args(parent_parser): # pragma: no-cover help="how many action-state pairs to rollout for trajectory collection per epoch") parser.add_argument("--nb_optim_iters", type=int, default=4, help="how many steps of gradient descent to perform on each batch") - parser.add_argument("--clip_ratio", type=float, default=0.2, help="hyperparameter for clipping in the policy objective") + parser.add_argument("--clip_ratio", type=float, default=0.2, + help="hyperparameter for clipping in the policy objective") return parser From cf80306cecc45254371cb22bf1cc8328188cf6b4 Mon Sep 17 00:00:00 2001 From: sid-sundrani Date: Thu, 7 Jan 2021 14:42:17 +0530 Subject: [PATCH 03/18] import gym without try as in qnet example --- pl_examples/domain_templates/reinforce_learn_ppo.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/pl_examples/domain_templates/reinforce_learn_ppo.py 
b/pl_examples/domain_templates/reinforce_learn_ppo.py index a69cb0d70bed8..4dd52345a6ec2 100644 --- a/pl_examples/domain_templates/reinforce_learn_ppo.py +++ b/pl_examples/domain_templates/reinforce_learn_ppo.py @@ -32,6 +32,7 @@ import pytorch_lightning as pl from pl_examples import cli_lightning_logo +import gym import torch from torch import nn from torch.distributions import Categorical, Normal @@ -41,13 +42,6 @@ import numpy as np -try: - import gym -except ModuleNotFoundError: - _GYM_AVAILABLE = False -else: - _GYM_AVAILABLE = True - def create_mlp(input_shape: Tuple[int], n_actions: int, hidden_size: int = 128): """ @@ -194,9 +188,6 @@ def __init__( """ super().__init__() - if not _GYM_AVAILABLE: - raise ModuleNotFoundError('This Module requires gym environment which is not installed yet.') - # Hyperparameters self.lr_actor = lr_actor self.lr_critic = lr_critic From 4ff2eb781d8c8cefbf43c8c874a3b33c85ac5970 Mon Sep 17 00:00:00 2001 From: sid-sundrani Date: Thu, 7 Jan 2021 14:43:24 +0530 Subject: [PATCH 04/18] fix import format --- pl_examples/domain_templates/reinforce_learn_ppo.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pl_examples/domain_templates/reinforce_learn_ppo.py b/pl_examples/domain_templates/reinforce_learn_ppo.py index 4dd52345a6ec2..68fe43057342b 100644 --- a/pl_examples/domain_templates/reinforce_learn_ppo.py +++ b/pl_examples/domain_templates/reinforce_learn_ppo.py @@ -33,6 +33,7 @@ from pl_examples import cli_lightning_logo import gym +import numpy as np import torch from torch import nn from torch.distributions import Categorical, Normal @@ -40,8 +41,6 @@ import torch.optim as optim from torch.optim.optimizer import Optimizer -import numpy as np - def create_mlp(input_shape: Tuple[int], n_actions: int, hidden_size: int = 128): """ From 0dd001758487560435f0189373249efe6967b77c Mon Sep 17 00:00:00 2001 From: sid-sundrani Date: Thu, 7 Jan 2021 14:54:53 +0530 Subject: [PATCH 05/18] remove torch.optim import, not required --- pl_examples/domain_templates/reinforce_learn_ppo.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pl_examples/domain_templates/reinforce_learn_ppo.py b/pl_examples/domain_templates/reinforce_learn_ppo.py index 68fe43057342b..986f7675b45c0 100644 --- a/pl_examples/domain_templates/reinforce_learn_ppo.py +++ b/pl_examples/domain_templates/reinforce_learn_ppo.py @@ -38,7 +38,6 @@ from torch import nn from torch.distributions import Categorical, Normal from torch.utils.data import DataLoader, IterableDataset -import torch.optim as optim from torch.optim.optimizer import Optimizer @@ -412,8 +411,8 @@ def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], batch_idx, opt def configure_optimizers(self) -> List[Optimizer]: """ Initialize Adam optimizer""" - optimizer_actor = optim.Adam(self.actor.parameters(), lr=self.lr_actor) - optimizer_critic = optim.Adam(self.critic.parameters(), lr=self.lr_critic) + optimizer_actor = torch.optim.Adam(self.actor.parameters(), lr=self.lr_actor) + optimizer_critic = torch.optim.Adam(self.critic.parameters(), lr=self.lr_critic) return optimizer_actor, optimizer_critic From 2978369ef6cfc80aa0c6343a071d0f21e55b14d0 Mon Sep 17 00:00:00 2001 From: sid-sundrani Date: Thu, 7 Jan 2021 14:56:11 +0530 Subject: [PATCH 06/18] fix import format isort --- pl_examples/domain_templates/reinforce_learn_ppo.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pl_examples/domain_templates/reinforce_learn_ppo.py 
b/pl_examples/domain_templates/reinforce_learn_ppo.py index 986f7675b45c0..2124166c73601 100644 --- a/pl_examples/domain_templates/reinforce_learn_ppo.py +++ b/pl_examples/domain_templates/reinforce_learn_ppo.py @@ -27,18 +27,18 @@ [2] https://github.com/openai/spinningup """ import argparse -from typing import List, Tuple, Callable, Iterable - -import pytorch_lightning as pl -from pl_examples import cli_lightning_logo +from typing import Callable, Iterable, List, Tuple import gym import numpy as np import torch from torch import nn from torch.distributions import Categorical, Normal -from torch.utils.data import DataLoader, IterableDataset from torch.optim.optimizer import Optimizer +from torch.utils.data import DataLoader, IterableDataset + +import pytorch_lightning as pl +from pl_examples import cli_lightning_logo def create_mlp(input_shape: Tuple[int], n_actions: int, hidden_size: int = 128): From f92678532b9786c4adfe851fcd02b3e156ba72f7 Mon Sep 17 00:00:00 2001 From: sid-sundrani Date: Thu, 7 Jan 2021 22:01:12 +0530 Subject: [PATCH 07/18] add trainer argparse --- pl_examples/domain_templates/reinforce_learn_ppo.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pl_examples/domain_templates/reinforce_learn_ppo.py b/pl_examples/domain_templates/reinforce_learn_ppo.py index 2124166c73601..b70249fbf6b64 100644 --- a/pl_examples/domain_templates/reinforce_learn_ppo.py +++ b/pl_examples/domain_templates/reinforce_learn_ppo.py @@ -169,6 +169,7 @@ def __init__( steps_per_epoch: int = 2048, nb_optim_iters: int = 4, clip_ratio: float = 0.2, + **kwargs, ) -> None: """ @@ -457,7 +458,7 @@ def add_model_specific_args(parent_parser): # pragma: no-cover def main(args) -> None: model = PPOLightning(**vars(args)) - trainer = pl.Trainer(max_epochs=70) + trainer = pl.Trainer.from_argparse_args(args) trainer.fit(model) @@ -466,8 +467,10 @@ def main(args) -> None: torch.manual_seed(0) np.random.seed(0) - parser = argparse.ArgumentParser(add_help=False) - parser = PPOLightning.add_model_specific_args(parser) + parent_parser = argparse.ArgumentParser(add_help=False) + parent_parser = pl.Trainer.add_argparse_args(parent_parser) + + parser = PPOLightning.add_model_specific_args(parent_parser) args = parser.parse_args() main(args) From 5a9c1689d6cc38c11cc8c20dd2041e4cef27a2d6 Mon Sep 17 00:00:00 2001 From: sid-sundrani Date: Thu, 7 Jan 2021 22:02:58 +0530 Subject: [PATCH 08/18] change name of trajectory collection method --- pl_examples/domain_templates/reinforce_learn_ppo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pl_examples/domain_templates/reinforce_learn_ppo.py b/pl_examples/domain_templates/reinforce_learn_ppo.py index b70249fbf6b64..6e1591174367a 100644 --- a/pl_examples/domain_templates/reinforce_learn_ppo.py +++ b/pl_examples/domain_templates/reinforce_learn_ppo.py @@ -279,7 +279,7 @@ def calc_advantage(self, rewards: List[float], values: List[float], last_value: return adv - def train_batch( + def generate_trajectory_samples( self, ) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[torch.Tensor]]: """ @@ -427,7 +427,7 @@ def optimizer_step(self, *args, **kwargs): def _dataloader(self) -> DataLoader: """Initialize the Replay Buffer dataset used for retrieving experiences""" - dataset = ExperienceSourceDataset(self.train_batch) + dataset = ExperienceSourceDataset(self.generate_trajectory_samples) dataloader = DataLoader(dataset=dataset, batch_size=self.batch_size) return dataloader From 70ff08039bf7847fae437a4658f736f18e448347 Mon Sep 17 
00:00:00 2001 From: sid-sundrani Date: Thu, 7 Jan 2021 22:09:17 +0530 Subject: [PATCH 09/18] add repo in references --- pl_examples/domain_templates/reinforce_learn_ppo.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pl_examples/domain_templates/reinforce_learn_ppo.py b/pl_examples/domain_templates/reinforce_learn_ppo.py index 6e1591174367a..38d30fa026505 100644 --- a/pl_examples/domain_templates/reinforce_learn_ppo.py +++ b/pl_examples/domain_templates/reinforce_learn_ppo.py @@ -25,6 +25,7 @@ ---------- [1] https://github.com/openai/baselines/blob/master/baselines/ppo2/ppo2.py [2] https://github.com/openai/spinningup +[3] https://github.com/sid-sundrani/ppo_lightning """ import argparse from typing import Callable, Iterable, List, Tuple From 4e279c6406b364f7b9b5ec6beedcc62a5185bacc Mon Sep 17 00:00:00 2001 From: sid-sundrani Date: Thu, 7 Jan 2021 22:22:13 +0530 Subject: [PATCH 10/18] fix typo in comments --- pl_examples/domain_templates/reinforce_learn_ppo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pl_examples/domain_templates/reinforce_learn_ppo.py b/pl_examples/domain_templates/reinforce_learn_ppo.py index 38d30fa026505..bc3ac0143932e 100644 --- a/pl_examples/domain_templates/reinforce_learn_ppo.py +++ b/pl_examples/domain_templates/reinforce_learn_ppo.py @@ -150,7 +150,7 @@ def __iter__(self) -> Iterable: class PPOLightning(pl.LightningModule): """ - PyTorch Lightning implementation of `PPO. + PyTorch Lightning implementation of PPO. Example: model = PPOLightning("CartPole-v0") From 6e5029dca23a0cb4447f4a76f3fe82602747a80b Mon Sep 17 00:00:00 2001 From: sid-sundrani Date: Fri, 8 Jan 2021 15:55:24 +0530 Subject: [PATCH 11/18] use isinstance to verify actionspace type --- pl_examples/domain_templates/reinforce_learn_ppo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pl_examples/domain_templates/reinforce_learn_ppo.py b/pl_examples/domain_templates/reinforce_learn_ppo.py index bc3ac0143932e..cda9c5b3aaa6e 100644 --- a/pl_examples/domain_templates/reinforce_learn_ppo.py +++ b/pl_examples/domain_templates/reinforce_learn_ppo.py @@ -204,11 +204,11 @@ def __init__( # value network self.critic = create_mlp(self.env.observation_space.shape, 1) # policy network (agent) - if type(self.env.action_space) == gym.spaces.box.Box: + if isinstance(self.env.action_space, gym.spaces.box.Box): act_dim = self.env.action_space.shape[0] actor_mlp = create_mlp(self.env.observation_space.shape, act_dim) self.actor = ActorContinous(actor_mlp, act_dim) - elif type(self.env.action_space) == gym.spaces.discrete.Discrete: + elif isinstance(self.env.action_space, gym.spaces.discrete.Discrete): actor_mlp = create_mlp(self.env.observation_space.shape, self.env.action_space.n) self.actor = ActorCategorical(actor_mlp) else: From fff6f1644981f3a137c1aba85c15bb0e22c029b5 Mon Sep 17 00:00:00 2001 From: sid-sundrani Date: Fri, 8 Jan 2021 16:07:47 +0530 Subject: [PATCH 12/18] use fstring --- pl_examples/domain_templates/reinforce_learn_ppo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pl_examples/domain_templates/reinforce_learn_ppo.py b/pl_examples/domain_templates/reinforce_learn_ppo.py index cda9c5b3aaa6e..c81ffc33bc57b 100644 --- a/pl_examples/domain_templates/reinforce_learn_ppo.py +++ b/pl_examples/domain_templates/reinforce_learn_ppo.py @@ -213,7 +213,7 @@ def __init__( self.actor = ActorCategorical(actor_mlp) else: raise NotImplementedError('Env action space should be of type Box (continous) or Discrete (categorical)' - 'Got type: ', 
type(self.env.action_space)) + f'Got type: {type(self.env.action_space)}') self.batch_states = [] self.batch_actions = [] From 17a94e013c9cc5c4087abe9e62137db688fa611e Mon Sep 17 00:00:00 2001 From: sid-sundrani Date: Fri, 8 Jan 2021 16:15:44 +0530 Subject: [PATCH 13/18] deduplication of logic code --- pl_examples/domain_templates/reinforce_learn_ppo.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pl_examples/domain_templates/reinforce_learn_ppo.py b/pl_examples/domain_templates/reinforce_learn_ppo.py index c81ffc33bc57b..66055d48d1ac1 100644 --- a/pl_examples/domain_templates/reinforce_learn_ppo.py +++ b/pl_examples/domain_templates/reinforce_learn_ppo.py @@ -354,12 +354,12 @@ def generate_trajectory_samples( self.avg_reward = sum(self.epoch_rewards) / self.steps_per_epoch # if epoch ended upbruptly, exlude last cut-short episode to prevent stats skewness + epoch_rewards = self.epoch_rewards if not done: - total_epoch_reward = sum(self.epoch_rewards[:-1]) - nb_episodes = len(self.epoch_rewards) - 1 - else: - total_epoch_reward = sum(self.epoch_rewards) - nb_episodes = len(self.epoch_rewards) + epoch_rewards = epoch_rewards[:-1] + + total_epoch_reward = sum(epoch_rewards) + nb_episodes = len(epoch_rewards) self.avg_ep_reward = total_epoch_reward / nb_episodes self.avg_ep_len = (self.steps_per_epoch - steps_before_cutoff) / nb_episodes From 906344232fcd635d23c72a01f608d5735067d21a Mon Sep 17 00:00:00 2001 From: sid-sundrani Date: Fri, 8 Jan 2021 16:17:04 +0530 Subject: [PATCH 14/18] rename unused forloop variable --- pl_examples/domain_templates/reinforce_learn_ppo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pl_examples/domain_templates/reinforce_learn_ppo.py b/pl_examples/domain_templates/reinforce_learn_ppo.py index 66055d48d1ac1..e74c254b3d33a 100644 --- a/pl_examples/domain_templates/reinforce_learn_ppo.py +++ b/pl_examples/domain_templates/reinforce_learn_ppo.py @@ -423,7 +423,7 @@ def optimizer_step(self, *args, **kwargs): Run 'nb_optim_iters' number of iterations of gradient descent on actor and critic for each data sample. 
""" - for i in range(self.nb_optim_iters): + for _ in range(self.nb_optim_iters): super().optimizer_step(*args, **kwargs) def _dataloader(self) -> DataLoader: From 24e7963519100358b5f4fe6d6014fc6cc4ad72b4 Mon Sep 17 00:00:00 2001 From: sid-sundrani Date: Fri, 8 Jan 2021 16:20:05 +0530 Subject: [PATCH 15/18] use pl.seed_everything instead --- pl_examples/domain_templates/reinforce_learn_ppo.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pl_examples/domain_templates/reinforce_learn_ppo.py b/pl_examples/domain_templates/reinforce_learn_ppo.py index e74c254b3d33a..7ea9be8ee9281 100644 --- a/pl_examples/domain_templates/reinforce_learn_ppo.py +++ b/pl_examples/domain_templates/reinforce_learn_ppo.py @@ -465,8 +465,7 @@ def main(args) -> None: if __name__ == '__main__': cli_lightning_logo() - torch.manual_seed(0) - np.random.seed(0) + pl.seed_everything(0) parent_parser = argparse.ArgumentParser(add_help=False) parent_parser = pl.Trainer.add_argparse_args(parent_parser) From 6b8281efbe935e92d2f37a069abc02396fa48311 Mon Sep 17 00:00:00 2001 From: sid-sundrani Date: Fri, 8 Jan 2021 16:23:59 +0530 Subject: [PATCH 16/18] remove unused numpy import --- pl_examples/domain_templates/reinforce_learn_ppo.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pl_examples/domain_templates/reinforce_learn_ppo.py b/pl_examples/domain_templates/reinforce_learn_ppo.py index 7ea9be8ee9281..1367dbc522f83 100644 --- a/pl_examples/domain_templates/reinforce_learn_ppo.py +++ b/pl_examples/domain_templates/reinforce_learn_ppo.py @@ -31,7 +31,6 @@ from typing import Callable, Iterable, List, Tuple import gym -import numpy as np import torch from torch import nn from torch.distributions import Categorical, Normal From 70501520e3b843460350ee7d810367d6c94c5464 Mon Sep 17 00:00:00 2001 From: sid-sundrani Date: Fri, 8 Jan 2021 16:25:01 +0530 Subject: [PATCH 17/18] format string printed on error --- pl_examples/domain_templates/reinforce_learn_ppo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pl_examples/domain_templates/reinforce_learn_ppo.py b/pl_examples/domain_templates/reinforce_learn_ppo.py index 1367dbc522f83..00c8fbc1c97d2 100644 --- a/pl_examples/domain_templates/reinforce_learn_ppo.py +++ b/pl_examples/domain_templates/reinforce_learn_ppo.py @@ -211,7 +211,7 @@ def __init__( actor_mlp = create_mlp(self.env.observation_space.shape, self.env.action_space.n) self.actor = ActorCategorical(actor_mlp) else: - raise NotImplementedError('Env action space should be of type Box (continous) or Discrete (categorical)' + raise NotImplementedError('Env action space should be of type Box (continous) or Discrete (categorical). 
'
                                       f'Got type: {type(self.env.action_space)}')
 
         self.batch_states = []

From 809ea73883888e4f6475d063a3fe8ec4a971aad1 Mon Sep 17 00:00:00 2001
From: Sidhant Sundrani
Date: Sat, 9 Jan 2021 15:43:41 +0530
Subject: [PATCH 18/18] fix typo in comments

---
 pl_examples/domain_templates/reinforce_learn_ppo.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pl_examples/domain_templates/reinforce_learn_ppo.py b/pl_examples/domain_templates/reinforce_learn_ppo.py
index 00c8fbc1c97d2..78c4354ef91c2 100644
--- a/pl_examples/domain_templates/reinforce_learn_ppo.py
+++ b/pl_examples/domain_templates/reinforce_learn_ppo.py
@@ -352,7 +352,7 @@ def generate_trajectory_samples(
         # logging
         self.avg_reward = sum(self.epoch_rewards) / self.steps_per_epoch
 
-        # if epoch ended upbruptly, exlude last cut-short episode to prevent stats skewness
+        # if epoch ended abruptly, exclude last cut-short episode to prevent stats skewness
         epoch_rewards = self.epoch_rewards
         if not done:
             epoch_rewards = epoch_rewards[:-1]
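As a companion to the template above, here is a minimal standalone sketch of the two pieces of arithmetic the patches rely on: the discounted-return/GAE computation behind `discount_rewards`/`calc_advantage`, and the clipped surrogate term behind `actor_loss`. It is an independent rewrite for illustration, not part of the patch series; the toy episode numbers are invented, and `gamma=0.99`, `lam=0.95`, `clip_ratio=0.2` simply mirror the template's defaults.

from math import exp
from typing import List


def discount_rewards(rewards: List[float], discount: float) -> List[float]:
    """Right-to-left discounted cumulative sum, as in the LightningModule."""
    out, running = [], 0.0
    for r in reversed(rewards):
        running = running * discount + r
        out.append(running)
    return list(reversed(out))


def calc_advantage(rewards: List[float], values: List[float], last_value: float,
                   gamma: float = 0.99, lam: float = 0.95) -> List[float]:
    """GAE: discount TD residuals delta_t = r_t + gamma*V(s_{t+1}) - V(s_t) by gamma*lam."""
    rews = rewards + [last_value]
    vals = values + [last_value]
    deltas = [rews[i] + gamma * vals[i + 1] - vals[i] for i in range(len(rews) - 1)]
    return discount_rewards(deltas, gamma * lam)


def clipped_surrogate(logp: float, logp_old: float, adv: float, clip_ratio: float = 0.2) -> float:
    """Per-sample PPO-clip objective term; the actor loss is the negative batch mean of these."""
    ratio = exp(logp - logp_old)
    clipped = max(min(ratio, 1.0 + clip_ratio), 1.0 - clip_ratio) * adv
    return min(ratio * adv, clipped)


if __name__ == "__main__":
    # toy 4-step episode (invented numbers): constant reward, rough critic values,
    # and a non-zero last_value as if the episode were cut short and bootstrapped
    rewards = [1.0, 1.0, 1.0, 1.0]
    values = [0.5, 0.6, 0.7, 0.8]
    last_value = 0.9

    qvals = discount_rewards(rewards + [last_value], 0.99)[:-1]  # targets for the critic
    advs = calc_advantage(rewards, values, last_value)           # weights for the actor
    print("qvals:", qvals)
    print("advantages:", advs)

    # a policy update that raises the action's log-prob a little is kept as-is,
    # while a large jump is clipped at (1 + clip_ratio) * advantage
    print("surrogate (small step):", clipped_surrogate(-0.9, -1.0, advs[0]))
    print("surrogate (large step):", clipped_surrogate(-0.2, -1.0, advs[0]))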