diff --git a/.github/workflows/ci_test-base.yml b/.github/workflows/ci_test-base.yml index 285d97b0e5..a6d808a816 100644 --- a/.github/workflows/ci_test-base.yml +++ b/.github/workflows/ci_test-base.yml @@ -41,14 +41,14 @@ jobs: uses: actions/cache@v2 with: path: ${{ steps.pip-cache.outputs.dir }} - key: ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}-pip-${{ hashFiles('requirements/base.txt') }} + key: ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}-pip-${{ hashFiles('requirements.txt') }} restore-keys: | ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}-pip- - name: Install dependencies run: | python -m pip install --upgrade --user pip - pip install --requirement ./requirements/base.txt --quiet --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade + pip install --requirement ./requirements.txt --quiet --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade pip install --requirement ./requirements/test.txt --quiet --upgrade-strategy only-if-needed # pip install tox coverage python --version @@ -66,7 +66,7 @@ jobs: - name: Test Package [only] run: | # NOTE: run coverage on tests does not propagare faler status for Win, https://github.com/nedbat/coveragepy/issues/1003 - coverage run --source pl_bolts -m pytest pl_bolts -v --junitxml=junit/test-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}.xml --ignore=pl_bolts/datamodules --ignore=pl_bolts/models/self_supervised/amdim/transforms.py + coverage run --source pl_bolts -m pytest pl_bolts -v --junitxml=junit/test-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}.xml --ignore=pl_bolts/datamodules --ignore=pl_bolts/models/self_supervised/amdim/transforms.py --ignore=pl_bolts/models/rl - name: Upload pytest test results uses: actions/upload-artifact@master diff --git a/.github/workflows/ci_test-full.yml b/.github/workflows/ci_test-full.yml index f4b8f72d88..ba7a661a69 100644 --- a/.github/workflows/ci_test-full.yml +++ b/.github/workflows/ci_test-full.yml @@ -45,7 +45,7 @@ jobs: - name: Set min. 
dependencies if: matrix.requires == 'minimal' run: | - python -c "fpath = 'requirements/base.txt' ; req = open(fpath).read().replace('>=', '==') ; open(fpath, 'w').write(req)" + python -c "fpath = 'requirements.txt' ; req = open(fpath).read().replace('>=', '==') ; open(fpath, 'w').write(req)" python -c "fpath = 'requirements/models.txt' ; req = open(fpath).read().replace('>=', '==') ; open(fpath, 'w').write(req)" python -c "fpath = 'requirements/loggers.txt' ; req = open(fpath).read().replace('>=', '==') ; open(fpath, 'w').write(req)" python -c "fpath = 'requirements/test.txt' ; req = open(fpath).read().replace('>=', '==') ; open(fpath, 'w').write(req)" @@ -61,7 +61,7 @@ jobs: uses: actions/cache@v2 with: path: ${{ steps.pip-cache.outputs.dir }} - key: ${{ runner.os }}-pip-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ hashFiles('requirements/base.txt') }}-${{ hashFiles('requirements/modules.txt') }} + key: ${{ runner.os }}-pip-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements/modules.txt') }} restore-keys: | ${{ runner.os }}-pip-py${{ matrix.python-version }}-${{ matrix.requires }}- diff --git a/.github/workflows/code-format.yml b/.github/workflows/code-format.yml index 62f19d3001..813ee1b862 100644 --- a/.github/workflows/code-format.yml +++ b/.github/workflows/code-format.yml @@ -23,14 +23,14 @@ jobs: uses: actions/cache@v2 with: path: ~/.cache/pip - key: ${{ runner.os }}-pip-${{ hashFiles('requirements/base.txt') }} + key: ${{ runner.os }}-pip-${{ hashFiles('requirements.txt') }} restore-keys: | ${{ runner.os }}-pip- - name: Install dependencies run: | # python -m pip install --upgrade --user pip - pip install -r requirements/base.txt -U -f https://download.pytorch.org/whl/torch_stable.html -q + pip install -r requirements.txt -U -f https://download.pytorch.org/whl/torch_stable.html -q pip install flake8 python --version pip --version diff --git a/.github/workflows/docs-check.yml b/.github/workflows/docs-check.yml index 4de81bdc27..4aaac0d41e 100644 --- a/.github/workflows/docs-check.yml +++ b/.github/workflows/docs-check.yml @@ -36,7 +36,7 @@ jobs: # uses: actions/cache@v2 # with: # path: ~/.cache/pip -# key: ${{ runner.os }}-pip-${{ hashFiles('requirements/base.txt') }} +# key: ${{ runner.os }}-pip-${{ hashFiles('requirements.txt') }} # restore-keys: | # ${{ runner.os }}-pip- # @@ -75,13 +75,13 @@ jobs: uses: actions/cache@v2 with: path: ~/.cache/pip - key: ${{ runner.os }}-pip-${{ hashFiles('requirements/base.txt') }} + key: ${{ runner.os }}-pip-${{ hashFiles('requirements.txt') }} restore-keys: | ${{ runner.os }}-pip- - name: Install dependencies run: | - pip install --requirement requirements/base.txt --upgrade-strategy only-if-needed --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --quiet + pip install --requirement requirements.txt --upgrade-strategy only-if-needed --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --quiet pip install --requirement docs/requirements.txt # install Texlive, see https://linuxconfig.org/how-to-install-latex-on-ubuntu-20-04-focal-fossa-linux sudo apt-get update && sudo apt-get install -y texlive-latex-extra dvipng texlive-pictures diff --git a/.gitignore b/.gitignore index 2c179cd36b..96f7417d80 100644 --- a/.gitignore +++ b/.gitignore @@ -138,7 +138,6 @@ MNIST # Lightning logs lightning_logs -datasets *.gz *-batches-py simclr.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 4d28abc99a..fea424b9d0 100644 --- a/CHANGELOG.md +++ 
b/CHANGELOG.md @@ -31,6 +31,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added Linear Regression - Added Moco2g - Added simclr +- Added RL module - Added Loggers - Added Transforms - Added Tiny Datasets @@ -42,12 +43,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed +- Device is no longer set in the DQN model init +- Moved RL loss function to the losses module +- Moved rl.common.experience to datamodules - train_batch function to VPG model to generate batch of data at each step (POC) - Experience source no longer gets initialized with a device, instead the device is passed at each step() - Refactored ExperienceSource classes to be handle multiple environments. ### Removed +- Removed N-Step DQN as the latest version of the DQN supports N-Step by setting the `n_step` arg to n - Deprecated common.experience ### Fixed diff --git a/MANIFEST.in b/MANIFEST.in index d3f4c4f33d..e306b2618d 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -25,6 +25,7 @@ recursive-exclude docs * exclude docs # Include the Requirements +include requirements.txt recursive-include requirements *.txt # Exclude build configs diff --git a/README.md b/README.md index 5e905ce783..28f398f7cc 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,7 @@ Install bleeding-edge (no guarantees) pip install git+https://github.com/PytorchLightning/pytorch-lightning-bolts.git@master --upgrade ``` -In case you wan to have full experience you can install all optional packages at once +In case you want to have full experience you can install all optional packages at once ```bash pip install pytorch-lightning-bolts["extra"] ``` diff --git a/docs/source/classic_ml.rst b/docs/source/classic_ml.rst index d3b3c39712..8a1f8b3aa7 100644 --- a/docs/source/classic_ml.rst +++ b/docs/source/classic_ml.rst @@ -9,7 +9,7 @@ half-precision training. Linear Regression ----------------- Linear regression fits a linear model between a real-valued target variable :math:`y` and one or more features :math:`X`. We -estimate the regression coefficients that minimizes the mean squared error between the predicted and true target +estimate the regression coefficients that minimize the mean squared error between the predicted and true target values. We formulate the linear regression model as a single-layer neural network. By default we include only one neuron in @@ -69,7 +69,7 @@ Add either L1 or L2 regularization, or both, by specifying the regularization st trainer.test(test_dataloaders=dm.test_dataloader(batch_size=12)) -Any input will be flattened across all dimensions except the firs one (batch). +Any input will be flattened across all dimensions except the first one (batch). This means images, sound, etc... work out of the box. .. 
code-block:: python diff --git a/docs/source/conf.py b/docs/source/conf.py index f2dc1442d9..7114016ba1 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -328,7 +328,7 @@ def package_list_from_file(file): MOCK_PACKAGES = [] if SPHINX_MOCK_REQUIREMENTS: # mock also base packages when we are on RTD since we don't install them there - MOCK_PACKAGES += package_list_from_file(os.path.join(PATH_ROOT, 'requirements', 'base.txt')) + MOCK_PACKAGES += package_list_from_file(os.path.join(PATH_ROOT, 'requirements.txt')) MOCK_PACKAGES += package_list_from_file(os.path.join(PATH_ROOT, 'requirements', 'models.txt')) MOCK_PACKAGES += package_list_from_file(os.path.join(PATH_ROOT, 'requirements', 'loggers.txt')) diff --git a/docs/source/dataloaders.rst b/docs/source/dataloaders.rst index efe932027b..4101003ba7 100644 --- a/docs/source/dataloaders.rst +++ b/docs/source/dataloaders.rst @@ -3,7 +3,10 @@ AsynchronousLoader This dataloader behaves identically to the standard pytorch dataloader, but will transfer data asynchronously to the GPU with training. You can also use it to wrap an existing dataloader. -Example:: +Example: + +.. code-block:: python + dataloader = AsynchronousLoader(DataLoader(ds, batch_size=16), device=device) for b in dataloader: @@ -11,11 +14,3 @@ Example:: .. autoclass:: pl_bolts.datamodules.async_dataloader.AsynchronousLoader :noindex: - ------------------- - -DummyDataset ------------- - -.. autoclass:: pl_bolts.datamodules.dummy_dataset.DummyDataset - :noindex: diff --git a/docs/source/datamodules.rst b/docs/source/datamodules.rst index 6468326e5c..94c7fc28d8 100644 --- a/docs/source/datamodules.rst +++ b/docs/source/datamodules.rst @@ -7,9 +7,9 @@ DataModules (introduced in PyTorch Lightning 0.9.0) decouple the data from a mod is simply a collection of a training dataloder, val dataloader and test dataloader. In addition, it specifies how to: -- Downloading/preparing data. +- Download/prepare data. - Train/val/test splits. -- Transforms +- Transform Then you can use it like this: diff --git a/docs/source/datasets.rst b/docs/source/datasets.rst new file mode 100644 index 0000000000..4e54095022 --- /dev/null +++ b/docs/source/datasets.rst @@ -0,0 +1,41 @@ +######## +Datasets +######## +Collection of useful datasets + +-------- + +********* +Debugging +********* +Use these datasets to debug + +DummyDataset +============ + +.. autoclass:: pl_bolts.datasets.dummy_dataset.DummyDataset + :noindex: + +DummyDetectionDataset +===================== + +.. autoclass:: pl_bolts.datasets.dummy_dataset.DummyDetectionDataset + :noindex: + +RandomDataset +============= + +.. autoclass:: pl_bolts.datasets.dummy_dataset.RandomDataset + :noindex: + +RandomDictDataset +================= + +.. autoclass:: pl_bolts.datasets.dummy_dataset.RandomDictDataset + :noindex: + +RandomDictStringDataset +======================= + +.. autoclass:: pl_bolts.datasets.dummy_dataset.RandomDictStringDataset + :noindex: diff --git a/docs/source/index.rst b/docs/source/index.rst index 001990c191..cf45a1d243 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -33,6 +33,13 @@ PyTorch-Lightning-Bolts documentation sklearn_datamodule vision_datamodules +.. toctree:: + :maxdepth: 2 + :name: datasets + :caption: Datasets + + datasets + .. toctree:: :maxdepth: 2 :name: dataloaders @@ -53,10 +60,17 @@ PyTorch-Lightning-Bolts documentation :caption: Models models_howto - autoencoders classic_ml + +.. 
toctree:: + :maxdepth: 2 + :name: vision + :caption: Vision models + + autoencoders convolutional gans + reinforce_learn self_supervised_models .. toctree:: :maxdepth: 2 @@ -90,6 +104,7 @@ Indices and tables readme api/pl_bolts.callbacks api/pl_bolts.datamodules + api/pl_bolts.datasets api/pl_bolts.metrics api/pl_bolts.models api/pl_bolts.callbacks diff --git a/docs/source/introduction_guide.rst b/docs/source/introduction_guide.rst index 2ff923a911..a16ba08818 100644 --- a/docs/source/introduction_guide.rst +++ b/docs/source/introduction_guide.rst @@ -10,7 +10,7 @@ Bolts is a Deep learning research and production toolbox of: - Losses. - Datasets. -**The Main goal of bolts is to enable trying new ideas as fast as possible!** +**The main goal of Bolts is to enable trying new ideas as fast as possible!** All models are tested (daily), benchmarked, documented and work on CPUs, TPUs, GPUs and 16-bit precision. @@ -90,11 +90,11 @@ All models are tested (daily), benchmarked, documented and work on CPUs, TPUs, G Community Built --------------- -Bolts are built-by the Lightning community and contributed to bolts. +The Lightning community builds bolts and contributes them to Bolts. The lightning team guarantees that contributions are: -1. Rigorously Tested (CPUs, GPUs, TPUs). -2. Rigorously Documented. +1. Rigorously tested (CPUs, GPUs, TPUs). +2. Rigorously documented. 3. Standardized via PyTorch Lightning. 4. Optimized for speed. 5. Checked for correctness. @@ -351,7 +351,7 @@ In case your job or research doesn't need a "hammer", we offer implementations o which benefit from lightning's multi-GPU and TPU support. So, now you can run huge workloads scalably, without needing to do any engineering. -For instance, here we can run Logistic Regression on Imagenet (each epoch takes about 3 minutes)! +For instance, here we can run logistic regression on Imagenet (each epoch takes about 3 minutes)! .. code-block:: python @@ -414,7 +414,7 @@ But more importantly, you can scale up to many GPUs, TPUs or even CPUs Logistic Regression ^^^^^^^^^^^^^^^^^^^ -Here's an example for Logistic regression +Here's an example for logistic regression .. code-block:: python @@ -436,7 +436,7 @@ Here's an example for Logistic regression trainer.test(test_dataloaders=dm.test_dataloader(batch_size=12)) -Any input will be flattened across all dimensions except the firs one (batch). +Any input will be flattened across all dimensions except the first one (batch). This means images, sound, etc... work out of the box. .. code-block:: python diff --git a/docs/source/losses.rst b/docs/source/losses.rst index 3f2b120fee..44b401dfcc 100644 --- a/docs/source/losses.rst +++ b/docs/source/losses.rst @@ -10,3 +10,33 @@ This package lists common losses across research domains Your Loss --------- We're cleaning up many of our losses, but in the meantime, submit a PR to add your loss here! + +------------- + +Reinforcement Learning +====================== +These are common losses used in RL. + +--------------- + +DQN Loss +-------- + +.. autofunction:: pl_bolts.losses.rl.dqn_loss + :noindex: + +--------------- + +Double DQN Loss +--------------- + +.. autofunction:: pl_bolts.losses.rl.double_dqn_loss + :noindex: + +--------------- + +Per DQN Loss +------------ + +..
autofunction:: pl_bolts.losses.rl.per_dqn_loss + :noindex: diff --git a/docs/source/models.rst b/docs/source/models.rst index 09ae1888c5..924b39de4d 100644 --- a/docs/source/models.rst +++ b/docs/source/models.rst @@ -15,7 +15,7 @@ by adding your contribution to bolts you get these **additional** benefits! 6. We'll pretrain expensive models for you and host weights. 7. We will improve the speed of your models! 8. Eligible for invited talks to discuss your implementation. - 9. Lightning Swag + involvement in the broader contributor community :) + 9. Lightning swag + involvement in the broader contributor community :) .. note:: You still get to keep your attribution and be recognized for your work! @@ -98,7 +98,7 @@ We request that each contribution have: - Your name and your team's name as the implementation authors. - Your team's affiliation - Any generated examples, or result plots. - - Hyperparameters configurations for the results. + - Hyperparameter configurations for the results. Thank you for all your amazing contributions! diff --git a/docs/source/reinforce_learn.rst b/docs/source/reinforce_learn.rst new file mode 100644 index 0000000000..4737b60764 --- /dev/null +++ b/docs/source/reinforce_learn.rst @@ -0,0 +1,668 @@ +Reinforcement Learning +====================== + +This module is a collection of common RL approaches implemented in Lightning. + +----------------- + +Module authors +-------------- + +Contributions by: `Donal Byrne `_ + +- DQN +- Double DQN +- Dueling DQN +- Noisy DQN +- NStep DQN +- Prioritized Experience Replay DQN +- Reinforce +- Vanilla Policy Gradient + +------------ + +.. note:: + RL models currently only support CPU and single GPU training with `distributed_backend=dp`. + Full GPU support will be added in later updates. + + +DQN Models +---------- + +The following models are based on DQN. DQN uses value-based learning, deciding what action to take based +on the model's current learned value (V), or the state action value (Q), of the current state. These values are defined +as the discounted total reward of the agent's state or state action pair. + +--------------- + +Deep-Q-Network (DQN) +^^^^^^^^^^^^^^^^^^^^ + +DQN model introduced in `Playing Atari with Deep Reinforcement Learning `_. +Paper authors: Volodymyr Mnih, Koray Kavukcuoglu, David Silver, Alex Graves, Ioannis Antonoglou, Daan Wierstra, Martin Riedmiller. + +Original implementation by: `Donal Byrne `_ + +The DQN was introduced in `Playing Atari with Deep Reinforcement Learning `_ by +researchers at DeepMind. This took the concept of tabular Q learning and scaled it to much larger problems by +approximating the Q function using a deep neural network. + +The goal behind DQN was to take the simple control method of Q learning and scale it up in order to solve complicated +tasks. As well as this, the method needed to be stable. The DQN solves these issues with the following additions. + +**Approximated Q Function** + +Storing Q values in a table works well in theory, but is completely unscalable. Instead, the authors approximate the +Q function using a deep neural network. This allows the DQN to be used for much more complicated tasks. + +**Replay Buffer** + +Similar to supervised learning, the DQN learns on randomly sampled batches of previous data stored in an +Experience Replay Buffer. The 'target' is calculated using the Bellman equation + +.. math:: + + Q(s,a) \leftarrow r+{\gamma}\max_{a'{\in}A}Q(s',a') + +and then we optimize using SGD just like a standard supervised learning problem. + +..
math:: + + L=(Q(s,a)-(r+{\gamma}\max_{a'{\in}A}Q(s',a')))^2 + +DQN Results +~~~~~~~~~~~ + +**DQN: Pong** + +.. image:: _images/rl_benchmark/pong_dqn_baseline_results.jpg + :width: 800 + :alt: DQN Baseline Results + +Example:: + + from pl_bolts.models.rl import DQN + dqn = DQN("PongNoFrameskip-v4") + trainer = Trainer() + trainer.fit(dqn) + +.. autoclass:: pl_bolts.models.rl.dqn_model.DQN + :noindex: + +--------------- + +Double DQN +^^^^^^^^^^ + +Double DQN model introduced in `Deep Reinforcement Learning with Double Q-learning `_ +Paper authors: Hado van Hasselt, Arthur Guez, David Silver + +Original implementation by: `Donal Byrne `_ + +The original DQN tends to overestimate Q values during the Bellman update, leading to instability that is harmful to +training. This is due to the max operation in the Bellman equation. + +We are constantly taking the max of our agent's estimates +during our update. This would seem reasonable if we could trust these estimates. However, during the early stages of +training, the estimates for these values will be off-center and can lead to instability in training until +our estimates become more reliable. + +The Double DQN fixes this overestimation by choosing actions for the next state using the main trained network, +but uses the values of these actions from the more stable target network. So we are still going to take the greedy +action, but the value will be less "optimistic" because it is chosen by the target network. + +**DQN expected return** + + +.. math:: + + Q(s_t, a_t) = r_t + \gamma * \max_{a}Q'(S_{t+1}, a) + +**Double DQN expected return** + +.. math:: + + Q(s_t, a_t) = r_t + \gamma * Q'(S_{t+1}, \arg\max_a Q(S_{t+1}, a)) + +Double DQN Results +~~~~~~~~~~~~~~~~~~ + +**Double DQN: Pong** + +.. image:: _images/rl_benchmark/pong_double_dqn_baseline_results.jpg + :width: 800 + :alt: Double DQN Result + +**DQN vs Double DQN: Pong** + +orange: DQN + +blue: Double DQN + +.. image:: _images/rl_benchmark/dqn_ddqn_comparison.jpg + :width: 800 + :alt: Double DQN Comparison Result + +Example:: + + from pl_bolts.models.rl import DoubleDQN + ddqn = DoubleDQN("PongNoFrameskip-v4") + trainer = Trainer() + trainer.fit(ddqn) + +.. autoclass:: pl_bolts.models.rl.double_dqn_model.DoubleDQN + :noindex: + +--------------- + +Dueling DQN +^^^^^^^^^^^ + +Dueling DQN model introduced in `Dueling Network Architectures for Deep Reinforcement Learning `_ +Paper authors: Ziyu Wang, Tom Schaul, Matteo Hessel, Hado van Hasselt, Marc Lanctot, Nando de Freitas + +Original implementation by: `Donal Byrne `_ + +The Q value that we are trying to approximate can be divided into two parts, the state value V(s) and the 'advantage' +of actions in that state A(s, a). Instead of having one full network estimate the entire Q value, Dueling DQN uses two +estimator heads in order to separate the estimation of the two parts. + +The value is the same as in value iteration. It is the discounted expected reward achieved from state s. Think of the +value as the 'base reward' from being in state s. + +The advantage tells us how much 'extra' reward we get from taking action a while in state s. The advantage bridges the +gap between Q(s, a) and V(s) as Q(s, a) = V(s) + A(s, a). + +In the paper `Dueling Network Architectures for Deep Reinforcement Learning ` the +network uses two heads, one outputs the state value and the other outputs the advantage. This leads to better +training stability, faster convergence and overall better results.
The V head outputs a single scalar +(the state value), while the advantage head outputs a tensor equal to the size of the action space, containing +an advantage value for each action in state s. + +Changing the network architecture is not enough; we also need to ensure that the advantage mean is 0. This is done +by subtracting the mean advantage from the Q value. This essentially pulls the mean advantage to 0. + +.. math:: + + Q(s, a) = V(s) + A(s, a) - 1/N * \sum_k A(s, k) + +Dueling DQN Benefits +~~~~~~~~~~~~~~~~~~~~ + +- Ability to efficiently learn the state value function. In the dueling network, every Q update also updates the value + stream, whereas in DQN only the value of the chosen action is updated. This provides a better approximation of the + values. +- The differences between total Q values for a given state are quite small in relation to the magnitude of Q. The + difference in the Q values between the best action and the second best action can be very small, while the average + state value can be much larger. The differences in scale can introduce noise, which may lead to the greedy policy + switching the priority of these actions. The separate estimators for state value and advantage make the Dueling + DQN robust to this type of scenario. + +Dueling DQN Results +~~~~~~~~~~~~~~~~~~~ + +The results below show a noticeable improvement over the original DQN network. + + +**Dueling DQN baseline: Pong** + +Similar to the results of the DQN baseline, the agent has a period where the number of steps per episode increases as +it begins to hold its own against the heuristic opponent, but then the steps per episode quickly begin to drop +as it gets better and starts to beat its opponent faster and faster. There is a noticeable point at step ~250k +where the agent goes from losing to winning. + +As you can see by the total rewards, the dueling network's training progression is very stable and continues to trend +upward until it finally plateaus. + +.. image:: _images/rl_benchmark/pong_dueling_dqn_results.jpg + :width: 800 + :alt: Dueling DQN Result + +**DQN vs Dueling DQN: Pong** + +In comparison to the base DQN, we see that the Dueling network's training is much more stable and is able to reach a +score in the high teens faster than the DQN agent. Even though the Dueling network is more stable and outperforms DQN +early in training, by the end of training the two networks end up at the same point. + +This could very well be due to the simplicity of the Pong environment. + + - Orange: DQN + - Red: Dueling DQN + +.. image:: _images/rl_benchmark/pong_dueling_dqn_comparison.jpg + :width: 800 + :alt: Dueling DQN Comparison Result + +Example:: + + from pl_bolts.models.rl import DuelingDQN + dueling_dqn = DuelingDQN("PongNoFrameskip-v4") + trainer = Trainer() + trainer.fit(dueling_dqn) + +.. autoclass:: pl_bolts.models.rl.dueling_dqn_model.DuelingDQN + :noindex: + +-------------- + +Noisy DQN +^^^^^^^^^ + +Noisy DQN model introduced in `Noisy Networks for Exploration `_ +Paper authors: Meire Fortunato, Mohammad Gheshlaghi Azar, Bilal Piot, Jacob Menick, Ian Osband, Alex Graves, +Vlad Mnih, Remi Munos, Demis Hassabis, Olivier Pietquin, Charles Blundell, Shane Legg + +Original implementation by: `Donal Byrne `_ + +Up until now the DQN agent has used a separate exploration policy, generally epsilon-greedy, where start and end values +are set for its exploration.
`Noisy Networks For Exploration ` introduces +a new exploration strategy by adding noise parameters to the weights of the fully connected layers, which get updated +during backpropagation of the network. The noise parameters drive +the exploration of the network instead of simply taking random actions more frequently at the start of training and +less frequently towards the end. The authors +propose two ways of doing this. + +During the optimization step a new set of noisy parameters is sampled. During training the agent acts according to +the fixed set of parameters. At the next optimization step, the parameters are updated with a new sample. This ensures +the agent always acts based on the parameters that are drawn from the current noise +distribution. + +The authors propose two methods of injecting noise into the network. + +1) Independent Gaussian Noise: This injects noise per weight. For each weight a random value is taken from + the distribution. Noise parameters are stored inside the layer and are updated during backpropagation. + The output of the layer is calculated as normal. +2) Factorized Gaussian Noise: This injects noise per input/output. In order to minimize the number of random values + this method stores two random vectors, one with the size of the input and the other with the size of the output. + Using these two vectors, a random matrix is generated for the layer by calculating the outer product of the vectors. + + +Noisy DQN Benefits +~~~~~~~~~~~~~~~~~~ + +- Improved exploration function. Instead of just performing completely random actions, we add a decreasing amount of noise + and uncertainty to our policy, allowing the agent to explore while still utilising its policy. +- The fact that this method is automatically tuned means that we do not have to tune hyperparameters for + epsilon-greedy! + +.. note:: + For now I have just implemented the Independent Gaussian noise as it has been reported there isn't much difference + in results for these benchmark environments. + +In order to update the basic DQN to a Noisy DQN, the fully connected layers of the network are swapped for noisy linear layers. + +Noisy DQN Results +~~~~~~~~~~~~~~~~~ + +The results below show improved stability and faster performance growth. + +**Noisy DQN baseline: Pong** + + +Similar to the other improvements, the average score of the agent reaches positive numbers around the 250k mark and +steadily increases till convergence. + +.. image:: _images/rl_benchmark/pong_noisy_dqn_results.jpg + :width: 800 + :alt: Noisy DQN Result + +**DQN vs Noisy DQN: Pong** + +In comparison to the base DQN, the Noisy DQN is more stable and is able to converge on an optimal policy much faster +than the original. It seems that the replacement of the epsilon-greedy strategy with network noise provides a better +form of exploration. + +- Orange: DQN +- Red: Noisy DQN + +.. image:: _images/rl_benchmark/pong_noisy_dqn_comparison.jpg + :width: 800 + :alt: Noisy DQN Comparison Result + +Example:: + + from pl_bolts.models.rl import NoisyDQN + noisy_dqn = NoisyDQN("PongNoFrameskip-v4") + trainer = Trainer() + trainer.fit(noisy_dqn) + +.. autoclass:: pl_bolts.models.rl.noisy_dqn_model.NoisyDQN + :noindex: + +-------------- + +N-Step DQN +^^^^^^^^^^ + +N-Step DQN model introduced in `Learning to Predict by the Methods of Temporal Differences `_ +Paper authors: Richard S. Sutton + +Original implementation by: `Donal Byrne `_ + +N-Step DQN was introduced in `Learning to Predict by the Methods of Temporal Differences +`_.
+This method improves upon the original DQN by updating our Q values with the expected reward from multiple steps in the +future, as opposed to the expected reward from the immediate next state. Getting the Q value for a state action +pair using a single step looks like this + +.. math:: + + Q(s_t,a_t)=r_t+{\gamma}\max_{a'}Q(s_{t+1},a') + +but because the Q function is recursive we can continue to roll this out into multiple steps, looking at the expected +return for each step into the future. + +.. math:: + + Q(s_t,a_t)=r_t+{\gamma}r_{t+1}+{\gamma}^2\max_{a'}Q(s_{t+2},a') + +The above example shows a 2-step look ahead, but this could be rolled out to the end of the episode, which is just +Monte Carlo learning. Although we could just do a Monte Carlo update and look forward to the end of the episode, it +wouldn't be a good idea. Every time we take another step into the future, we are basing our approximation off our +current policy. For a large portion of training, our policy is going to be less than optimal. For example, at the start +of training, our policy will be in a state of high exploration, and will be little better than random. + +.. note:: + For each rollout step you must scale the discount factor accordingly by the number of steps. As you can see from the + equation above, the second gamma value is to the power of 2. If we rolled this out one step further, we would use + gamma to the power of 3 and so on. + +So if we are approximating future rewards off a bad policy, chances are those approximations are going to be pretty +bad and every time we unroll our update equation, the worse it will get. The fact that we are using an off-policy +method like DQN with a large replay buffer will make this even worse, as there is a high chance that we will be +training on experiences using an old policy that was worse than our current policy. + +So we need to strike a balance between looking far enough ahead to improve the convergence of our agent, but not so far +that our updates become unstable. In general, small values of 2-4 work best. + +N-Step Benefits +~~~~~~~~~~~~~~~ + +- Multi-step learning is capable of learning faster than typical 1-step learning methods. +- Note that this method introduces a new hyperparameter n, although n=4 is generally a good starting point and provides + good results across the board. + +N-Step Results +~~~~~~~~~~~~~~ + +As expected, the N-Step DQN converges much faster than the standard DQN; however, it also adds more instability to the +loss of the agent. This can be seen in the following experiments. + + +**N-Step DQN: Pong** + +The N-Step DQN shows the greatest increase in performance with respect to the other DQN variations. +After less than 150k steps the agent begins to consistently win games and achieves the top score after ~170K steps. +This is reflected in the sharp peak of the total episode steps and of course, the total episode rewards. + +.. image:: _images/rl_benchmark/pong_nstep_dqn_1.jpg + :width: 800 + :alt: N-Step DQN Result + +**DQN vs N-Step DQN: Pong** + +This improvement is shown in stark contrast to the base DQN, which only begins to win games after 250k steps and +requires over twice as many steps (450k) as the N-Step agent to achieve the high score of 21. One important thing to +notice is the large increase in the loss of the N-Step agent. This is expected as the agent is building +its expected reward off approximations of the future states. The larger the size of N, the greater the instability.
+Previous literature, listed below, shows the best results for the Pong environment with an N step between 3-5. +For these experiments I opted for an N step of 4. + + +.. image:: _images/rl_benchmark/pong_nstep_dqn_2.jpg + :width: 800 + :alt: N-Step DQN Comparison Results + +Example:: + + from pl_bolts.models.rl import DQN + n_step_dqn = DQN("PongNoFrameskip-v4", n_steps=4) + trainer = Trainer() + trainer.fit(n_step_dqn) + +-------------- + +Prioritized Experience Replay DQN +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +PER DQN model introduced in `Prioritized Experience Replay `_ +Paper authors: Tom Schaul, John Quan, Ioannis Antonoglou, David Silver + +Original implementation by: `Donal Byrne `_ + +The standard DQN uses a buffer to break up the correlation between experiences, drawing uniform random samples for each +batch. Instead of just randomly sampling from the buffer, prioritized experience replay (PER) prioritizes these samples +based on training loss. This concept was introduced in the paper +`Prioritized Experience Replay `__. + +Essentially we want to train more on the samples that surprise the agent. + +The priority of each sample is defined below + + +.. math:: + + P(i) = p^\alpha_i / \sum_k p_k^\alpha + + +where p_i is the priority of the ith sample in the buffer and +𝛼 is the number that shows how much emphasis we give to the priority. If 𝛼 = 0, our +sampling will become uniform as in the classic DQN method. Larger values for 𝛼 put +more stress on samples with higher priority. + +It's important that new samples are set to the highest priority so that they are sampled soon. This, however, introduces +bias to new samples in our dataset. In order to compensate for this bias, the value of the weight is defined as + +.. math:: + + w_i=(N \cdot P(i))^{-\beta} + +where beta is a hyperparameter between 0 and 1. When beta is 1 the bias is fully compensated. However, the authors noted that +in practice it is better to start beta with a small value near 0 and slowly increase it to 1. + +PER Benefits +~~~~~~~~~~~~ + +- The benefits of this technique are that the agent sees more samples that it struggled with and gets more + chances to improve upon it. + +**Memory Buffer** + + +The first step is to replace the standard experience replay buffer with the prioritized experience replay buffer. This +is pretty large (100+ lines) so I won't go through it here. There are two buffers implemented. The first is a naive +list-based buffer found in memory.PERBuffer and the second is a more efficient buffer using a Sum Tree data structure. + +The list-based version is simpler, but has a sample complexity of O(N). The Sum Tree in comparison has a complexity +of O(1) for sampling and O(log N) for updating priorities. + +**Update loss function** + +The next thing we do is to use the sample weights that we get from PER. Add the following code to the end of the +loss function. This applies the weights of our sample to the batch loss. Then we return the mean loss and weighted loss +for each datum, with the addition of a small epsilon value. + + +PER Results +~~~~~~~~~~~ + +The results below show improved stability and faster performance growth. + +**PER DQN: Pong** + +Similar to the other improvements, we see that PER improves the stability of the agent's training and converges +on an optimal policy faster. + +..
image:: _images/rl_benchmark/pong_per_dqn_baseline_v1_results.jpg + :width: 800 + :alt: PER DQN Results + +**DQN vs PER DQN: Pong** + +In comparison to the base DQN, the PER DQN does show improved stability and performance. As expected, the loss +of the PER DQN is significantly lower. This is the main objective of PER: focusing training on the experiences with high loss. + +It is important to note that loss is not the only metric we should be looking at. Although the agent may have very +low loss during training, it may still perform poorly due to lack of exploration. + +.. image:: _images/rl_benchmark/pong_per_dqn_baseline_v1_results_comp.jpg + :width: 800 + :alt: PER DQN Results + +- Orange: DQN +- Pink: PER DQN + +Example:: + + from pl_bolts.models.rl import PERDQN + per_dqn = PERDQN("PongNoFrameskip-v4") + trainer = Trainer() + trainer.fit(per_dqn) + +.. autoclass:: pl_bolts.models.rl.per_dqn_model.PERDQN + :noindex: + + +-------------- + +Policy Gradient Models +---------------------- +The following models are based on Policy Gradients. Unlike the Q learning models shown before, Policy-based models +do not try to learn the specific values of states or state action pairs. Instead, they cut out the middle man and +directly learn the policy distribution. In Policy Gradient models we update our network parameters in the direction +suggested by our policy gradient in order to find a policy that produces the best results. + +Policy Gradient Key Points: + - Outputs a distribution of actions instead of discrete Q values + - Optimizes the policy directly, instead of indirectly through the optimization of Q values + - The policy distribution of actions allows the model to handle more complex action spaces, such as continuous actions + - The policy distribution introduces stochasticity, providing natural exploration to the model + - The policy distribution provides a more stable update, as a change in weights will only change the total distribution + slightly, whereas changing weights based on the Q value of state S will change all Q values of similar states. + - Policy gradients tend to converge faster, however they are not as sample efficient and generally require more + interactions with the environment. + + +-------------- + +REINFORCE +^^^^^^^^^ + +REINFORCE model introduced in `Policy Gradient Methods For Reinforcement Learning With Function Approximation `_ +Paper authors: Richard S. Sutton, David McAllester, Satinder Singh, Yishay Mansour + +Original implementation by: `Donal Byrne `_ + +REINFORCE is one of the simplest forms of the Policy Gradient method of RL. This method uses a Monte Carlo rollout, +where it steps through entire episodes of the environment to build up trajectories, computing the total rewards. The +algorithm is as follows: + +1. Initialize our network. +2. Play N full episodes, saving the transitions through the environment. +3. For every step `t` in each episode `k` we calculate the discounted reward of the subsequent steps. + +.. math:: + + Q_{k,t} = \sum_{i=0}\gamma^i r_i + +4. Calculate the loss for all transitions. + +.. math:: + + L = - \sum_{k,t} Q_{k,t} \log(\pi(S_{k,t}, A_{k,t})) + +5. Perform SGD on the loss and repeat. + +What this loss function is saying is simply that we want to take the log probability of action A at state S given +our policy (network output). This is then scaled by the discounted reward that we calculated in the previous step. +We then take the negative of our sum.
This is because the loss is minimized during SGD, but we want to +maximize our policy. + +.. note:: + The current implementation does not actually wait for the batch of episodes to complete every time, as we pass in a + fixed batch size. For the time being we simply use a large batch size to accommodate this. This approach still works + well for simple tasks as it still manages to get an accurate Q value by using a large batch size, but it is not + as accurate or completely correct. This will be updated in a later version. + + +REINFORCE Benefits +~~~~~~~~~~~~~~~~~~~~~~~~ + +- Simple and straightforward + +- Computationally more efficient for simple tasks such as Cartpole than the Value Based methods. + +REINFORCE Results +~~~~~~~~~~~~~~~~~~~~~ + +Hyperparameters: + +- Batch Size: 800 +- Learning Rate: 0.01 +- Episodes Per Batch: 4 +- Gamma: 0.99 + +TODO: Add results graph + +Example:: + + from pl_bolts.models.rl import Reinforce + reinforce = Reinforce("CartPole-v0") + trainer = Trainer() + trainer.fit(reinforce) + +.. autoclass:: pl_bolts.models.rl.reinforce_model.Reinforce + :noindex: + +-------------- + +Vanilla Policy Gradient +^^^^^^^^^^^^^^^^^^^^^^^ + +Vanilla Policy Gradient model introduced in `Policy Gradient Methods For Reinforcement Learning With Function Approximation `_ +Paper authors: Richard S. Sutton, David McAllester, Satinder Singh, Yishay Mansour + +Original implementation by: `Donal Byrne `_ + +Vanilla Policy Gradient (VPG) expands upon the REINFORCE algorithm and improves some of its major issues. The major +issue with REINFORCE is that it has high variance. This can be improved by subtracting a baseline value from the +Q values. For this implementation we use the average reward as our baseline. + +Although Policy Gradients are able to explore naturally due to the stochastic nature of the model, the agent can still +frequently get stuck in a local optimum. In order to improve this, VPG adds an entropy term to improve exploration. + +.. math:: + + H(\pi) = - \sum \pi (a | s) \log \pi (a | s) + +To further control the amount of additional entropy in our model we scale the entropy term by a small beta value. The +scaled entropy is then subtracted from the policy loss. + +VPG Benefits +~~~~~~~~~~~~~~~ + +- Addition of the baseline reduces variance in the model + +- Improved exploration due to entropy bonus + +VPG Results +~~~~~~~~~~~~~~~~ + +Hyperparameters: + +- Batch Size: 8 +- Learning Rate: 0.001 +- N Steps: 10 +- N environments: 4 +- Entropy Beta: 0.01 +- Gamma: 0.99 + +Example:: + + from pl_bolts.models.rl import VanillaPolicyGradient + vpg = VanillaPolicyGradient("CartPole-v0") + trainer = Trainer() + trainer.fit(vpg) + +.. autoclass:: pl_bolts.models.rl.vanilla_policy_gradient_model.VanillaPolicyGradient + :noindex: diff --git a/pl_bolts/__init__.py b/pl_bolts/__init__.py index 271653f015..25f14672bd 100644 --- a/pl_bolts/__init__.py +++ b/pl_bolts/__init__.py @@ -2,7 +2,7 @@ import os -__version__ = '0.2.2' +__version__ = '0.2.5' __author__ = 'PyTorchLightning et al.'
__author_email__ = 'name@pytorchlightning.ai' __license__ = 'Apache-2.0' @@ -45,12 +45,13 @@ else: # from pl_bolts.models.mnist_module import LitMNIST - from pl_bolts import models, metrics, callbacks, datamodules, transforms + from pl_bolts import models, metrics, callbacks, datamodules, transforms, datasets __all__ = [ # 'LitMNIST', 'models', 'metrics', 'callbacks', - 'datamodules' + 'datamodules', + 'datasets', ] diff --git a/pl_bolts/datamodules/__init__.py b/pl_bolts/datamodules/__init__.py index 1dd2e7c9aa..2e3447d2ac 100644 --- a/pl_bolts/datamodules/__init__.py +++ b/pl_bolts/datamodules/__init__.py @@ -1,5 +1,4 @@ from pl_bolts.datamodules.async_dataloader import AsynchronousLoader -from pl_bolts.datamodules.dummy_dataset import DummyDataset, DummyDetectionDataset try: from pl_bolts.datamodules.binary_mnist_datamodule import BinaryMNISTDataModule @@ -7,6 +6,11 @@ CIFAR10DataModule, TinyCIFAR10DataModule, ) + from pl_bolts.datamodules.experience_source import ( + ExperienceSourceDataset, + ExperienceSource, + DiscountedExperienceSource, + ) from pl_bolts.datamodules.fashion_mnist_datamodule import FashionMNISTDataModule from pl_bolts.datamodules.imagenet_datamodule import ImagenetDataModule from pl_bolts.datamodules.mnist_datamodule import MNISTDataModule diff --git a/pl_bolts/datamodules/cifar10_dataset.py b/pl_bolts/datamodules/cifar10_dataset.py index 63d2f1f744..5ddb44ab36 100644 --- a/pl_bolts/datamodules/cifar10_dataset.py +++ b/pl_bolts/datamodules/cifar10_dataset.py @@ -87,6 +87,9 @@ def __init__( self.train = train # training set or test set self.transform = transform + if not _PIL_AVAILABLE: + raise ImportError('You want to use PIL.Image for loading but it is not installed yet.') + os.makedirs(self.cached_folder_path, exist_ok=True) self.prepare_data(download) diff --git a/pl_bolts/datamodules/dummy_dataset.py b/pl_bolts/datamodules/dummy_dataset.py deleted file mode 100644 index 771a7fdbb7..0000000000 --- a/pl_bolts/datamodules/dummy_dataset.py +++ /dev/null @@ -1,68 +0,0 @@ -import torch -from torch.utils.data import Dataset, DataLoader - - -class DummyDataset(Dataset): - def __init__(self, *shapes, num_samples=10000): - """ - Generate a dummy dataset - - Args: - *shapes: list of shapes - num_samples: how many samples to use in this dataset - - Example:: - - from pl_bolts.datamodules import DummyDataset - - # mnist dims - >>> ds = DummyDataset((1, 28, 28), (1,)) - >>> dl = DataLoader(ds, batch_size=7) - ... 
- >>> batch = next(iter(dl)) - >>> x, y = batch - >>> x.size() - torch.Size([7, 1, 28, 28]) - >>> y.size() - torch.Size([7, 1]) - """ - super().__init__() - self.shapes = shapes - self.num_samples = num_samples - - def __len__(self): - return self.num_samples - - def __getitem__(self, idx): - samples = [] - for shape in self.shapes: - sample = torch.rand(*shape) - samples.append(sample) - - return samples - - -class DummyDetectionDataset(Dataset): - def __init__( - self, img_shape=(3, 256, 256), num_boxes=1, num_classes=2, num_samples=10000 - ): - super().__init__() - self.img_shape = img_shape - self.num_samples = num_samples - self.num_boxes = num_boxes - self.num_classes = num_classes - - def __len__(self): - return self.num_samples - - def _random_bbox(self): - c, h, w = self.img_shape - xs = torch.randint(w, (2,)) - ys = torch.randint(h, (2,)) - return [min(xs), min(ys), max(xs), max(ys)] - - def __getitem__(self, idx): - img = torch.rand(self.img_shape) - boxes = torch.tensor([self._random_bbox() for _ in range(self.num_boxes)]) - labels = torch.randint(self.num_classes, (self.num_boxes,)) - return img, {"boxes": boxes, "labels": labels} diff --git a/pl_bolts/datamodules/experience_source.py b/pl_bolts/datamodules/experience_source.py new file mode 100644 index 0000000000..6a4671234f --- /dev/null +++ b/pl_bolts/datamodules/experience_source.py @@ -0,0 +1,278 @@ +""" +Datamodules for RL models that rely on experiences generated during training +Based on implementations found here: https://github.com/Shmuma/ptan/blob/master/ptan/experience.py +""" +from abc import ABC +from collections import deque, namedtuple +from typing import Iterable, Callable, Tuple, List + +import torch +from gym import Env +from torch.utils.data import IterableDataset + +# Datasets + +Experience = namedtuple( + "Experience", field_names=["state", "action", "reward", "done", "new_state"] +) + + +class ExperienceSourceDataset(IterableDataset): + """ + Basic experience source dataset. Takes a generate_batch function that returns an iterator. 
+ The logic for the experience source and how the batch is generated is defined the Lightning model itself + """ + + def __init__(self, generate_batch: Callable): + self.generate_batch = generate_batch + + def __iter__(self) -> Iterable: + iterator = self.generate_batch() + return iterator + + +# Experience Sources +class BaseExperienceSource(ABC): + """ + Simplest form of the experience source + Args: + env: Environment that is being used + agent: Agent being used to make decisions + """ + + def __init__(self, env, agent) -> None: + self.env = env + self.agent = agent + + def runner(self) -> Experience: + """Iterable method that yields steps from the experience source""" + raise NotImplementedError("ExperienceSource has no stepper method implemented") + + +class ExperienceSource(BaseExperienceSource): + """ + Experience source class handling single and multiple environment steps + Args: + env: Environment that is being used + agent: Agent being used to make decisions + n_steps: Number of steps to return from each environment at once + """ + + def __init__(self, env, agent, n_steps: int = 1) -> None: + super().__init__(env, agent) + + self.pool = env if isinstance(env, (list, tuple)) else [env] + self.exp_history_queue = deque() + + self.n_steps = n_steps + self.total_steps = [] + self.states = [] + self.histories = [] + self.cur_rewards = [] + self.cur_steps = [] + self.iter_idx = 0 + + self._total_rewards = [] + + self.init_environments() + + def runner(self, device: torch.device) -> Tuple[Experience]: + """Experience Source iterator yielding Tuple of experiences for n_steps. These come from the pool + of environments provided by the user. + Args: + device: current device to be used for executing experience steps + Returns: + Tuple of Experiences + """ + while True: + # get actions for all envs + actions = self.env_actions(device) + + # step through each env + for env_idx, (env, action) in enumerate(zip(self.pool, actions)): + + exp = self.env_step(env_idx, env, action) + history = self.histories[env_idx] + history.append(exp) + self.states[env_idx] = exp.new_state + + self.update_history_queue(env_idx, exp, history) + + # Yield all accumulated history tuples to model + while self.exp_history_queue: + yield self.exp_history_queue.popleft() + + self.iter_idx += 1 + + def update_history_queue(self, env_idx, exp, history) -> None: + """ + Updates the experience history queue with the lastest experiences. In the event of an experience step is in + the done state, the history will be incrementally appended to the queue, removing the tail of the history + each time. 
+ Args: + env_idx: index of the environment + exp: the current experience + history: history of experience steps for this environment + """ + # If there is a full history of step, append history to queue + if len(history) == self.n_steps: + self.exp_history_queue.append(tuple(history)) + + if exp.done: + if 0 < len(history) < self.n_steps: + self.exp_history_queue.append(tuple(history)) + + # generate tail of history, incrementally append history to queue + while len(history) > 2: + history.popleft() + self.exp_history_queue.append(tuple(history)) + + # when there are only 2 experiences left in the history, + # append to the queue then update the env stats and reset the environment + if len(history) > 1: + self.update_env_stats(env_idx) + + history.popleft() + self.exp_history_queue.append(tuple(history)) + + # Clear that last tail in the history once all others have been added to the queue + history.clear() + + def init_environments(self) -> None: + """ + For each environment in the pool setups lists for tracking history of size n, state, current reward and + current step + """ + for env in self.pool: + self.states.append(env.reset()) + self.histories.append(deque(maxlen=self.n_steps)) + self.cur_rewards.append(0.0) + self.cur_steps.append(0) + + def env_actions(self, device) -> List[List[int]]: + """ + For each environment in the pool, get the correct action + Returns: + List of actions for each env, with size (num_envs, action_size) + """ + actions = [] + states_actions = self.agent(self.states, device) + + assert len(self.states) == len(states_actions) + + for idx, action in enumerate(states_actions): + actions.append(action if isinstance(action, list) else [action]) + + return actions + + def env_step(self, env_idx: int, env: Env, action: List[int]) -> Experience: + """ + Carries out a step through the given environment using the given action + Args: + env_idx: index of the current environment + env: env at index env_idx + action: action for this environment step + Returns: + Experience tuple + """ + next_state, r, is_done, _ = env.step(action[0]) + + self.cur_rewards[env_idx] += r + self.cur_steps[env_idx] += 1 + + exp = Experience(state=self.states[env_idx], action=action[0], reward=r, done=is_done, new_state=next_state) + + return exp + + def update_env_stats(self, env_idx: int) -> None: + """ + To be called at the end of the history tail generation during the termination state. 
Updates the stats + tracked for all environments + Args: + env_idx: index of the environment used to update stats + """ + self._total_rewards.append(self.cur_rewards[env_idx]) + self.total_steps.append(self.cur_steps[env_idx]) + self.cur_rewards[env_idx] = 0 + self.cur_steps[env_idx] = 0 + self.states[env_idx] = self.pool[env_idx].reset() + + def pop_total_rewards(self) -> List[float]: + """ + Returns the list of the current total rewards collected + Returns: + list of total rewards for all completed episodes for each environment since last pop + """ + rewards = self._total_rewards + + if rewards: + self._total_rewards = [] + self.total_steps = [] + + return rewards + + def pop_rewards_steps(self): + """ + Returns the list of the current total rewards and steps collected + Returns: + list of total rewards and steps for all completed episodes for each environment since last pop + """ + res = list(zip(self._total_rewards, self.total_steps)) + if res: + self._total_rewards, self.total_steps = [], [] + return res + + +class DiscountedExperienceSource(ExperienceSource): + """Outputs experiences with a discounted reward over N steps""" + + def __init__(self, env: Env, agent, n_steps: int = 1, gamma: float = 0.99): + super().__init__(env, agent, (n_steps + 1)) + self.gamma = gamma + self.steps = n_steps + + def runner(self, device: torch.device) -> Experience: + """ + Iterates through experience tuple and calculate discounted experience + Args: + device: current device to be used for executing experience steps + Yields: + Discounted Experience + """ + for experiences in super().runner(device): + last_exp_state, tail_experiences = self.split_head_tail_exp(experiences) + + total_reward = self.discount_rewards(tail_experiences) + + yield Experience(state=experiences[0].state, action=experiences[0].action, + reward=total_reward, done=experiences[0].done, new_state=last_exp_state) + + def split_head_tail_exp(self, experiences: Tuple[Experience]) -> Tuple[List, Tuple[Experience]]: + """ + Takes in a tuple of experiences and returns the last state and tail experiences based on + if the last state is the end of an episode + Args: + experiences: Tuple of N Experience + Returns: + last state (Array or None) and remaining Experience + """ + if experiences[-1].done and len(experiences) <= self.steps: + last_exp_state = experiences[-1].new_state + tail_experiences = experiences + else: + last_exp_state = experiences[-1].state + tail_experiences = experiences[:-1] + return last_exp_state, tail_experiences + + def discount_rewards(self, experiences: Tuple[Experience]) -> float: + """ + Calculates the discounted reward over N experiences + Args: + experiences: Tuple of Experience + Returns: + total discounted reward + """ + total_reward = 0.0 + for exp in reversed(experiences): + total_reward = (self.gamma * total_reward) + exp.reward + return total_reward diff --git a/pl_bolts/datasets/__init__.py b/pl_bolts/datasets/__init__.py new file mode 100644 index 0000000000..e2d319ce2f --- /dev/null +++ b/pl_bolts/datasets/__init__.py @@ -0,0 +1,7 @@ +from pl_bolts.datasets.dummy_dataset import ( + RandomDictStringDataset, + RandomDictDataset, + RandomDataset, + DummyDataset, + DummyDetectionDataset +) diff --git a/pl_bolts/datasets/dummy_dataset.py b/pl_bolts/datasets/dummy_dataset.py new file mode 100644 index 0000000000..44b728422e --- /dev/null +++ b/pl_bolts/datasets/dummy_dataset.py @@ -0,0 +1,161 @@ +import torch +from torch.utils.data import Dataset, DataLoader + + +class DummyDataset(Dataset): + """ + Generate 
a dummy dataset + + Args: + *shapes: list of shapes + num_samples: how many samples to use in this dataset + + Example:: + + from pl_bolts.datasets import DummyDataset + + >>> # mnist dims + >>> ds = DummyDataset((1, 28, 28), (1, )) + >>> dl = DataLoader(ds, batch_size=7) + >>> # get first batch + >>> batch = next(iter(dl)) + >>> x, y = batch + >>> x.size() + torch.Size([7, 1, 28, 28]) + >>> y.size() + torch.Size([7, 1]) + """ + def __init__(self, *shapes, num_samples: int = 10000): + super().__init__() + self.shapes = shapes + self.num_samples = num_samples + + def __len__(self): + return self.num_samples + + def __getitem__(self, idx: int): + sample = [] + for shape in self.shapes: + spl = torch.rand(*shape) + sample.append(spl) + return sample + + +class DummyDetectionDataset(Dataset): + """ + Generate a dummy dataset for detection + + Args: + *shapes: list of shapes + num_samples: how many samples to use in this dataset + + Example:: + + from pl_bolts.datasets import DummyDetectionDataset + + >>> ds = DummyDetectionDataset() + >>> dl = DataLoader(ds, batch_size=7) + """ + def __init__( + self, img_shape: tuple = (3, 256, 256), num_boxes: int = 1, num_classes: int = 2, num_samples: int = 10000 + ): + super().__init__() + self.img_shape = img_shape + self.num_samples = num_samples + self.num_boxes = num_boxes + self.num_classes = num_classes + + def __len__(self): + return self.num_samples + + def _random_bbox(self): + c, h, w = self.img_shape + xs = torch.randint(w, (2,)) + ys = torch.randint(h, (2,)) + return [min(xs), min(ys), max(xs), max(ys)] + + def __getitem__(self, idx: int): + img = torch.rand(self.img_shape) + boxes = torch.tensor([self._random_bbox() for _ in range(self.num_boxes)]) + labels = torch.randint(self.num_classes, (self.num_boxes,)) + return img, {"boxes": boxes, "labels": labels} + + +class RandomDictDataset(Dataset): + """ + Generate a dummy dataset with a dict structure + + Args: + size: tuple + num_samples: number of samples + + Example:: + + from pl_bolts.datasets import RandomDictDataset + + >>> ds = RandomDictDataset(10) + >>> dl = DataLoader(ds, batch_size=7) + """ + def __init__(self, size: int, num_samples: int = 250): + self.len = num_samples + self.data = torch.randn(num_samples, size) + + def __getitem__(self, index): + a = self.data[index] + b = a + 2 + return {'a': a, 'b': b} + + def __len__(self): + return self.len + + +class RandomDictStringDataset(Dataset): + """ + Generate a dummy dataset with strings + + Args: + size: tuple + num_samples: number of samples + + Example:: + + from pl_bolts.datasets import RandomDictStringDataset + + >>> ds = RandomDictStringDataset(10) + >>> dl = DataLoader(ds, batch_size=7) + """ + def __init__(self, size: int, num_samples: int = 250): + self.len = num_samples + self.data = torch.randn(num_samples, size) + + def __getitem__(self, index): + return {"id": str(index), "x": self.data[index]} + + def __len__(self): + return self.len + + +class RandomDataset(Dataset): + """ + Generate a dummy dataset + + Args: + size: tuple + num_samples: number of samples + + Example:: + + from pl_bolts.datasets import RandomDataset + + >>> ds = RandomDataset(10) + >>> dl = DataLoader(ds, batch_size=7) + """ + def __init__(self, size: int, num_samples: int = 250): + self.len = num_samples + self.data = torch.randn(num_samples, size) + + def __getitem__(self, index): + return self.data[index] + + def __len__(self): + return self.len diff --git a/pl_bolts/losses/rl.py b/pl_bolts/losses/rl.py new file mode 100644 index 
0000000000..a4a974f7c6 --- /dev/null +++ b/pl_bolts/losses/rl.py @@ -0,0 +1,118 @@ +""" +Loss functions for the RL models +""" + +from typing import Tuple, List + +import numpy as np +import torch +from torch import nn + + +def dqn_loss(batch: Tuple[torch.Tensor, torch.Tensor], net: nn.Module, + target_net: nn.Module, gamma: float = 0.99) -> torch.Tensor: + """ + Calculates the mse loss using a mini batch from the replay buffer + Args: + batch: current mini batch of replay data + net: main training network + target_net: target network of the main training network + gamma: discount factor + Returns: + loss + """ + states, actions, rewards, dones, next_states = batch + + actions = actions.long().squeeze(-1) + + state_action_values = ( + net(states).gather(1, actions.unsqueeze(-1)).squeeze(-1) + ) + + with torch.no_grad(): + next_state_values = target_net(next_states).max(1)[0] + next_state_values[dones] = 0.0 + next_state_values = next_state_values.detach() + + expected_state_action_values = next_state_values * gamma + rewards + + return nn.MSELoss()(state_action_values, expected_state_action_values) + + +def double_dqn_loss(batch: Tuple[torch.Tensor, torch.Tensor], net: nn.Module, + target_net: nn.Module, gamma: float = 0.99) -> torch.Tensor: + """ + Calculates the mse loss using a mini batch from the replay buffer. This uses an improvement to the original + DQN loss by using the double dqn. This is shown by using the actions of the train network to pick the + value from the target network. This code is heavily commented in order to explain the process clearly + Args: + batch: current mini batch of replay data + net: main training network + target_net: target network of the main training network + gamma: discount factor + Returns: + loss + """ + states, actions, rewards, dones, next_states = batch # batch of experiences, batch_size = 16 + + actions = actions.long().squeeze(-1) + + state_action_values = ( + net(states).gather(1, actions.unsqueeze(-1)).squeeze(-1) + ) + + # dont want to mess with gradients when using the target network + with torch.no_grad(): + next_outputs = net(next_states) # [16, 2], [batch, action_space] + + next_state_acts = next_outputs.max(1)[1].unsqueeze( + -1 + ) # take action at the index with the highest value + next_tgt_out = target_net(next_states) + + # Take the value of the action chosen by the train network + next_state_values = next_tgt_out.gather(1, next_state_acts).squeeze(-1) + next_state_values[dones] = 0.0 # any steps flagged as done get a 0 value + next_state_values = ( + next_state_values.detach() + ) # remove values from the graph, no grads needed + + # calc expected discounted return of next_state_values + expected_state_action_values = next_state_values * gamma + rewards + + # Standard MSE loss between the state action values of the current state and the + # expected state action values of the next state + return nn.MSELoss()(state_action_values, expected_state_action_values) + + +def per_dqn_loss(batch: Tuple[torch.Tensor, torch.Tensor], batch_weights: List, net: nn.Module, + target_net: nn.Module, gamma: float = 0.99) -> Tuple[torch.Tensor, np.ndarray]: + """ + Calculates the mse loss with the priority weights of the batch from the PER buffer + Args: + batch: current mini batch of replay data + batch_weights: how each of these samples are weighted in terms of priority + net: main training network + target_net: target network of the main training network + gamma: discount factor + Returns: + loss and batch_weights + """ + states, actions, rewards, 
dones, next_states = batch + + actions = actions.long() + + batch_weights = torch.tensor(batch_weights) + + actions_v = actions.unsqueeze(-1) + outputs = net(states) + state_action_vals = outputs.gather(1, actions_v) + state_action_vals = state_action_vals.squeeze(-1) + + with torch.no_grad(): + next_s_vals = target_net(next_states).max(1)[0] + next_s_vals[dones] = 0.0 + exp_sa_vals = next_s_vals.detach() * gamma + rewards + loss = (state_action_vals - exp_sa_vals) ** 2 + losses_v = batch_weights * loss + return losses_v.mean(), (losses_v + 1e-5).data.cpu().numpy() diff --git a/pl_bolts/models/autoencoders/basic_ae/basic_ae_module.py b/pl_bolts/models/autoencoders/basic_ae/basic_ae_module.py index d554769b7e..66c3a3a113 100644 --- a/pl_bolts/models/autoencoders/basic_ae/basic_ae_module.py +++ b/pl_bolts/models/autoencoders/basic_ae/basic_ae_module.py @@ -106,17 +106,15 @@ def step(self, batch, batch_idx): def training_step(self, batch, batch_idx): loss, logs = self.step(batch, batch_idx) - result = pl.TrainResult(minimize=loss) - result.log_dict( + self.log_dict( {f"train_{k}": v for k, v in logs.items()}, on_step=True, on_epoch=False ) - return result + return loss def validation_step(self, batch, batch_idx): loss, logs = self.step(batch, batch_idx) - result = pl.EvalResult(checkpoint_on=loss) - result.log_dict({f"val_{k}": v for k, v in logs.items()}) - return result + self.log_dict({f"val_{k}": v for k, v in logs.items()}) + return loss def configure_optimizers(self): return torch.optim.Adam(self.parameters(), lr=self.lr) diff --git a/pl_bolts/models/autoencoders/basic_vae/basic_vae_module.py b/pl_bolts/models/autoencoders/basic_vae/basic_vae_module.py index 40ba1be428..ab6671d000 100644 --- a/pl_bolts/models/autoencoders/basic_vae/basic_vae_module.py +++ b/pl_bolts/models/autoencoders/basic_vae/basic_vae_module.py @@ -139,17 +139,15 @@ def step(self, batch, batch_idx): def training_step(self, batch, batch_idx): loss, logs = self.step(batch, batch_idx) - result = pl.TrainResult(minimize=loss) - result.log_dict( + self.log_dict( {f"train_{k}": v for k, v in logs.items()}, on_step=True, on_epoch=False ) - return result + return loss def validation_step(self, batch, batch_idx): loss, logs = self.step(batch, batch_idx) - result = pl.EvalResult(checkpoint_on=loss) - result.log_dict({f"val_{k}": v for k, v in logs.items()}) - return result + self.log_dict({f"val_{k}": v for k, v in logs.items()}) + return loss def configure_optimizers(self): return torch.optim.Adam(self.parameters(), lr=self.lr) diff --git a/pl_bolts/models/gans/basic/basic_gan_module.py b/pl_bolts/models/gans/basic/basic_gan_module.py index 1459327daf..7311cb260b 100644 --- a/pl_bolts/models/gans/basic/basic_gan_module.py +++ b/pl_bolts/models/gans/basic/basic_gan_module.py @@ -136,18 +136,16 @@ def generator_step(self, x): # log to prog bar on each step AND for the full epoch # use the generator loss for checkpointing - result = pl.TrainResult(minimize=g_loss, checkpoint_on=g_loss) - result.log('g_loss', g_loss, on_epoch=True, prog_bar=True) - return result + self.log('g_loss', g_loss, on_epoch=True, prog_bar=True) + return g_loss def discriminator_step(self, x): # Measure discriminator's ability to classify real from generated samples d_loss = self.discriminator_loss(x) # log to prog bar on each step AND for the full epoch - result = pl.TrainResult(minimize=d_loss) - result.log('d_loss', d_loss, on_epoch=True, prog_bar=True) - return result + self.log('d_loss', d_loss, on_epoch=True, prog_bar=True) + return d_loss def 
configure_optimizers(self): lr = self.hparams.learning_rate diff --git a/pl_bolts/models/mnist_module.py b/pl_bolts/models/mnist_module.py index 365b481437..3dc71b22b1 100644 --- a/pl_bolts/models/mnist_module.py +++ b/pl_bolts/models/mnist_module.py @@ -36,43 +36,20 @@ def training_step(self, batch, batch_idx): x, y = batch y_hat = self(x) loss = F.cross_entropy(y_hat, y) - tensorboard_logs = {'train_loss': loss} - progress_bar_metrics = tensorboard_logs - return { - 'loss': loss, - 'log': tensorboard_logs, - 'progress_bar': progress_bar_metrics - } + self.log('train_loss', loss) + return loss def validation_step(self, batch, batch_idx): x, y = batch y_hat = self(x) - return {'val_loss': F.cross_entropy(y_hat, y)} - - def validation_epoch_end(self, outputs): - avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean() - tensorboard_logs = {'val_loss': avg_loss} - progress_bar_metrics = tensorboard_logs - return { - 'val_loss': avg_loss, - 'log': tensorboard_logs, - 'progress_bar': progress_bar_metrics - } + loss = F.cross_entropy(y_hat, y) + self.log('val_loss', loss) def test_step(self, batch, batch_idx): x, y = batch y_hat = self(x) - return {'test_loss': F.cross_entropy(y_hat, y)} - - def test_epoch_end(self, outputs): - avg_loss = torch.stack([x['test_loss'] for x in outputs]).mean() - tensorboard_logs = {'test_loss': avg_loss} - progress_bar_metrics = tensorboard_logs - return { - 'test_loss': avg_loss, - 'log': tensorboard_logs, - 'progress_bar': progress_bar_metrics - } + loss = F.cross_entropy(y_hat, y) + self.log('test_loss', loss) def configure_optimizers(self): return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate) diff --git a/pl_bolts/models/regression/logistic_regression.py b/pl_bolts/models/regression/logistic_regression.py index 650d12d4b0..b5df3f1c00 100644 --- a/pl_bolts/models/regression/logistic_regression.py +++ b/pl_bolts/models/regression/logistic_regression.py @@ -2,7 +2,7 @@ import pytorch_lightning as pl import torch -from pytorch_lightning.metrics.classification import accuracy +from pytorch_lightning.metrics.functional import accuracy from torch import nn from torch.nn import functional as F from torch.optim import Adam diff --git a/pl_bolts/models/rl/__init__.py b/pl_bolts/models/rl/__init__.py new file mode 100644 index 0000000000..cec3f871c8 --- /dev/null +++ b/pl_bolts/models/rl/__init__.py @@ -0,0 +1,10 @@ +try: + from pl_bolts.models.rl.double_dqn_model import DoubleDQN + from pl_bolts.models.rl.dqn_model import DQN + from pl_bolts.models.rl.dueling_dqn_model import DuelingDQN + from pl_bolts.models.rl.noisy_dqn_model import NoisyDQN + from pl_bolts.models.rl.per_dqn_model import PERDQN + from pl_bolts.models.rl.reinforce_model import Reinforce + from pl_bolts.models.rl.vanilla_policy_gradient_model import VanillaPolicyGradient +except ModuleNotFoundError: + pass diff --git a/pl_bolts/models/rl/common/__init__.py b/pl_bolts/models/rl/common/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/pl_bolts/models/rl/common/agents.py b/pl_bolts/models/rl/common/agents.py new file mode 100644 index 0000000000..92c5fbb8fa --- /dev/null +++ b/pl_bolts/models/rl/common/agents.py @@ -0,0 +1,131 @@ +""" +Agent module containing classes for Agent logic +Based on the implementations found here: https://github.com/Shmuma/ptan/blob/master/ptan/agent.py +""" +from abc import ABC +from typing import List + +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn + + +class Agent(ABC): + """Basic 
agent that always returns 0""" + + def __init__(self, net: nn.Module): + self.net = net + + def __call__(self, state: torch.Tensor, device: str, *args, **kwargs) -> List[int]: + """ + Using the given network, decide what action to carry + Args: + state: current state of the environment + device: device used for current batch + Returns: + action + """ + return [0] + + +class ValueAgent(Agent): + """Value based agent that returns an action based on the Q values from the network""" + + def __init__( + self, + net: nn.Module, + action_space: int, + eps_start: float = 1.0, + eps_end: float = 0.2, + eps_frames: float = 1000, + ): + super().__init__(net) + self.action_space = action_space + self.eps_start = eps_start + self.epsilon = eps_start + self.eps_end = eps_end + self.eps_frames = eps_frames + + @torch.no_grad() + def __call__(self, state: torch.Tensor, device: str) -> List[int]: + """ + Takes in the current state and returns the action based on the agents policy + Args: + state: current state of the environment + device: the device used for the current batch + Returns: + action defined by policy + """ + if not isinstance(state, list): + state = [state] + + if np.random.random() < self.epsilon: + action = self.get_random_action(state) + else: + action = self.get_action(state, device) + + return action + + def get_random_action(self, state: torch.Tensor) -> int: + """returns a random action""" + actions = [] + + for i in range(len(state)): + action = np.random.randint(0, self.action_space) + actions.append(action) + + return actions + + def get_action(self, state: torch.Tensor, device: torch.device): + """ + Returns the best action based on the Q values of the network + Args: + state: current state of the environment + device: the device used for the current batch + Returns: + action defined by Q values + """ + if not isinstance(state, torch.Tensor): + state = torch.tensor(state, device=device) + + q_values = self.net(state) + _, actions = torch.max(q_values, dim=1) + return actions.detach().cpu().numpy() + + def update_epsilon(self, step: int) -> None: + """ + Updates the epsilon value based on the current step + Args: + step: current global step + """ + self.epsilon = max(self.eps_end, self.eps_start - (step + 1) / self.eps_frames) + + +class PolicyAgent(Agent): + """Policy based agent that returns an action based on the networks policy""" + + @torch.no_grad() + def __call__(self, states: torch.Tensor, device: str) -> List[int]: + """ + Takes in the current state and returns the action based on the agents policy + Args: + states: current state of the environment + device: the device used for the current batch + Returns: + action defined by policy + """ + if not isinstance(states, list): + states = [states] + + if not isinstance(states, torch.Tensor): + states = torch.tensor(states, device=device) + + # get the logits and pass through softmax for probability distribution + probabilities = F.softmax(self.net(states)).squeeze(dim=-1) + prob_np = probabilities.data.cpu().numpy() + + # take the numpy values and randomly select action based on prob distribution + actions = [np.random.choice(len(prob), p=prob) for prob in prob_np] + + return actions diff --git a/pl_bolts/models/rl/common/cli.py b/pl_bolts/models/rl/common/cli.py new file mode 100644 index 0000000000..a663c8acd8 --- /dev/null +++ b/pl_bolts/models/rl/common/cli.py @@ -0,0 +1,34 @@ +"""Contains generic arguments used for all models""" + +import argparse + + +def add_base_args(parent) -> argparse.ArgumentParser: + """ + Adds 
the base arguments shared by the RL models + + Note: these params are fine-tuned for Pong env + + Args: + parent: parent argument parser + """ + arg_parser = argparse.ArgumentParser(parents=[parent]) + + arg_parser.add_argument("--algo", type=str, default="dqn", help="algorithm to use for training") + arg_parser.add_argument("--batch_size", type=int, default=32, help="size of the batches") + arg_parser.add_argument("--lr", type=float, default=1e-4, help="learning rate") + + arg_parser.add_argument("--env", type=str, required=True, help="gym environment tag") + arg_parser.add_argument("--gamma", type=float, default=0.99, help="discount factor") + + arg_parser.add_argument("--episode_length", type=int, default=500, help="max length of an episode") + arg_parser.add_argument("--max_episode_reward", type=int, default=18, help="max episode reward in the environment") + arg_parser.add_argument("--n_steps", type=int, default=4, help="how many steps to unroll for each update",) + arg_parser.add_argument("--seed", type=int, default=123, help="seed for training run") + arg_parser.add_argument("--epoch_len", type=int, default=1000, help="how many batches per epoch") + arg_parser.add_argument("--num_envs", type=int, default=1, help="number of environments to run at once") + arg_parser.add_argument("--avg_reward_len", type=int, default=100, + help="how many episodes to include in avg reward") + + return arg_parser diff --git a/pl_bolts/models/rl/common/gym_wrappers.py b/pl_bolts/models/rl/common/gym_wrappers.py new file mode 100644 index 0000000000..8f492a27c1 --- /dev/null +++ b/pl_bolts/models/rl/common/gym_wrappers.py @@ -0,0 +1,207 @@ +""" +Set of wrapper functions for gym environments taken from +https://github.com/Shmuma/ptan/blob/master/ptan/common/wrappers.py +""" +import collections +from warnings import warn + +import gym +import gym.spaces +import numpy as np +import torch +try: + import cv2 +except ModuleNotFoundError: + warn('You want to use `openCV` which is not installed yet,' # pragma: no-cover + ' install it with `pip install opencv-python`.') + _OPENCV_AVAILABLE = False +else: + _OPENCV_AVAILABLE = True + + +class ToTensor(gym.Wrapper): + """Converts environment observations and rewards to torch tensors.""" + + def __init__(self, env=None): + super(ToTensor, self).__init__(env) + + def step(self, action): + """Take 1 step and cast to tensor""" + state, reward, done, info = self.env.step(action) + return torch.tensor(state), torch.tensor(reward), done, info + + def reset(self): + """reset the env and cast to tensor""" + return torch.tensor(self.env.reset()) + + +class FireResetEnv(gym.Wrapper): + """For environments where the user needs to press FIRE for the game to start.""" + + def __init__(self, env=None): + super(FireResetEnv, self).__init__(env) + assert env.unwrapped.get_action_meanings()[1] == "FIRE" + assert len(env.unwrapped.get_action_meanings()) >= 3 + + def step(self, action): + """Take 1 step""" + return self.env.step(action) + + def reset(self): + """reset the env""" + self.env.reset() + obs, _, done, _ = self.env.step(1) + if done: + self.env.reset() + obs, _, done, _ = self.env.step(2) + if done: + self.env.reset() + return obs + + +class MaxAndSkipEnv(gym.Wrapper): + """Return only every `skip`-th frame""" + + def __init__(self, env=None, skip=4): + super(MaxAndSkipEnv, self).__init__(env) + # most recent raw observations (for max pooling across time steps) + self._obs_buffer = collections.deque(maxlen=2) +
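# A minimal illustration (not part of the diff above, dummy frame values only): the deque
# is capped at two entries so that MaxAndSkipEnv.step() below can take a pixel-wise maximum
# over the two most recent raw frames, which removes the sprite flicker some Atari games show.
import numpy as np

frame_a = np.array([[0, 255], [0, 0]], dtype=np.uint8)    # sprite visible only in frame A
frame_b = np.array([[0, 0], [255, 0]], dtype=np.uint8)    # sprite visible only in frame B
max_frame = np.max(np.stack([frame_a, frame_b]), axis=0)  # both sprites survive the merge
# max_frame == [[0, 255], [255, 0]], matching the np.max(np.stack(...), axis=0) call in step()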
self._skip = skip + + def step(self, action): + """take 1 step""" + total_reward = 0.0 + done = None + for _ in range(self._skip): + obs, reward, done, info = self.env.step(action) + self._obs_buffer.append(obs) + total_reward += reward + if done: + break + max_frame = np.max(np.stack(self._obs_buffer), axis=0) + return max_frame, total_reward, done, info + + def reset(self): + """Clear past frame buffer and init. to first obs. from inner env.""" + self._obs_buffer.clear() + obs = self.env.reset() + self._obs_buffer.append(obs) + return obs + + +class ProcessFrame84(gym.ObservationWrapper): + """preprocessing images from env""" + + def __init__(self, env=None): + + if not _OPENCV_AVAILABLE: + raise ModuleNotFoundError('This class uses OpenCV which it is not installed yet.') + + super(ProcessFrame84, self).__init__(env) + self.observation_space = gym.spaces.Box( + low=0, high=255, shape=(84, 84, 1), dtype=np.uint8 + ) + + def observation(self, obs): + """preprocess the obs""" + return ProcessFrame84.process(obs) + + @staticmethod + def process(frame): + """image preprocessing, formats to 84x84""" + if frame.size == 210 * 160 * 3: + img = np.reshape(frame, [210, 160, 3]).astype(np.float32) + elif frame.size == 250 * 160 * 3: + img = np.reshape(frame, [250, 160, 3]).astype(np.float32) + else: + assert False, "Unknown resolution." + img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114 + resized_screen = cv2.resize(img, (84, 110), interpolation=cv2.INTER_AREA) + x_t = resized_screen[18:102, :] + x_t = np.reshape(x_t, [84, 84, 1]) + return x_t.astype(np.uint8) + + +class ImageToPyTorch(gym.ObservationWrapper): + """converts image to pytorch format""" + + def __init__(self, env): + + if not _OPENCV_AVAILABLE: + raise ModuleNotFoundError('This class uses OpenCV which it is not installed yet.') + + super(ImageToPyTorch, self).__init__(env) + old_shape = self.observation_space.shape + new_shape = (old_shape[-1], old_shape[0], old_shape[1]) + self.observation_space = gym.spaces.Box( + low=0.0, high=1.0, shape=new_shape, dtype=np.float32 + ) + + @staticmethod + def observation(observation): + """convert observation""" + return np.moveaxis(observation, 2, 0) + + +class ScaledFloatFrame(gym.ObservationWrapper): + """scales the pixels""" + + @staticmethod + def observation(obs): + return np.array(obs).astype(np.float32) / 255.0 + + +class BufferWrapper(gym.ObservationWrapper): + """"Wrapper for image stacking""" + + def __init__(self, env, n_steps, dtype=np.float32): + super(BufferWrapper, self).__init__(env) + self.dtype = dtype + self.buffer = None + old_space = env.observation_space + self.observation_space = gym.spaces.Box( + old_space.low.repeat(n_steps, axis=0), + old_space.high.repeat(n_steps, axis=0), + dtype=dtype, + ) + + def reset(self): + """reset env""" + self.buffer = np.zeros_like(self.observation_space.low, dtype=self.dtype) + return self.observation(self.env.reset()) + + def observation(self, observation): + """convert observation""" + self.buffer[:-1] = self.buffer[1:] + self.buffer[-1] = observation + return self.buffer + + +class DataAugmentation(gym.ObservationWrapper): + """ + Carries out basic data augmentation on the env observations + - ToTensor + - GrayScale + - RandomCrop + """ + + def __init__(self, env=None): + super().__init__(env) + self.observation_space = gym.spaces.Box( + low=0, high=255, shape=(84, 84, 1), dtype=np.uint8 + ) + + def observation(self, obs): + """preprocess the obs""" + return ProcessFrame84.process(obs) + + +def 
make_environment(env_name): + """Convert environment with wrappers""" + env = gym.make(env_name) + env = MaxAndSkipEnv(env) + env = FireResetEnv(env) + env = ProcessFrame84(env) + env = ImageToPyTorch(env) + env = BufferWrapper(env, 4) + return ScaledFloatFrame(env) diff --git a/pl_bolts/models/rl/common/memory.py b/pl_bolts/models/rl/common/memory.py new file mode 100644 index 0000000000..0cd058ee43 --- /dev/null +++ b/pl_bolts/models/rl/common/memory.py @@ -0,0 +1,313 @@ +"""Series of memory buffers sued""" + +# Named tuple for storing experience steps gathered in training +import collections +from collections import deque, namedtuple +from typing import Tuple, List, Union + +import numpy as np + +Experience = namedtuple( + "Experience", field_names=["state", "action", "reward", "done", "new_state"] +) + + +class Buffer: + """ + Basic Buffer for storing a single experience at a time + Args: + capacity: size of the buffer + """ + + def __init__(self, capacity: int) -> None: + self.buffer = deque(maxlen=capacity) + + def __len__(self) -> None: + return len(self.buffer) + + def append(self, experience: Experience) -> None: + """ + Add experience to the buffer + Args: + experience: tuple (state, action, reward, done, new_state) + """ + self.buffer.append(experience) + + # pylint: disable=unused-argument + def sample(self, *args) -> Union[Tuple, List[Tuple]]: + """ + returns everything in the buffer so far it is then reset + Returns: + a batch of tuple np arrays of state, action, reward, done, next_state + """ + states, actions, rewards, dones, next_states = zip( + *[self.buffer[idx] for idx in range(self.__len__())] + ) + + self.buffer.clear() + + return ( + np.array(states), + np.array(actions), + np.array(rewards, dtype=np.float32), + np.array(dones, dtype=np.bool), + np.array(next_states), + ) + + +class ReplayBuffer(Buffer): + """ + Replay Buffer for storing past experiences allowing the agent to learn from them + """ + + def sample(self, batch_size: int) -> Tuple: + """ + Takes a sample of the buffer + Args: + batch_size: current batch_size + Returns: + a batch of tuple np arrays of state, action, reward, done, next_state + """ + + indices = np.random.choice(len(self.buffer), batch_size, replace=False) + states, actions, rewards, dones, next_states = zip( + *[self.buffer[idx] for idx in indices] + ) + + return ( + np.array(states), + np.array(actions), + np.array(rewards, dtype=np.float32), + np.array(dones, dtype=np.bool), + np.array(next_states), + ) + + +class MultiStepBuffer(ReplayBuffer): + """ + N Step Replay Buffer + + Args: + capacity: max number of experiences that will be stored in the buffer + n_steps: number of steps used for calculating discounted reward/experience + gamma: discount factor when calculating n_step discounted reward of the experience being stored in buffer + """ + + def __init__(self, capacity: int, n_steps: int = 1, gamma: float = 0.99) -> None: + super().__init__(capacity) + + self.n_steps = n_steps + self.gamma = gamma + self.history = deque(maxlen=self.n_steps) + self.exp_history_queue = deque() + + def append(self, exp: Experience) -> None: + """ + Add experience to the buffer + Args: + exp: tuple (state, action, reward, done, new_state) + """ + self.update_history_queue(exp) # add single step experience to history + while self.exp_history_queue: # go through all the n_steps that have been queued + experiences = self.exp_history_queue.popleft() # get the latest n_step experience from queue + + last_exp_state, tail_experiences = 
self.split_head_tail_exp(experiences) + + total_reward = self.discount_rewards(tail_experiences) + + n_step_exp = Experience(state=experiences[0].state, action=experiences[0].action, reward=total_reward, + done=experiences[0].done, new_state=last_exp_state) + + self.buffer.append(n_step_exp) # add n_step experience to buffer + + def update_history_queue(self, exp) -> None: + """ + Updates the experience history queue with the lastest experiences. In the event of an experience step is in + the done state, the history will be incrementally appended to the queue, removing the tail of the history + each time. + Args: + env_idx: index of the environment + exp: the current experience + history: history of experience steps for this environment + """ + self.history.append(exp) + + # If there is a full history of step, append history to queue + if len(self.history) == self.n_steps: + self.exp_history_queue.append(list(self.history)) + + if exp.done: + if 0 < len(self.history) < self.n_steps: + self.exp_history_queue.append(list(self.history)) + + # generate tail of history, incrementally append history to queue + while len(self.history) > 2: + self.history.popleft() + self.exp_history_queue.append(list(self.history)) + + # when there are only 2 experiences left in the history, + # append to the queue then update the env stats and reset the environment + if len(self.history) > 1: + self.history.popleft() + self.exp_history_queue.append(list(self.history)) + + # Clear that last tail in the history once all others have been added to the queue + self.history.clear() + + def split_head_tail_exp(self, experiences: Tuple[Experience]) -> Tuple[List, Tuple[Experience]]: + """ + Takes in a tuple of experiences and returns the last state and tail experiences based on + if the last state is the end of an episode + Args: + experiences: Tuple of N Experience + Returns: + last state (Array or None) and remaining Experience + """ + last_exp_state = experiences[-1].new_state + tail_experiences = experiences + + if experiences[-1].done and len(experiences) <= self.n_steps: + tail_experiences = experiences + + return last_exp_state, tail_experiences + + def discount_rewards(self, experiences: Tuple[Experience]) -> float: + """ + Calculates the discounted reward over N experiences + Args: + experiences: Tuple of Experience + Returns: + total discounted reward + """ + total_reward = 0.0 + for exp in reversed(experiences): + total_reward = (self.gamma * total_reward) + exp.reward + return total_reward + + +class MeanBuffer: + """ + Stores a deque of items and calculates the mean + """ + + def __init__(self, capacity): + self.capacity = capacity + self.deque = collections.deque(maxlen=capacity) + self.sum = 0.0 + + def add(self, val: float) -> None: + """Add to the buffer""" + if len(self.deque) == self.capacity: + self.sum -= self.deque[0] + self.deque.append(val) + self.sum += val + + def mean(self) -> float: + """Retrieve the mean""" + if not self.deque: + return 0.0 + return self.sum / len(self.deque) + + +class PERBuffer(ReplayBuffer): + """ + simple list based Prioritized Experience Replay Buffer + Based on implementation found here: + https://github.com/Shmuma/ptan/blob/master/ptan/experience.py#L371 + """ + + def __init__(self, buffer_size, prob_alpha=0.6, beta_start=0.4, beta_frames=100000): + super().__init__(capacity=buffer_size) + self.beta_start = beta_start + self.beta = beta_start + self.beta_frames = beta_frames + self.prob_alpha = prob_alpha + self.capacity = buffer_size + self.pos = 0 + self.buffer = [] 
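# A small worked example (illustration only, not part of the diff): the discount_rewards()
# helpers above fold the n-step return from the newest experience backwards, i.e. for
# per-step rewards [r0, r1, r2] they return r0 + gamma * r1 + gamma**2 * r2.
gamma = 0.99
step_rewards = [1.0, 1.0, 1.0]
n_step_return = 0.0
for r in reversed(step_rewards):
    n_step_return = gamma * n_step_return + r
# n_step_return == 1.0 + 0.99 + 0.99 ** 2 == 2.9701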
+ self.priorities = np.zeros((buffer_size,), dtype=np.float32) + + def update_beta(self, step) -> float: + """ + Update the beta value which accounts for the bias in the PER + Args: + step: current global step + Returns: + beta value for this indexed experience + """ + beta_val = self.beta_start + step * (1.0 - self.beta_start) / self.beta_frames + self.beta = min(1.0, beta_val) + + return self.beta + + def append(self, exp) -> None: + """ + Adds experiences from exp_source to the PER buffer + Args: + exp: experience tuple being added to the buffer + """ + # what is the max priority for new sample + max_prio = self.priorities.max() if self.buffer else 1.0 + + if len(self.buffer) < self.capacity: + self.buffer.append(exp) + else: + self.buffer[self.pos] = exp + + # the priority for the latest sample is set to max priority so it will be resampled soon + self.priorities[self.pos] = max_prio + + # update position, loop back if it reaches the end + self.pos = (self.pos + 1) % self.capacity + + def sample(self, batch_size=32) -> Tuple: + """ + Takes a prioritized sample from the buffer + Args: + batch_size: size of sample + Returns: + sample of experiences chosen with ranked probability + """ + # get list of priority rankings + if len(self.buffer) == self.capacity: + prios = self.priorities + else: + prios = self.priorities[: self.pos] + + # probability to the power of alpha to weight how important that probability it, 0 = normal distirbution + probs = prios ** self.prob_alpha + probs /= probs.sum() + + # choise sample of indices based on the priority prob distribution + indices = np.random.choice(len(self.buffer), batch_size, p=probs) + # samples = [self.buffer[idx] for idx in indices] + states, actions, rewards, dones, next_states = zip( + *[self.buffer[idx] for idx in indices] + ) + + samples = ( + np.array(states), + np.array(actions), + np.array(rewards, dtype=np.float32), + np.array(dones, dtype=np.bool), + np.array(next_states), + ) + total = len(self.buffer) + + # weight of each sample datum to compensate for the bias added in with prioritising samples + weights = (total * probs[indices]) ** (-self.beta) + weights /= weights.max() + + # return the samples, the indices chosen and the weight of each datum in the sample + return samples, indices, np.array(weights, dtype=np.float32) + + def update_priorities(self, batch_indices: List, batch_priorities: List) -> None: + """ + Update the priorities from the last batch, this should be called after the loss for this batch has been + calculated. 
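# A rough usage sketch (assumptions: `buffer` is an already populated PERBuffer and `net`,
# `target_net` are Q-networks such as the CNNs defined later in this diff; in the actual
# Lightning module the DataLoader performs the numpy-to-tensor collation shown here).
# It illustrates the intended round trip between sample(), per_dqn_loss() and update_priorities():
import torch
from pl_bolts.losses.rl import per_dqn_loss

samples, indices, weights = buffer.sample(batch_size=32)
batch = tuple(torch.tensor(x) for x in samples)    # states, actions, rewards, dones, next_states
loss, new_priorities = per_dqn_loss(batch, weights, net, target_net, gamma=0.99)
loss.backward()                                    # optimiser step omitted for brevity
buffer.update_priorities(indices, new_priorities)  # feed the per-sample losses back as priorities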
+ Args: + batch_indices: index of each datum in the batch + batch_priorities: priority of each datum in the batch + """ + for idx, prio in zip(batch_indices, batch_priorities): + self.priorities[idx] = prio diff --git a/pl_bolts/models/rl/common/networks.py b/pl_bolts/models/rl/common/networks.py new file mode 100644 index 0000000000..4776424d39 --- /dev/null +++ b/pl_bolts/models/rl/common/networks.py @@ -0,0 +1,317 @@ +"""Series of networks used +Based on implementations found here: +""" +import math +from typing import Tuple + +import numpy as np +import torch +from torch import Tensor +from torch import nn +from torch.nn import functional as F + + +class CNN(nn.Module): + """ + Simple MLP network + Args: + input_shape: observation shape of the environment + n_actions: number of discrete actions available in the environment + """ + + def __init__(self, input_shape, n_actions): + super(CNN, self).__init__() + + self.conv = nn.Sequential( + nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), + nn.ReLU(), + nn.Conv2d(32, 64, kernel_size=4, stride=2), + nn.ReLU(), + nn.Conv2d(64, 64, kernel_size=3, stride=1), + nn.ReLU(), + ) + + conv_out_size = self._get_conv_out(input_shape) + self.head = nn.Sequential( + nn.Linear(conv_out_size, 512), nn.ReLU(), nn.Linear(512, n_actions) + ) + + def _get_conv_out(self, shape) -> int: + """ + Calculates the output size of the last conv layer + Args: + shape: input dimensions + Returns: + size of the conv output + """ + conv_out = self.conv(torch.zeros(1, *shape)) + return int(np.prod(conv_out.size())) + + def forward(self, input_x) -> Tensor: + """ + Forward pass through network + Args: + x: input to network + Returns: + output of network + """ + conv_out = self.conv(input_x).view(input_x.size()[0], -1) + return self.head(conv_out) + + +class MLP(nn.Module): + """ + Simple MLP network + Args: + input_shape: observation shape of the environment + n_actions: number of discrete actions available in the environment + hidden_size: size of hidden layers + """ + + def __init__(self, input_shape: Tuple, n_actions: int, hidden_size: int = 128): + super(MLP, self).__init__() + self.net = nn.Sequential( + nn.Linear(input_shape[0], hidden_size), + nn.ReLU(), + nn.Linear(hidden_size, n_actions), + ) + + def forward(self, input_x): + """ + Forward pass through network + Args: + x: input to network + Returns: + output of network + """ + return self.net(input_x.float()) + + +class DuelingMLP(nn.Module): + """ + MLP network with duel heads for val and advantage + Args: + input_shape: observation shape of the environment + n_actions: number of discrete actions available in the environment + hidden_size: size of hidden layers + """ + + def __init__(self, input_shape: Tuple, n_actions: int, hidden_size: int = 128): + super(DuelingMLP, self).__init__() + + self.net = nn.Sequential( + nn.Linear(input_shape[0], hidden_size), + nn.ReLU(), + nn.Linear(hidden_size, hidden_size), + ) + + self.head_adv = nn.Sequential( + nn.Linear(hidden_size, hidden_size), + nn.ReLU(), + nn.Linear(hidden_size, n_actions), + ) + self.head_val = nn.Sequential( + nn.Linear(hidden_size, 256), nn.ReLU(), nn.Linear(256, 1) + ) + + def forward(self, input_x): + """ + Forward pass through network. 
Calculates the Q using the value and advantage + Args: + x: input to network + Returns: + Q value + """ + adv, val = self.adv_val(input_x) + q_val = val + (adv - adv.mean(dim=1, keepdim=True)) + return q_val + + def adv_val(self, input_x) -> Tuple[Tensor, Tensor]: + """ + Gets the advantage and value by passing out of the base network through the + value and advantage heads + Args: + input_x: input to network + Returns: + advantage, value + """ + float_x = input_x.float() + base_out = self.net(float_x) + return self.fc_adv(base_out), self.fc_val(base_out) + + +class DuelingCNN(nn.Module): + """ + CNN network with duel heads for val and advantage + Args: + input_shape: observation shape of the environment + n_actions: number of discrete actions available in the environment + hidden_size: size of hidden layers + """ + + def __init__(self, input_shape: Tuple, n_actions: int, _: int = 128): + + super().__init__() + + self.conv = nn.Sequential( + nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), + nn.ReLU(), + nn.Conv2d(32, 64, kernel_size=4, stride=2), + nn.ReLU(), + nn.Conv2d(64, 64, kernel_size=3, stride=1), + nn.ReLU(), + ) + + conv_out_size = self._get_conv_out(input_shape) + + # advantage head + self.head_adv = nn.Sequential( + nn.Linear(conv_out_size, 256), nn.ReLU(), nn.Linear(256, n_actions) + ) + + # value head + self.head_val = nn.Sequential( + nn.Linear(conv_out_size, 256), nn.ReLU(), nn.Linear(256, 1) + ) + + def _get_conv_out(self, shape) -> int: + """ + Calculates the output size of the last conv layer + Args: + shape: input dimensions + Returns: + size of the conv output + """ + conv_out = self.conv(torch.zeros(1, *shape)) + return int(np.prod(conv_out.size())) + + def forward(self, input_x): + """ + Forward pass through network. Calculates the Q using the value and advantage + Args: + input_x: input to network + Returns: + Q value + """ + adv, val = self.adv_val(input_x) + q_val = val + (adv - adv.mean(dim=1, keepdim=True)) + return q_val + + def adv_val(self, input_x): + """ + Gets the advantage and value by passing out of the base network through the + value and advantage heads + Args: + input_x: input to network + Returns: + advantage, value + """ + float_x = input_x.float() + base_out = self.conv(input_x).view(float_x.size()[0], -1) + return self.head_adv(base_out), self.head_val(base_out) + + +class NoisyCNN(nn.Module): + """ + CNN with Noisy Linear layers for exploration + Args: + input_shape: observation shape of the environment + n_actions: number of discrete actions available in the environment + """ + + def __init__(self, input_shape, n_actions): + super().__init__() + + self.conv = nn.Sequential( + nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), + nn.ReLU(), + nn.Conv2d(32, 64, kernel_size=4, stride=2), + nn.ReLU(), + nn.Conv2d(64, 64, kernel_size=3, stride=1), + nn.ReLU(), + ) + + conv_out_size = self._get_conv_out(input_shape) + self.head = nn.Sequential( + NoisyLinear(conv_out_size, 512), nn.ReLU(), NoisyLinear(512, n_actions) + ) + + def _get_conv_out(self, shape) -> int: + """ + Calculates the output size of the last conv layer + Args: + shape: input dimensions + Returns: + size of the conv output + """ + conv_out = self.conv(torch.zeros(1, *shape)) + return int(np.prod(conv_out.size())) + + def forward(self, input_x) -> Tensor: + """ + Forward pass through network + Args: + x: input to network + Returns: + output of network + """ + conv_out = self.conv(input_x).view(input_x.size()[0], -1) + return self.head(conv_out) + + +################### +# 
Custom Layers # +################### + + +class NoisyLinear(nn.Linear): + """ + Noisy Layer using Independent Gaussian Noise. + based on https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Second-Edition/blob/master/ + Chapter08/lib/dqn_extra.py#L19 + Args: + in_features: number of inputs + out_features: number of outputs + sigma_init: initial fill value of noisy weights + bias: flag to include bias to linear layer + """ + + def __init__(self, in_features, out_features, sigma_init=0.017, bias=True): + super(NoisyLinear, self).__init__(in_features, out_features, bias=bias) + + weights = torch.full((out_features, in_features), sigma_init) + self.sigma_weight = nn.Parameter(weights) + epsilon_weight = torch.zeros(out_features, in_features) + self.register_buffer("epsilon_weight", epsilon_weight) + + if bias: + bias = torch.full((out_features,), sigma_init) + self.sigma_bias = nn.Parameter(bias) + epsilon_bias = torch.zeros(out_features) + self.register_buffer("epsilon_bias", epsilon_bias) + + self.reset_parameters() + + def reset_parameters(self) -> None: + """initializes or resets the paramseter of the layer""" + std = math.sqrt(3 / self.in_features) + self.weight.data.uniform_(-std, std) + self.bias.data.uniform_(-std, std) + + def forward(self, input_x: Tensor) -> Tensor: + """ + Forward pass of the layer + Args: + input_x: input tensor + Returns: + output of the layer + """ + self.epsilon_weight.normal_() + bias = self.bias + if bias is not None: + self.epsilon_bias.normal_() + bias = bias + self.sigma_bias * self.epsilon_bias.data + + noisy_weights = self.sigma_weight * self.epsilon_weight.data + self.weight + + return F.linear(input_x, noisy_weights, bias) diff --git a/pl_bolts/models/rl/double_dqn_model.py b/pl_bolts/models/rl/double_dqn_model.py new file mode 100644 index 0000000000..f31ae16c6d --- /dev/null +++ b/pl_bolts/models/rl/double_dqn_model.py @@ -0,0 +1,123 @@ +""" +Double DQN +""" +import argparse +from collections import OrderedDict +from typing import Tuple + +import pytorch_lightning as pl +import torch + +from pl_bolts.losses.rl import double_dqn_loss +from pl_bolts.models.rl.dqn_model import DQN + + +class DoubleDQN(DQN): + """ + Double Deep Q-network (DDQN) + PyTorch Lightning implementation of `Double DQN `_ + + Paper authors: Hado van Hasselt, Arthur Guez, David Silver + + Model implemented by: + + - `Donal Byrne ` + + Example: + + >>> from pl_bolts.models.rl.double_dqn_model import DoubleDQN + ... + >>> model = DoubleDQN("PongNoFrameskip-v4") + + Train:: + + trainer = Trainer() + trainer.fit(model) + + Args: + env: gym environment tag + gpus: number of gpus being used + eps_start: starting value of epsilon for the epsilon-greedy exploration + eps_end: final value of epsilon for the epsilon-greedy exploration + eps_last_frame: the final frame in for the decrease of epsilon. 
At this frame espilon = eps_end + sync_rate: the number of iterations between syncing up the target network with the train network + gamma: discount factor + lr: learning rate + batch_size: size of minibatch pulled from the DataLoader + replay_size: total capacity of the replay buffer + warm_start_size: how many random steps through the environment to be carried out at the start of + training to fill the buffer with a starting point + sample_len: the number of samples to pull from the dataset iterator and feed to the DataLoader + + Note: + This example is based on + https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Second-Edition/blob/master/Chapter08/03_dqn_double.py + + Note: + Currently only supports CPU and single GPU training with `distributed_backend=dp` + """ + + def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], _) -> OrderedDict: + """ + Carries out a single step through the environment to update the replay buffer. + Then calculates loss based on the minibatch recieved + Args: + batch: current mini batch of replay data + _: batch number, not used + Returns: + Training loss and log metrics + """ + + # calculates training loss + loss = double_dqn_loss(batch, self.net, self.target_net) + + if self.trainer.use_dp or self.trainer.use_ddp2: + loss = loss.unsqueeze(0) + + # Soft update of target network + if self.global_step % self.sync_rate == 0: + self.target_net.load_state_dict(self.net.state_dict()) + + log = { + "total_reward": self.total_rewards[-1], + "avg_reward": self.avg_rewards, + "train_loss": loss, + # "episodes": self.total_episode_steps, + } + status = { + "steps": self.global_step, + "avg_reward": self.avg_rewards, + "total_reward": self.total_rewards[-1], + "episodes": self.done_episodes, + # "episode_steps": self.episode_steps, + "epsilon": self.agent.epsilon, + } + + return OrderedDict( + { + "loss": loss, + "avg_reward": self.avg_rewards, + "log": log, + "progress_bar": status, + } + ) + + +def cli_main(): + parser = argparse.ArgumentParser(add_help=False) + + # trainer args + parser = pl.Trainer.add_argparse_args(parser) + + # model args + parser = DoubleDQN.add_model_specific_args(parser) + args = parser.parse_args() + + model = DoubleDQN(**args.__dict__) + + trainer = pl.Trainer.from_argparse_args(args) + trainer.fit(model) + + +if __name__ == '__main__': + cli_main() diff --git a/pl_bolts/models/rl/dqn_model.py b/pl_bolts/models/rl/dqn_model.py new file mode 100644 index 0000000000..01b4a68277 --- /dev/null +++ b/pl_bolts/models/rl/dqn_model.py @@ -0,0 +1,443 @@ +""" +Deep Q Network +""" + +import argparse +from collections import OrderedDict +from typing import Tuple, List, Dict +from warnings import warn + +import numpy as np +import pytorch_lightning as pl +import torch +import torch.optim as optim +from pytorch_lightning import seed_everything +from pytorch_lightning.callbacks import ModelCheckpoint +from torch.optim.optimizer import Optimizer +from torch.utils.data import DataLoader + +from pl_bolts.datamodules.experience_source import ExperienceSourceDataset, Experience +from pl_bolts.losses.rl import dqn_loss +from pl_bolts.models.rl.common.agents import ValueAgent +from pl_bolts.models.rl.common.memory import MultiStepBuffer +from pl_bolts.models.rl.common.networks import CNN +try: + from pl_bolts.models.rl.common.gym_wrappers import gym, make_environment +except ModuleNotFoundError: + warn('You want to use `gym` which is not installed yet,' # pragma: no-cover + ' install it with `pip install gym`.') + _GYM_AVAILABLE = 
False +else: + _GYM_AVAILABLE = True + + +class DQN(pl.LightningModule): + """ Basic DQN Model """ + + def __init__( + self, + env: str, + eps_start: float = 1.0, + eps_end: float = 0.02, + eps_last_frame: int = 150000, + sync_rate: int = 1000, + gamma: float = 0.99, + learning_rate: float = 1e-4, + batch_size: int = 32, + replay_size: int = 100000, + warm_start_size: int = 10000, + avg_reward_len: int = 100, + min_episode_reward: int = -21, + seed: int = 123, + batches_per_epoch: int = 1000, + n_steps: int = 1, + **kwargs, + ): + """ + PyTorch Lightning implementation of `DQN `_ + Paper authors: Volodymyr Mnih, Koray Kavukcuoglu, David Silver, Alex Graves, + Ioannis Antonoglou, Daan Wierstra, Martin Riedmiller. + Model implemented by: + + - `Donal Byrne ` + + Example: + >>> from pl_bolts.models.rl.dqn_model import DQN + ... + >>> model = DQN("PongNoFrameskip-v4") + + Train:: + + trainer = Trainer() + trainer.fit(model) + + Args: + env: gym environment tag + eps_start: starting value of epsilon for the epsilon-greedy exploration + eps_end: final value of epsilon for the epsilon-greedy exploration + eps_last_frame: the final frame in for the decrease of epsilon. At this frame espilon = eps_end + sync_rate: the number of iterations between syncing up the target network with the train network + gamma: discount factor + learning_rate: learning rate + batch_size: size of minibatch pulled from the DataLoader + replay_size: total capacity of the replay buffer + warm_start_size: how many random steps through the environment to be carried out at the start of + training to fill the buffer with a starting point + avg_reward_len: how many episodes to take into account when calculating the avg reward + min_episode_reward: the minimum score that can be achieved in an episode. 
Used for filling the avg buffer + before training begins + seed: seed value for all RNG used + batches_per_epoch: number of batches per epoch + n_steps: size of n step look ahead + + Note: + This example is based on: + https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Second-Edition/blob/master/Chapter06/02_dqn_pong.py + + Note: + Currently only supports CPU and single GPU training with `distributed_backend=dp` + """ + super().__init__() + + # Environment + self.exp = None + self.env = self.make_environment(env, seed) + self.test_env = self.make_environment(env) + + self.obs_shape = self.env.observation_space.shape + self.n_actions = self.env.action_space.n + + # Model Attributes + self.buffer = None + self.dataset = None + + self.net = None + self.target_net = None + self.build_networks() + + self.agent = ValueAgent( + self.net, + self.n_actions, + eps_start=eps_start, + eps_end=eps_end, + eps_frames=eps_last_frame, + ) + + # Hyperparameters + self.sync_rate = sync_rate + self.gamma = gamma + self.lr = learning_rate + self.batch_size = batch_size + self.replay_size = replay_size + self.warm_start_size = warm_start_size + self.batches_per_epoch = batches_per_epoch + self.n_steps = n_steps + + self.save_hyperparameters() + + # Metrics + self.total_episode_steps = [0] + self.total_rewards = [0] + self.done_episodes = 0 + self.total_steps = 0 + + # Average Rewards + self.avg_reward_len = avg_reward_len + + for _ in range(avg_reward_len): + self.total_rewards.append( + torch.tensor(min_episode_reward, device=self.device) + ) + + self.avg_rewards = float( + np.mean(self.total_rewards[-self.avg_reward_len:]) + ) + + self.state = self.env.reset() + + def run_n_episodes(self, env, n_epsiodes: int = 1, epsilon: float = 1.0) -> List[int]: + """ + Carries out N episodes of the environment with the current agent + Args: + env: environment to use, either train environment or test environment + n_epsiodes: number of episodes to run + epsilon: epsilon value for DQN agent + """ + total_rewards = [] + + for _ in range(n_epsiodes): + episode_state = env.reset() + done = False + episode_reward = 0 + + while not done: + self.agent.epsilon = epsilon + action = self.agent(episode_state, self.device) + next_state, reward, done, _ = self.env.step(action[0]) + episode_state = next_state + episode_reward += reward + + total_rewards.append(episode_reward) + + return total_rewards + + def populate(self, warm_start: int) -> None: + """Populates the buffer with initial experience""" + if warm_start > 0: + self.state = self.env.reset() + + for _ in range(warm_start): + self.agent.epsilon = 1.0 + action = self.agent(self.state, self.device) + next_state, reward, done, _ = self.env.step(action[0]) + exp = Experience(state=self.state, action=action[0], reward=reward, done=done, new_state=next_state) + self.buffer.append(exp) + self.state = next_state + + if done: + self.state = self.env.reset() + + def build_networks(self) -> None: + """Initializes the DQN train and target networks""" + self.net = CNN(self.obs_shape, self.n_actions) + self.target_net = CNN(self.obs_shape, self.n_actions) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Passes in a state x through the network and gets the q_values of each action as an output + Args: + x: environment state + Returns: + q values + """ + output = self.net(x) + return output + + def train_batch( + self, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Contains the logic for generating a new batch of 
data to be passed to the DataLoader + Returns: + yields a Experience tuple containing the state, action, reward, done and next_state. + """ + episode_reward = 0 + episode_steps = 0 + + while True: + self.total_steps += 1 + action = self.agent(self.state, self.device) + + next_state, r, is_done, _ = self.env.step(action[0]) + + episode_reward += r + episode_steps += 1 + + exp = Experience(state=self.state, action=action[0], reward=r, done=is_done, new_state=next_state) + + self.agent.update_epsilon(self.global_step) + self.buffer.append(exp) + self.state = next_state + + if is_done: + self.done_episodes += 1 + self.total_rewards.append(episode_reward) + self.total_episode_steps.append(episode_steps) + self.avg_rewards = float( + np.mean(self.total_rewards[-self.avg_reward_len:]) + ) + self.state = self.env.reset() + episode_steps = 0 + episode_reward = 0 + + states, actions, rewards, dones, new_states = self.buffer.sample(self.batch_size) + + for idx, _ in enumerate(dones): + yield states[idx], actions[idx], rewards[idx], dones[idx], new_states[idx] + + # Simulates epochs + if self.total_steps % self.batches_per_epoch == 0: + break + + def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], _) -> OrderedDict: + """ + Carries out a single step through the environment to update the replay buffer. + Then calculates loss based on the minibatch recieved + Args: + batch: current mini batch of replay data + _: batch number, not used + Returns: + Training loss and log metrics + """ + + # calculates training loss + loss = dqn_loss(batch, self.net, self.target_net) + + if self.trainer.use_dp or self.trainer.use_ddp2: + loss = loss.unsqueeze(0) + + # Soft update of target network + if self.global_step % self.sync_rate == 0: + self.target_net.load_state_dict(self.net.state_dict()) + + log = { + "total_reward": self.total_rewards[-1], + "avg_reward": self.avg_rewards, + "train_loss": loss, + "episodes": self.done_episodes, + "episode_steps": self.total_episode_steps[-1] + } + status = { + "steps": self.global_step, + "avg_reward": self.avg_rewards, + "total_reward": self.total_rewards[-1], + "episodes": self.done_episodes, + "episode_steps": self.total_episode_steps[-1], + "epsilon": self.agent.epsilon, + } + + return OrderedDict( + { + "loss": loss, + "avg_reward": self.avg_rewards, + "log": log, + "progress_bar": status, + } + ) + + def test_step(self, *args, **kwargs) -> Dict[str, torch.Tensor]: + """Evaluate the agent for 10 episodes""" + test_reward = self.run_n_episodes(self.test_env, 1, 0) + avg_reward = sum(test_reward) / len(test_reward) + return {"test_reward": avg_reward} + + def test_epoch_end(self, outputs) -> Dict[str, torch.Tensor]: + """Log the avg of the test results""" + rewards = [x["test_reward"] for x in outputs] + avg_reward = sum(rewards) / len(rewards) + tensorboard_logs = {"avg_test_reward": avg_reward} + return {"avg_test_reward": avg_reward, "log": tensorboard_logs} + + def configure_optimizers(self) -> List[Optimizer]: + """ Initialize Adam optimizer""" + optimizer = optim.Adam(self.net.parameters(), lr=self.lr) + return [optimizer] + + def _dataloader(self) -> DataLoader: + """Initialize the Replay Buffer dataset used for retrieving experiences""" + self.buffer = MultiStepBuffer(self.replay_size, self.n_steps) + self.populate(self.warm_start_size) + + self.dataset = ExperienceSourceDataset(self.train_batch) + return DataLoader(dataset=self.dataset, batch_size=self.batch_size) + + def train_dataloader(self) -> DataLoader: + """Get train loader""" + return 
self._dataloader() + + def test_dataloader(self) -> DataLoader: + """Get test loader""" + return self._dataloader() + + @staticmethod + def make_environment(env_name: str, seed: int = None) -> gym.Env: + """ + Initialise gym environment + Args: + env_name: environment name or tag + seed: value to seed the environment RNG for reproducibility + Returns: + gym environment + """ + env = make_environment(env_name) + + if seed: + env.seed(seed) + + return env + + @staticmethod + def add_model_specific_args( + arg_parser: argparse.ArgumentParser, + ) -> argparse.ArgumentParser: + """ + Adds arguments for DQN model + Note: these params are fine tuned for Pong env + Args: + arg_parser: parent parser + """ + arg_parser.add_argument( + "--sync_rate", + type=int, + default=1000, + help="how many frames do we update the target network", + ) + arg_parser.add_argument( + "--replay_size", + type=int, + default=100000, + help="capacity of the replay buffer", + ) + arg_parser.add_argument( + "--warm_start_size", + type=int, + default=10000, + help="how many samples do we use to fill our buffer at the start of training", + ) + arg_parser.add_argument( + "--eps_last_frame", + type=int, + default=150000, + help="what frame should epsilon stop decaying", + ) + arg_parser.add_argument("--eps_start", type=float, default=1.0, help="starting value of epsilon") + arg_parser.add_argument("--eps_end", type=float, default=0.02, help="final value of epsilon") + arg_parser.add_argument("--batches_per_epoch", type=int, default=10000, help="number of batches in an epoch") + arg_parser.add_argument("--batch_size", type=int, default=32, help="size of the batches") + arg_parser.add_argument("--lr", type=float, default=1e-4, help="learning rate") + + arg_parser.add_argument("--env", type=str, required=True, help="gym environment tag") + arg_parser.add_argument("--gamma", type=float, default=0.99, help="discount factor") + + arg_parser.add_argument( + "--avg_reward_len", + type=int, + default=100, + help="how many episodes to include in avg reward", + ) + arg_parser.add_argument( + "--n_steps", + type=int, + default=1, + help="how many frames do we update the target network", + ) + + return arg_parser + + +def cli_main(): + parser = argparse.ArgumentParser(add_help=False) + + # trainer args + parser = pl.Trainer.add_argparse_args(parser) + + # model args + parser = DQN.add_model_specific_args(parser) + args = parser.parse_args() + + model = DQN(**args.__dict__) + + # save checkpoints based on avg_reward + checkpoint_callback = ModelCheckpoint( + save_top_k=1, monitor="avg_reward", mode="max", period=1, verbose=True + ) + + seed_everything(123) + trainer = pl.Trainer.from_argparse_args( + args, deterministic=True, checkpoint_callback=checkpoint_callback) + + trainer.fit(model) + + +if __name__ == '__main__': + cli_main() diff --git a/pl_bolts/models/rl/dueling_dqn_model.py b/pl_bolts/models/rl/dueling_dqn_model.py new file mode 100644 index 0000000000..79afca2fc7 --- /dev/null +++ b/pl_bolts/models/rl/dueling_dqn_model.py @@ -0,0 +1,75 @@ +""" +Dueling DQN +""" +import argparse + +import pytorch_lightning as pl + +from pl_bolts.models.rl.common.networks import DuelingCNN +from pl_bolts.models.rl.dqn_model import DQN + + +class DuelingDQN(DQN): + """ + PyTorch Lightning implementation of `Dueling DQN `_ + + Paper authors: Ziyu Wang, Tom Schaul, Matteo Hessel, Hado van Hasselt, Marc Lanctot, Nando de Freitas + + Model implemented by: + + - `Donal Byrne ` + + Example: + + >>> from pl_bolts.models.rl.dueling_dqn_model import 
DuelingDQN + ... + >>> model = DuelingDQN("PongNoFrameskip-v4") + + Train:: + + trainer = Trainer() + trainer.fit(model) + + Args: + env: gym environment tag + gpus: number of gpus being used + eps_start: starting value of epsilon for the epsilon-greedy exploration + eps_end: final value of epsilon for the epsilon-greedy exploration + eps_last_frame: the final frame in for the decrease of epsilon. At this frame espilon = eps_end + sync_rate: the number of iterations between syncing up the target network with the train network + gamma: discount factor + lr: learning rate + batch_size: size of minibatch pulled from the DataLoader + replay_size: total capacity of the replay buffer + warm_start_size: how many random steps through the environment to be carried out at the start of + training to fill the buffer with a starting point + sample_len: the number of samples to pull from the dataset iterator and feed to the DataLoader + + .. note:: Currently only supports CPU and single GPU training with `distributed_backend=dp` + + """ + + def build_networks(self) -> None: + """Initializes the Dueling DQN train and target networks""" + self.net = DuelingCNN(self.obs_shape, self.n_actions) + self.target_net = DuelingCNN(self.obs_shape, self.n_actions) + + +def cli_main(): + parser = argparse.ArgumentParser(add_help=False) + + # trainer args + parser = pl.Trainer.add_argparse_args(parser) + + # model args + parser = DuelingDQN.add_model_specific_args(parser) + args = parser.parse_args() + + model = DuelingDQN(**args.__dict__) + + trainer = pl.Trainer.from_argparse_args(args) + trainer.fit(model) + + +if __name__ == '__main__': + cli_main() diff --git a/pl_bolts/models/rl/noisy_dqn_model.py b/pl_bolts/models/rl/noisy_dqn_model.py new file mode 100644 index 0000000000..26f960c117 --- /dev/null +++ b/pl_bolts/models/rl/noisy_dqn_model.py @@ -0,0 +1,130 @@ +""" +Noisy DQN +""" +import argparse +from typing import Tuple + +import numpy as np +import pytorch_lightning as pl +import torch + +from pl_bolts.datamodules.experience_source import Experience +from pl_bolts.models.rl.common.networks import NoisyCNN +from pl_bolts.models.rl.dqn_model import DQN + + +class NoisyDQN(DQN): + """ + PyTorch Lightning implementation of `Noisy DQN `_ + + Paper authors: Meire Fortunato, Mohammad Gheshlaghi Azar, Bilal Piot, Jacob Menick, Ian Osband, Alex Graves, + Vlad Mnih, Remi Munos, Demis Hassabis, Olivier Pietquin, Charles Blundell, Shane Legg + + Model implemented by: + + - `Donal Byrne ` + + Example: + >>> from pl_bolts.models.rl.noisy_dqn_model import NoisyDQN + ... + >>> model = NoisyDQN("PongNoFrameskip-v4") + + Train:: + + trainer = Trainer() + trainer.fit(model) + + Args: + env: gym environment tag + gpus: number of gpus being used + eps_start: starting value of epsilon for the epsilon-greedy exploration + eps_end: final value of epsilon for the epsilon-greedy exploration + eps_last_frame: the final frame in for the decrease of epsilon. At this frame espilon = eps_end + sync_rate: the number of iterations between syncing up the target network with the train network + gamma: discount factor + lr: learning rate + batch_size: size of minibatch pulled from the DataLoader + replay_size: total capacity of the replay buffer + warm_start_size: how many random steps through the environment to be carried out at the start of + training to fill the buffer with a starting point + sample_len: the number of samples to pull from the dataset iterator and feed to the DataLoader + + .. 
note:: Currently only supports CPU and single GPU training with `distributed_backend=dp` + + """ + + def build_networks(self) -> None: + """Initializes the Noisy DQN train and target networks""" + self.net = NoisyCNN(self.obs_shape, self.n_actions) + self.target_net = NoisyCNN(self.obs_shape, self.n_actions) + + def on_train_start(self) -> None: + """Set the agents epsilon to 0 as the exploration comes from the network""" + self.agent.epsilon = 0.0 + + def train_batch( + self, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Contains the logic for generating a new batch of data to be passed to the DataLoader. + This is the same function as the standard DQN except that we dont update epsilon as it is always 0. The + exploration comes from the noisy network. + Returns: + yields a Experience tuple containing the state, action, reward, done and next_state. + """ + episode_reward = 0 + episode_steps = 0 + + while True: + self.total_steps += 1 + action = self.agent(self.state, self.device) + + next_state, r, is_done, _ = self.env.step(action[0]) + + episode_reward += r + episode_steps += 1 + + exp = Experience(state=self.state, action=action[0], reward=r, done=is_done, new_state=next_state) + + self.buffer.append(exp) + self.state = next_state + + if is_done: + self.done_episodes += 1 + self.total_rewards.append(episode_reward) + self.total_episode_steps.append(episode_steps) + self.avg_rewards = float( + np.mean(self.total_rewards[-self.avg_reward_len:]) + ) + self.state = self.env.reset() + episode_steps = 0 + episode_reward = 0 + + states, actions, rewards, dones, new_states = self.buffer.sample(self.batch_size) + + for idx, _ in enumerate(dones): + yield states[idx], actions[idx], rewards[idx], dones[idx], new_states[idx] + + # Simulates epochs + if self.total_steps % self.batches_per_epoch == 0: + break + + +def cli_main(): + parser = argparse.ArgumentParser(add_help=False) + + # trainer args + parser = pl.Trainer.add_argparse_args(parser) + + # model args + parser = NoisyDQN.add_model_specific_args(parser) + args = parser.parse_args() + + model = NoisyDQN(**args.__dict__) + + trainer = pl.Trainer.from_argparse_args(args) + trainer.fit(model) + + +if __name__ == '__main__': + cli_main() diff --git a/pl_bolts/models/rl/per_dqn_model.py b/pl_bolts/models/rl/per_dqn_model.py new file mode 100644 index 0000000000..07ad80d564 --- /dev/null +++ b/pl_bolts/models/rl/per_dqn_model.py @@ -0,0 +1,196 @@ +""" +Prioritized Experience Replay DQN +""" +import argparse +from collections import OrderedDict +from typing import Tuple + +import numpy as np +import pytorch_lightning as pl +import torch +from torch.utils.data import DataLoader + +from pl_bolts.datamodules import ExperienceSourceDataset +from pl_bolts.losses.rl import per_dqn_loss +from pl_bolts.models.rl.common.memory import PERBuffer, Experience +from pl_bolts.models.rl.dqn_model import DQN + + +class PERDQN(DQN): + """ + PyTorch Lightning implementation of `DQN With Prioritized Experience Replay `_ + + Paper authors: Tom Schaul, John Quan, Ioannis Antonoglou, David Silver + + Model implemented by: + + - `Donal Byrne ` + + Example: + + >>> from pl_bolts.models.rl.per_dqn_model import PERDQN + ... 
+ >>> model = PERDQN("PongNoFrameskip-v4") + + Train:: + + trainer = Trainer() + trainer.fit(model) + + Args: + env: gym environment tag + gpus: number of gpus being used + eps_start: starting value of epsilon for the epsilon-greedy exploration + eps_end: final value of epsilon for the epsilon-greedy exploration + eps_last_frame: the final frame in for the decrease of epsilon. At this frame espilon = eps_end + sync_rate: the number of iterations between syncing up the target network with the train network + gamma: discount factor + learning_rate: learning rate + batch_size: size of minibatch pulled from the DataLoader + replay_size: total capacity of the replay buffer + warm_start_size: how many random steps through the environment to be carried out at the start of + training to fill the buffer with a starting point + num_samples: the number of samples to pull from the dataset iterator and feed to the DataLoader + + .. note:: + This example is based on: + https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Second-Edition/blob/master/Chapter08/05_dqn_prio_replay.py + + .. note:: Currently only supports CPU and single GPU training with `distributed_backend=dp` + + """ + + def train_batch( + self, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Contains the logic for generating a new batch of data to be passed to the DataLoader + Returns: + yields a Experience tuple containing the state, action, reward, done and next_state. + """ + + episode_reward = 0 + episode_steps = 0 + + while True: + self.total_steps += 1 + action = self.agent(self.state, self.device) + + next_state, r, is_done, _ = self.env.step(action[0]) + + episode_reward += r + episode_steps += 1 + + exp = Experience( + state=self.state, + action=action[0], + reward=r, + done=is_done, + new_state=next_state, + ) + + self.agent.update_epsilon(self.global_step) + self.buffer.append(exp) + self.state = next_state + + if is_done: + self.done_episodes += 1 + self.total_rewards.append(episode_reward) + self.total_episode_steps.append(episode_steps) + self.avg_rewards = float( + np.mean(self.total_rewards[-self.avg_reward_len:]) + ) + self.state = self.env.reset() + episode_steps = 0 + episode_reward = 0 + + samples, indices, weights = self.buffer.sample(self.batch_size) + + states, actions, rewards, dones, new_states = samples + + for idx, _ in enumerate(dones): + yield ( + states[idx], + actions[idx], + rewards[idx], + dones[idx], + new_states[idx], + ), indices[idx], weights[idx] + + def training_step(self, batch, _) -> OrderedDict: + """ + Carries out a single step through the environment to update the replay buffer. 
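# --- Editor's note: illustrative sketch, not part of the diff above. ---
# The eps_start / eps_end / eps_last_frame arguments documented for the DQN family,
# together with PERDQN.train_batch calling self.agent.update_epsilon(self.global_step),
# imply an annealing schedule for the exploration rate. The actual ValueAgent code is
# not shown in this diff, so the linear decay below is only an assumed sketch of it.

def linear_epsilon(step: int, eps_start: float = 1.0, eps_end: float = 0.02,
                   eps_last_frame: int = 150_000) -> float:
    """Linearly anneal epsilon from eps_start down to eps_end over eps_last_frame steps."""
    slope = (eps_start - eps_end) / eps_last_frame
    return max(eps_end, eps_start - slope * step)

# e.g. linear_epsilon(0) -> 1.0, linear_epsilon(75_000) -> 0.51, linear_epsilon(150_000) -> 0.02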
+ Then calculates loss based on the minibatch recieved + Args: + batch: current mini batch of replay data + _: batch number, not used + Returns: + Training loss and log metrics + """ + samples, indices, weights = batch + indices = indices.cpu().numpy() + + # calculates training loss + loss, batch_weights = per_dqn_loss(samples, weights, self.net, self.target_net) + + if self.trainer.use_dp or self.trainer.use_ddp2: + loss = loss.unsqueeze(0) + + # update priorities in buffer + self.buffer.update_priorities(indices, batch_weights) + + # update of target network + if self.global_step % self.sync_rate == 0: + self.target_net.load_state_dict(self.net.state_dict()) + + log = { + "total_reward": self.total_rewards[-1], + "avg_reward": self.avg_rewards, + "train_loss": loss, + # "episodes": self.total_episode_steps, + } + status = { + "steps": self.global_step, + "avg_reward": self.avg_rewards, + "total_reward": self.total_rewards[-1], + "episodes": self.done_episodes, + # "episode_steps": self.episode_steps, + "epsilon": self.agent.epsilon, + } + + return OrderedDict( + { + "loss": loss, + "avg_reward": self.avg_rewards, + "log": log, + "progress_bar": status, + } + ) + + def _dataloader(self) -> DataLoader: + """Initialize the Replay Buffer dataset used for retrieving experiences""" + self.buffer = PERBuffer(self.replay_size) + self.populate(self.warm_start_size) + + self.dataset = ExperienceSourceDataset(self.train_batch) + return DataLoader(dataset=self.dataset, batch_size=self.batch_size) + + +def cli_main(): + parser = argparse.ArgumentParser(add_help=False) + + # trainer args + parser = pl.Trainer.add_argparse_args(parser) + + # model args + parser = PERDQN.add_model_specific_args(parser) + args = parser.parse_args() + + model = PERDQN(**args.__dict__) + + trainer = pl.Trainer.from_argparse_args(args) + trainer.fit(model) + + +if __name__ == "__main__": + cli_main() diff --git a/pl_bolts/models/rl/reinforce_model.py b/pl_bolts/models/rl/reinforce_model.py new file mode 100644 index 0000000000..55535a91e7 --- /dev/null +++ b/pl_bolts/models/rl/reinforce_model.py @@ -0,0 +1,318 @@ +import argparse +from collections import OrderedDict +from typing import Tuple, List +from warnings import warn + +import numpy as np +import pytorch_lightning as pl +import torch +import torch.optim as optim +from pytorch_lightning import seed_everything +from pytorch_lightning.callbacks import ModelCheckpoint +from torch.nn.functional import log_softmax +from torch.optim.optimizer import Optimizer +from torch.utils.data import DataLoader + +from pl_bolts.datamodules import ExperienceSourceDataset +from pl_bolts.datamodules.experience_source import Experience +from pl_bolts.models.rl.common.agents import PolicyAgent +from pl_bolts.models.rl.common.networks import MLP +try: + import gym +except ModuleNotFoundError: + warn('You want to use `gym` which is not installed yet, install it with `pip install gym`.') # pragma: no-cover + _GYM_AVAILABLE = False +else: + _GYM_AVAILABLE = True + + +class Reinforce(pl.LightningModule): + def __init__( + self, + env: str, + gamma: float = 0.99, + lr: float = 0.01, + batch_size: int = 8, + n_steps: int = 10, + avg_reward_len: int = 100, + entropy_beta: float = 0.01, + epoch_len: int = 1000, + num_batch_episodes: int = 4, + **kwargs + ) -> None: + """ + PyTorch Lightning implementation of `REINFORCE + `_ + Paper authors: Richard S. 
Sutton, David McAllester, Satinder Singh, Yishay Mansour + Model implemented by: + + - `Donal Byrne ` + + Example: + >>> from pl_bolts.models.rl.reinforce_model import Reinforce + ... + >>> model = Reinforce("CartPole-v0") + + Train:: + + trainer = Trainer() + trainer.fit(model) + + Args: + env: gym environment tag + gamma: discount factor + lr: learning rate + batch_size: size of minibatch pulled from the DataLoader + n_steps: number of stakes per discounted experience + entropy_beta: entropy coefficient + epoch_len: how many batches before pseudo epoch + num_batch_episodes: how many episodes to rollout for each batch of training + avg_reward_len: how many episodes to take into account when calculating the avg reward + + Note: + This example is based on: + https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Second-Edition/blob/master/Chapter11/02_cartpole_reinforce.py + + Note: + Currently only supports CPU and single GPU training with `distributed_backend=dp` + """ + super().__init__() + + if not _GYM_AVAILABLE: + raise ModuleNotFoundError('This Module requires gym environment which is not installed yet.') + + # Hyperparameters + self.lr = lr + self.batch_size = batch_size + self.batches_per_epoch = self.batch_size * epoch_len + self.entropy_beta = entropy_beta + self.gamma = gamma + self.n_steps = n_steps + self.num_batch_episodes = num_batch_episodes + + self.save_hyperparameters() + + # Model components + self.env = gym.make(env) + self.net = MLP(self.env.observation_space.shape, self.env.action_space.n) + self.agent = PolicyAgent(self.net) + + # Tracking metrics + self.total_steps = 0 + self.total_rewards = [0] + self.done_episodes = 0 + self.avg_rewards = 0 + self.reward_sum = 0.0 + self.batch_episodes = 0 + self.avg_reward_len = avg_reward_len + + self.batch_states = [] + self.batch_actions = [] + self.batch_qvals = [] + self.cur_rewards = [] + + self.state = self.env.reset() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Passes in a state x through the network and gets the q_values of each action as an output + Args: + x: environment state + Returns: + q values + """ + output = self.net(x) + return output + + def calc_qvals(self, rewards: List[float]) -> List[float]: + """Calculate the discounted rewards of all rewards in list + Args: + rewards: list of rewards from latest batch + Returns: + list of discounted rewards + """ + assert isinstance(rewards[0], float) + + cumul_reward = [] + sum_r = 0.0 + + for r in reversed(rewards): + sum_r = (sum_r * self.gamma) + r + cumul_reward.append(sum_r) + + return list(reversed(cumul_reward)) + + def discount_rewards(self, experiences: Tuple[Experience]) -> float: + """ + Calculates the discounted reward over N experiences + Args: + experiences: Tuple of Experience + Returns: + total discounted reward + """ + total_reward = 0.0 + for exp in reversed(experiences): + total_reward = (self.gamma * total_reward) + exp.reward + return total_reward + + def train_batch( + self, + ) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[torch.Tensor]]: + """ + Contains the logic for generating a new batch of data to be passed to the DataLoader + Yield: + yields a tuple of Lists containing tensors for states, actions and rewards of the batch. 
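# --- Editor's note: illustrative sketch, not part of the diff above. ---
# calc_qvals walks the episode rewards backwards, carrying a running discounted sum,
# so each entry ends up as the return G_t = r_t + gamma * G_{t+1}. A standalone
# version with a tiny worked example (the helper name here is hypothetical):

from typing import List

def discounted_returns(rewards: List[float], gamma: float = 0.99) -> List[float]:
    returns, running = [], 0.0
    for r in reversed(rewards):          # start from the final reward of the episode
        running = r + gamma * running    # accumulate the discounted tail
        returns.append(running)
    return list(reversed(returns))       # put the returns back into time order

# discounted_returns([1.0, 1.0, 1.0], gamma=0.9) -> [2.71, 1.9, 1.0]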
+ """ + + while True: + + action = self.agent(self.state, self.device) + + next_state, reward, done, _ = self.env.step(action[0]) + + self.batch_states.append(self.state) + self.batch_actions.append(action[0]) + self.cur_rewards.append(reward) + + self.state = next_state + self.total_steps += 1 + + if done: + self.batch_qvals.extend(self.calc_qvals(self.cur_rewards)) + self.batch_episodes += 1 + self.done_episodes += 1 + self.total_rewards.append(sum(self.cur_rewards)) + self.avg_rewards = float( + np.mean(self.total_rewards[-self.avg_reward_len:]) + ) + self.cur_rewards = [] + self.state = self.env.reset() + + if self.batch_episodes >= self.num_batch_episodes: + for state, action, qval in zip( + self.batch_states, self.batch_actions, self.batch_qvals + ): + yield state, action, qval + + self.batch_episodes = 0 + + # Simulates epochs + if self.total_steps % self.batches_per_epoch == 0: + break + + def loss(self, states, actions, scaled_rewards) -> torch.Tensor: + logits = self.net(states) + + # policy loss + log_prob = log_softmax(logits, dim=1) + log_prob_actions = scaled_rewards * log_prob[range(self.batch_size), actions] + loss = -log_prob_actions.mean() + + return loss + + def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], _) -> OrderedDict: + """ + Carries out a single step through the environment to update the replay buffer. + Then calculates loss based on the minibatch recieved + Args: + batch: current mini batch of replay data + _: batch number, not used + Returns: + Training loss and log metrics + """ + states, actions, scaled_rewards = batch + + loss = self.loss(states, actions, scaled_rewards) + + log = { + "episodes": self.done_episodes, + "reward": self.total_rewards[-1], + "avg_reward": self.avg_rewards, + } + + return OrderedDict( + { + "loss": loss, + "avg_reward": self.avg_rewards, + "log": log, + "progress_bar": log, + } + ) + + def configure_optimizers(self) -> List[Optimizer]: + """ Initialize Adam optimizer""" + optimizer = optim.Adam(self.net.parameters(), lr=self.lr) + return [optimizer] + + def _dataloader(self) -> DataLoader: + """Initialize the Replay Buffer dataset used for retrieving experiences""" + dataset = ExperienceSourceDataset(self.train_batch) + dataloader = DataLoader(dataset=dataset, batch_size=self.batch_size) + return dataloader + + def train_dataloader(self) -> DataLoader: + """Get train loader""" + return self._dataloader() + + def get_device(self, batch) -> str: + """Retrieve device currently being used by minibatch""" + return batch[0][0][0].device.index if self.on_gpu else "cpu" + + @staticmethod + def add_model_specific_args(arg_parser) -> argparse.ArgumentParser: + """ + Adds arguments for DQN model + Note: these params are fine tuned for Pong env + Args: + arg_parser: the current argument parser to add to + Returns: + arg_parser with model specific cargs added + """ + arg_parser.add_argument("--batches_per_epoch", type=int, default=10000, help="number of batches in an epoch") + arg_parser.add_argument("--batch_size", type=int, default=32, help="size of the batches") + arg_parser.add_argument("--lr", type=float, default=1e-3, help="learning rate") + + arg_parser.add_argument("--env", type=str, required=True, help="gym environment tag") + arg_parser.add_argument("--gamma", type=float, default=0.99, help="discount factor") + + arg_parser.add_argument( + "--avg_reward_len", + type=int, + default=100, + help="how many episodes to include in avg reward", + ) + + arg_parser.add_argument( + "--entropy_beta", type=float, default=0.01, 
help="entropy value", + ) + + return arg_parser + + +def cli_main(): + parser = argparse.ArgumentParser(add_help=False) + + # trainer args + parser = pl.Trainer.add_argparse_args(parser) + + # model args + parser = Reinforce.add_model_specific_args(parser) + args = parser.parse_args() + + model = Reinforce(**args.__dict__) + + # save checkpoints based on avg_reward + checkpoint_callback = ModelCheckpoint( + save_top_k=1, monitor="avg_reward", mode="max", period=1, verbose=True + ) + + seed_everything(123) + trainer = pl.Trainer.from_argparse_args( + args, deterministic=True, checkpoint_callback=checkpoint_callback + ) + trainer.fit(model) + + +if __name__ == '__main__': + cli_main() diff --git a/pl_bolts/models/rl/vanilla_policy_gradient_model.py b/pl_bolts/models/rl/vanilla_policy_gradient_model.py new file mode 100644 index 0000000000..f7d9e6586f --- /dev/null +++ b/pl_bolts/models/rl/vanilla_policy_gradient_model.py @@ -0,0 +1,306 @@ +import argparse +from collections import OrderedDict +from typing import Tuple, List +from warnings import warn + +import numpy as np +import pytorch_lightning as pl +import torch +import torch.optim as optim +from pytorch_lightning import seed_everything +from pytorch_lightning.callbacks import ModelCheckpoint +from torch.nn.functional import log_softmax, softmax +from torch.optim.optimizer import Optimizer +from torch.utils.data import DataLoader + +from pl_bolts.datamodules import ExperienceSourceDataset +from pl_bolts.models.rl.common.agents import PolicyAgent +from pl_bolts.models.rl.common.networks import MLP +try: + import gym +except ModuleNotFoundError: + warn('You want to use `gym` which is not installed yet, install it with `pip install gym`.') # pragma: no-cover + _GYM_AVAILABLE = False +else: + _GYM_AVAILABLE = True + + +class VanillaPolicyGradient(pl.LightningModule): + def __init__( + self, + env: str, + gamma: float = 0.99, + lr: float = 0.01, + batch_size: int = 8, + n_steps: int = 10, + avg_reward_len: int = 100, + entropy_beta: float = 0.01, + epoch_len: int = 1000, + **kwargs + ) -> None: + """ + PyTorch Lightning implementation of `Vanilla Policy Gradient + `_ + Paper authors: Richard S. Sutton, David McAllester, Satinder Singh, Yishay Mansour + Model implemented by: + + - `Donal Byrne ` + + Example: + >>> from pl_bolts.models.rl.vanilla_policy_gradient_model import VanillaPolicyGradient + ... 
+ >>> model = VanillaPolicyGradient("CartPole-v0") + + Train:: + trainer = Trainer() + trainer.fit(model) + + Args: + env: gym environment tag + gamma: discount factor + lr: learning rate + batch_size: size of minibatch pulled from the DataLoader + batch_episodes: how many episodes to rollout for each batch of training + entropy_beta: dictates the level of entropy per batch + avg_reward_len: how many episodes to take into account when calculating the avg reward + + Note: + This example is based on: + https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Second-Edition/blob/master/Chapter11/04_cartpole_pg.py + + Note: + Currently only supports CPU and single GPU training with `distributed_backend=dp` + """ + super().__init__() + + if not _GYM_AVAILABLE: + raise ModuleNotFoundError('This Module requires gym environment which is not installed yet.') + + # Hyperparameters + self.lr = lr + self.batch_size = batch_size + self.batches_per_epoch = self.batch_size * epoch_len + self.entropy_beta = entropy_beta + self.gamma = gamma + self.n_steps = n_steps + + self.save_hyperparameters() + + # Model components + self.env = gym.make(env) + self.net = MLP(self.env.observation_space.shape, self.env.action_space.n) + self.agent = PolicyAgent(self.net) + + # Tracking metrics + self.total_rewards = [] + self.episode_rewards = [] + self.done_episodes = 0 + self.avg_rewards = 0 + self.avg_reward_len = avg_reward_len + self.eps = np.finfo(np.float32).eps.item() + self.batch_states = [] + self.batch_actions = [] + + self.state = self.env.reset() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Passes in a state x through the network and gets the q_values of each action as an output + Args: + x: environment state + Returns: + q values + """ + output = self.net(x) + return output + + def train_batch( + self, + ) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[torch.Tensor]]: + """ + Contains the logic for generating a new batch of data to be passed to the DataLoader + Returns: + yields a tuple of Lists containing tensors for states, actions and rewards of the batch. 
+ """ + + while True: + + action = self.agent(self.state, self.device) + + next_state, reward, done, _ = self.env.step(action[0]) + + self.episode_rewards.append(reward) + self.batch_actions.append(action) + self.batch_states.append(self.state) + self.state = next_state + + if done: + self.done_episodes += 1 + self.state = self.env.reset() + self.total_rewards.append(sum(self.episode_rewards)) + self.avg_rewards = float(np.mean(self.total_rewards[-self.avg_reward_len:])) + + returns = self.compute_returns(self.episode_rewards) + + for idx in range(len(self.batch_actions)): + yield self.batch_states[idx], self.batch_actions[idx], returns[idx] + + self.batch_states = [] + self.batch_actions = [] + self.episode_rewards = [] + + def compute_returns(self, rewards): + """ + Calculate the discounted rewards of the batched rewards + + Args: + rewards: list of batched rewards + + Returns: + list of discounted rewards + """ + reward = 0 + returns = [] + + for r in rewards[::-1]: + reward = r + self.gamma * reward + returns.insert(0, reward) + + returns = torch.tensor(returns) + returns = (returns - returns.mean()) / (returns.std() + self.eps) + + return returns + + def loss(self, states, actions, scaled_rewards) -> torch.Tensor: + """ + Calculates the loss for VPG + + Args: + states: batched states + actions: batch actions + scaled_rewards: batche Q values + + Returns: + loss for the current batch + """ + + logits = self.net(states) + + # policy loss + log_prob = log_softmax(logits, dim=1) + log_prob_actions = scaled_rewards * log_prob[range(self.batch_size), actions[0]] + policy_loss = -log_prob_actions.mean() + + # entropy loss + prob = softmax(logits, dim=1) + entropy = -(prob * log_prob).sum(dim=1).mean() + entropy_loss = -self.entropy_beta * entropy + + # total loss + loss = policy_loss + entropy_loss + + return loss + + def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], _) -> OrderedDict: + """ + Carries out a single step through the environment to update the replay buffer. 
+ Then calculates loss based on the minibatch recieved + Args: + batch: current mini batch of replay data + _: batch number, not used + Returns: + Training loss and log metrics + """ + states, actions, scaled_rewards = batch + + loss = self.loss(states, actions, scaled_rewards) + + log = { + "episodes": self.done_episodes, + "reward": self.total_rewards[-1], + "avg_reward": self.avg_rewards, + } + return OrderedDict( + { + "loss": loss, + "avg_reward": self.avg_rewards, + "log": log, + "progress_bar": log, + } + ) + + def configure_optimizers(self) -> List[Optimizer]: + """ Initialize Adam optimizer""" + optimizer = optim.Adam(self.net.parameters(), lr=self.lr) + return [optimizer] + + def _dataloader(self) -> DataLoader: + """Initialize the Replay Buffer dataset used for retrieving experiences""" + dataset = ExperienceSourceDataset(self.train_batch) + dataloader = DataLoader(dataset=dataset, batch_size=self.batch_size) + return dataloader + + def train_dataloader(self) -> DataLoader: + """Get train loader""" + return self._dataloader() + + def get_device(self, batch) -> str: + """Retrieve device currently being used by minibatch""" + return batch[0][0][0].device.index if self.on_gpu else "cpu" + + @staticmethod + def add_model_specific_args(arg_parser) -> argparse.ArgumentParser: + """ + Adds arguments for DQN model + Note: these params are fine tuned for Pong env + Args: + arg_parser: the current argument parser to add to + Returns: + arg_parser with model specific cargs added + """ + + arg_parser.add_argument("--entropy_beta", type=float, default=0.01, help="entropy value") + arg_parser.add_argument("--batches_per_epoch", type=int, default=10000, help="number of batches in an epoch") + arg_parser.add_argument("--batch_size", type=int, default=32, help="size of the batches") + arg_parser.add_argument("--lr", type=float, default=1e-3, help="learning rate") + arg_parser.add_argument("--env", type=str, required=True, help="gym environment tag") + arg_parser.add_argument("--gamma", type=float, default=0.99, help="discount factor") + arg_parser.add_argument("--seed", type=int, default=123, help="seed for training run") + + arg_parser.add_argument( + "--avg_reward_len", + type=int, + default=100, + help="how many episodes to include in avg reward", + ) + + return arg_parser + + +def cli_main(): + parser = argparse.ArgumentParser(add_help=False) + + # trainer args + parser = pl.Trainer.add_argparse_args(parser) + + # model args + parser = VanillaPolicyGradient.add_model_specific_args(parser) + args = parser.parse_args() + + model = VanillaPolicyGradient(**args.__dict__) + + # save checkpoints based on avg_reward + checkpoint_callback = ModelCheckpoint( + save_top_k=1, monitor="avg_reward", mode="max", period=1, verbose=True + ) + + seed_everything(123) + trainer = pl.Trainer.from_argparse_args( + args, deterministic=True, checkpoint_callback=checkpoint_callback + ) + trainer.fit(model) + + +if __name__ == '__main__': + cli_main() diff --git a/pl_bolts/models/self_supervised/byol/byol_module.py b/pl_bolts/models/self_supervised/byol/byol_module.py index b6fb83cf52..95c68bbee7 100644 --- a/pl_bolts/models/self_supervised/byol/byol_module.py +++ b/pl_bolts/models/self_supervised/byol/byol_module.py @@ -29,7 +29,7 @@ def __init__(self, PyTorch Lightning implementation of `Bootstrap Your Own Latent (BYOL) `_ - Paper authors: Jean-Bastien Grill ,Florian Strub, Florent Altché, Corentin Tallec, Pierre H. 
Richemond, \ + Paper authors: Jean-Bastien Grill, Florian Strub, Florent Altché, Corentin Tallec, Pierre H. Richemond, \ Elena Buchatskaya, Carl Doersch, Bernardo Avila Pires, Zhaohan Daniel Guo, Mohammad Gheshlaghi Azar, \ Bilal Piot, Koray Kavukcuoglu, Rémi Munos, Michal Valko. @@ -136,19 +136,17 @@ def training_step(self, batch, batch_idx): loss_a, loss_b, total_loss = self.shared_step(batch, batch_idx) # log results - result = pl.TrainResult(minimize=total_loss) - result.log_dict({'1_2_loss': loss_a, '2_1_loss': loss_b, 'train_loss': total_loss}) + self.log_dict({'1_2_loss': loss_a, '2_1_loss': loss_b, 'train_loss': total_loss}) - return result + return total_loss def validation_step(self, batch, batch_idx): loss_a, loss_b, total_loss = self.shared_step(batch, batch_idx) # log results - result = pl.EvalResult(early_stop_on=total_loss, checkpoint_on=total_loss) - result.log_dict({'1_2_loss': loss_a, '2_1_loss': loss_b, 'train_loss': total_loss}) + self.log_dict({'1_2_loss': loss_a, '2_1_loss': loss_b, 'train_loss': total_loss}) - return result + return total_loss def configure_optimizers(self): optimizer = Adam(self.parameters(), lr=self.hparams.learning_rate, weight_decay=self.hparams.weight_decay) diff --git a/pl_bolts/models/self_supervised/cpc/cpc_module.py b/pl_bolts/models/self_supervised/cpc/cpc_module.py index f4eff40b00..4cdaa74025 100644 --- a/pl_bolts/models/self_supervised/cpc/cpc_module.py +++ b/pl_bolts/models/self_supervised/cpc/cpc_module.py @@ -35,7 +35,7 @@ class CPCV2(pl.LightningModule): def __init__( self, datamodule: pl.LightningDataModule = None, - encoder: Union[str, torch.nn.Module, pl.LightningModule] = 'cpc_encoder', + encoder_name: str = 'cpc_encoder', patch_size: int = 8, patch_overlap: int = 4, online_ft: int = True, @@ -50,7 +50,7 @@ def __init__( """ Args: datamodule: A Datamodule (optional). Otherwise set the dataloaders directly - encoder: A string for any of the resnets in torchvision, or the original CPC encoder, + encoder_name: A string for any of the resnets in torchvision, or the original CPC encoder, or a custon nn.Module encoder patch_size: How big to make the image patches patch_overlap: How much overlap should each patch have. @@ -66,28 +66,20 @@ def __init__( super().__init__() self.save_hyperparameters() + # HACK - datamodule not pickleable so we remove it from hparams. + # TODO - remove datamodule from init. data should be decoupled from models. 
+ del self.hparams['datamodule'] + self.online_evaluator = self.hparams.online_ft if pretrained: self.hparams.dataset = pretrained self.online_evaluator = True - # link data - # if datamodule is None: - # datamodule = CIFAR10DataModule( - # self.hparams.data_dir, - # num_workers=self.hparams.num_workers, - # batch_size=batch_size - # ) - # datamodule.train_transforms = CPCTrainTransformsCIFAR10() - # datamodule.val_transforms = CPCEvalTransformsCIFAR10() assert datamodule self.datamodule = datamodule - # init encoder - self.encoder = encoder - if isinstance(encoder, str): - self.encoder = self.init_encoder() + self.encoder = self.init_encoder() # info nce loss c, h = self.__compute_final_nb_c(self.hparams.patch_size) @@ -97,20 +89,22 @@ def __init__( self.num_classes = self.datamodule.num_classes if pretrained: - self.load_pretrained(encoder) + self.load_pretrained(self.hparams.encoder_name) + + print(self.hparams) - def load_pretrained(self, encoder): + def load_pretrained(self, encoder_name): available_weights = {'resnet18'} - if encoder in available_weights: - load_pretrained(self, f'CPCV2-{encoder}') - elif available_weights not in available_weights: - rank_zero_warn(f'{encoder} not yet available') + if encoder_name in available_weights: + load_pretrained(self, f'CPCV2-{encoder_name}') + elif encoder_name not in available_weights: + rank_zero_warn(f'{encoder_name} not yet available') def init_encoder(self): dummy_batch = torch.zeros((2, 3, self.hparams.patch_size, self.hparams.patch_size)) - encoder_name = self.hparams.encoder + encoder_name = self.hparams.encoder_name if encoder_name == 'cpc_encoder': return cpc_resnet101(dummy_batch) else: @@ -160,18 +154,16 @@ def training_step(self, batch, batch_nb): nce_loss = self.shared_step(batch) # result - result = pl.TrainResult(nce_loss) - result.log('train_nce_loss', nce_loss) - return result + self.log('train_nce_loss', nce_loss) + return nce_loss def validation_step(self, batch, batch_nb): # calculate loss nce_loss = self.shared_step(batch) # result - result = pl.EvalResult(checkpoint_on=nce_loss) - result.log('val_nce', nce_loss, prog_bar=True) - return result + self.log('val_nce', nce_loss, prog_bar=True) + return nce_loss def shared_step(self, batch): try: diff --git a/pl_bolts/models/self_supervised/simclr/simclr_module.py b/pl_bolts/models/self_supervised/simclr/simclr_module.py index 7fbe562827..582883991a 100644 --- a/pl_bolts/models/self_supervised/simclr/simclr_module.py +++ b/pl_bolts/models/self_supervised/simclr/simclr_module.py @@ -157,16 +157,14 @@ def forward(self, x): def training_step(self, batch, batch_idx): loss = self.shared_step(batch, batch_idx) - result = pl.TrainResult(minimize=loss) - result.log('train_loss', loss, on_epoch=True) - return result + self.log('train_loss', loss, on_epoch=True) + return loss def validation_step(self, batch, batch_idx): loss = self.shared_step(batch, batch_idx) - result = pl.EvalResult(checkpoint_on=loss) - result.log('avg_val_loss', loss) - return result + self.log('avg_val_loss', loss) + return loss def shared_step(self, batch, batch_idx): (img1, img2), y = batch diff --git a/pl_bolts/models/self_supervised/ssl_finetuner.py b/pl_bolts/models/self_supervised/ssl_finetuner.py index d3a3e95377..f07e697a42 100644 --- a/pl_bolts/models/self_supervised/ssl_finetuner.py +++ b/pl_bolts/models/self_supervised/ssl_finetuner.py @@ -59,21 +59,18 @@ def on_train_epoch_start(self) -> None: def training_step(self, batch, batch_idx): loss, acc = self.shared_step(batch) - result = 
pl.TrainResult(loss) - result.log('train_acc', acc, prog_bar=True) - return result + self.log('train_acc', acc, prog_bar=True) + return loss def validation_step(self, batch, batch_idx): loss, acc = self.shared_step(batch) - result = pl.EvalResult(checkpoint_on=loss, early_stop_on=loss) - result.log_dict({'val_acc': acc, 'val_loss': loss}, prog_bar=True) - return result + self.log_dict({'val_acc': acc, 'val_loss': loss}, prog_bar=True) + return loss def test_step(self, batch, batch_idx): loss, acc = self.shared_step(batch) - result = pl.EvalResult() - result.log_dict({'test_acc': acc, 'test_loss': loss}) - return result + self.log_dict({'test_acc': acc, 'test_loss': loss}) + return loss def shared_step(self, batch): x, y = batch diff --git a/pl_bolts/models/vision/__init__.py b/pl_bolts/models/vision/__init__.py index 8d4ec5084e..e525036d34 100644 --- a/pl_bolts/models/vision/__init__.py +++ b/pl_bolts/models/vision/__init__.py @@ -1,2 +1,2 @@ from pl_bolts.models.vision.pixel_cnn import PixelCNN -from pl_bolts.models.vision.unet import UNet \ No newline at end of file +from pl_bolts.models.vision.unet import UNet diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000..c434a7c377 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +pytorch-lightning>=0.10.0 +torch>=1.6 \ No newline at end of file diff --git a/requirements/base.txt b/requirements/base.txt deleted file mode 100644 index 62766e3de2..0000000000 --- a/requirements/base.txt +++ /dev/null @@ -1,2 +0,0 @@ -pytorch-lightning>=0.9.1rc3 -torch>=1.6 \ No newline at end of file diff --git a/requirements/devel.txt b/requirements/devel.txt index 53b6b26d05..3574b167e4 100644 --- a/requirements/devel.txt +++ b/requirements/devel.txt @@ -1,5 +1,5 @@ # install all mandatory dependencies --r ./base.txt +-r ../requirements.txt # install all extra dependencies for full package experience -r ./models.txt diff --git a/requirements/models.txt b/requirements/models.txt index 174ab691fc..a92a7ef6bd 100644 --- a/requirements/models.txt +++ b/requirements/models.txt @@ -1,4 +1,5 @@ torchvision>=0.7 scikit-learn>=0.23 Pillow -opencv-python \ No newline at end of file +opencv-python +gym>=0.17.2 # needed for RL \ No newline at end of file diff --git a/requirements/test.txt b/requirements/test.txt index 70b0ce4600..c97d36fc50 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -8,5 +8,4 @@ flake8-black check-manifest twine==1.13.0 -# atari-py==0.2.6 # needed for RL -# gym>=0.17.2 # needed for RL \ No newline at end of file +atari-py==0.2.6 # needed for RL \ No newline at end of file diff --git a/setup.py b/setup.py index 456c6b9153..29931f94ff 100755 --- a/setup.py +++ b/setup.py @@ -19,8 +19,8 @@ import pl_bolts # noqa: E402 -def load_requirements(path_dir=PATH_ROOT, file_name='base.txt', comment_char='#'): - with open(os.path.join(path_dir, 'requirements', file_name), 'r') as file: +def load_requirements(path_dir=PATH_ROOT, file_name='requirements.txt', comment_char='#'): + with open(os.path.join(path_dir, file_name), 'r') as file: lines = [ln.strip() for ln in file.readlines()] reqs = [] for ln in lines: @@ -45,9 +45,9 @@ def load_long_describtion(): extras = { - 'loggers': load_requirements(file_name='loggers.txt'), - 'models': load_requirements(file_name='models.txt'), - 'test': load_requirements(file_name='test.txt'), + 'loggers': load_requirements(path_dir=os.path.join(PATH_ROOT, 'requirements'), file_name='loggers.txt'), + 'models': load_requirements(path_dir=os.path.join(PATH_ROOT, 'requirements'), 
file_name='models.txt'), + 'test': load_requirements(path_dir=os.path.join(PATH_ROOT, 'requirements'), file_name='test.txt'), } extras['extra'] = extras['models'] + extras['loggers'] extras['dev'] = extras['extra'] + extras['test'] diff --git a/tests/datamodules/test_experience_sources.py b/tests/datamodules/test_experience_sources.py new file mode 100644 index 0000000000..737a1c7150 --- /dev/null +++ b/tests/datamodules/test_experience_sources.py @@ -0,0 +1,321 @@ +from unittest import TestCase +from unittest.mock import Mock + +import gym +import numpy as np +import torch +from torch.utils.data import DataLoader + +from pl_bolts.datamodules.experience_source import ( + BaseExperienceSource, + ExperienceSource, + ExperienceSourceDataset, + Experience, + DiscountedExperienceSource, +) +from pl_bolts.models.rl.common.agents import Agent + + +class DummyAgent(Agent): + def __call__(self, states, device): + return [0] * len(states) + + +class DummyExperienceSource(BaseExperienceSource): + def __iter__(self): + yield torch.ones(3) + + +class TestExperienceSourceDataset(TestCase): + def train_batch(self): + """Returns an iterator used for testing""" + return iter([i for i in range(100)]) + + def test_iterator(self): + """Tests that the iterator returns batches correctly""" + source = ExperienceSourceDataset(self.train_batch) + batch_size = 10 + data_loader = DataLoader(source, batch_size=batch_size) + + for idx, batch in enumerate(data_loader): + self.assertEqual(len(batch), batch_size) + self.assertEqual(batch[0], 0) + self.assertEqual(batch[5], 5) + break + + +class TestBaseExperienceSource(TestCase): + def setUp(self) -> None: + self.net = Mock() + self.agent = DummyAgent(net=self.net) + self.env = gym.make("CartPole-v0") + self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + self.source = DummyExperienceSource(self.env, self.agent) + self.s1 = torch.ones(3) + self.s2 = torch.zeros(3) + + def test_dummy_base_class(self): + """Tests that base class is initialized correctly""" + self.assertTrue(isinstance(self.source.env, gym.Env)) + self.assertTrue(isinstance(self.source.agent, Agent)) + out = next(iter(self.source)) + self.assertTrue(torch.all(out.eq(torch.ones(3)))) + + +class TestExperienceSource(TestCase): + def setUp(self) -> None: + self.net = Mock() + self.agent = DummyAgent(net=self.net) + self.env = [gym.make("CartPole-v0") for _ in range(2)] + self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + self.source = ExperienceSource(self.env, self.agent, n_steps=1) + + self.s1 = torch.ones(3) + self.s2 = torch.zeros(3) + + self.mock_env = Mock() + self.mock_env.step = Mock(return_value=(self.s1, 1, False, Mock())) + + self.exp1 = Experience(state=self.s1, action=1, reward=1, done=False, new_state=self.s2) + self.exp2 = Experience(state=self.s1, action=1, reward=1, done=False, new_state=self.s2) + + def test_init_source(self): + """Test that experience source is setup correctly""" + self.assertEqual(self.source.n_steps, 1) + self.assertIsInstance(self.source.pool, list) + + self.assertEqual(len(self.source.states), len(self.source.pool)) + self.assertEqual(len(self.source.histories), len(self.source.pool)) + self.assertEqual(len(self.source.cur_rewards), len(self.source.pool)) + self.assertEqual(len(self.source.cur_steps), len(self.source.pool)) + + def test_init_single_env(self): + """Test that if a single env is passed that it is wrapped in a list""" + self.source = ExperienceSource(self.mock_env, self.agent) + 
self.assertIsInstance(self.source.pool, list) + + def test_env_actions(self): + """Assert that a list of actions of shape [num_envs, action_len] is returned""" + actions = self.source.env_actions(self.device) + self.assertEqual(len(actions), len(self.env)) + self.assertTrue(isinstance(actions[0], list)) + + def test_env_step(self): + """Assert that taking a step through a single environment yields a list of history steps""" + actions = [[1], [1]] + env = self.env[0] + exp = self.source.env_step(0, env, actions[0]) + + self.assertTrue(isinstance(exp, Experience)) + + def test_source_next_single_env_single_step(self): + """Test that steps are executed correctly with one environment and 1 step""" + + self.env = [gym.make("CartPole-v0") for _ in range(1)] + self.source = ExperienceSource(self.env, self.agent, n_steps=1) + + for idx, exp in enumerate(self.source.runner(self.device)): + self.assertTrue(isinstance(exp, tuple)) + break + + def test_source_next_single_env_multi_step(self): + """Test that steps are executed correctly with one environment and 2 step""" + + self.env = [gym.make("CartPole-v0") for _ in range(1)] + n_steps = 4 + self.source = ExperienceSource(self.env, self.agent, n_steps=n_steps) + + for idx, exp in enumerate(self.source.runner(self.device)): + self.assertTrue(isinstance(exp, tuple)) + self.assertTrue(len(exp) == n_steps) + break + + def test_source_next_multi_env_single_step(self): + """Test that steps are executed correctly with 2 environment and 1 step""" + + for idx, exp in enumerate(self.source.runner(self.device)): + self.assertTrue(isinstance(exp, tuple)) + self.assertTrue(len(exp) == self.source.n_steps) + break + + def test_source_next_multi_env_multi_step(self): + """Test that steps are executed correctly with 2 environment and 2 step""" + self.source = ExperienceSource(self.env, self.agent, n_steps=2) + + for idx, exp in enumerate(self.source.runner(self.device)): + self.assertTrue(isinstance(exp, tuple)) + self.assertTrue(len(exp) == self.source.n_steps) + break + + def test_source_update_state(self): + """Test that after a step the state is updated""" + + self.env = [gym.make("CartPole-v0") for _ in range(1)] + self.source = ExperienceSource(self.env, self.agent, n_steps=2) + + for idx, exp in enumerate(self.source.runner(self.device)): + self.assertTrue(isinstance(exp, tuple)) + new = np.asarray(exp[-1].new_state) + old = np.asarray(self.source.states[0]) + self.assertTrue(np.array_equal(new, old)) + break + + def test_source_is_done_short_episode(self): + """Test that when done and the history is not full, to return the partial history""" + + self.mock_env.step = Mock(return_value=(self.s1, 1, True, Mock)) + + env = [self.mock_env for _ in range(1)] + self.source = ExperienceSource(env, self.agent, n_steps=2) + + for idx, exp in enumerate(self.source.runner(self.device)): + self.assertTrue(isinstance(exp, tuple)) + self.assertTrue(len(exp) == 1) + break + + def test_source_is_done_2step_episode(self): + """ + Test that when done and the history is full, return the full history, then start to return the tail of + the history + """ + + self.env = [self.mock_env] + self.source = ExperienceSource(self.env, self.agent, n_steps=2) + + self.mock_env.step = Mock(return_value=(self.s1, 1, True, Mock)) + + self.source.histories[0].append(self.exp1) + + for idx, exp in enumerate(self.source.runner(self.device)): + + self.assertTrue(isinstance(exp, tuple)) + + if idx == 0: + self.assertTrue(len(exp) == self.source.n_steps) + elif idx == 1: + self.assertTrue(len(exp) 
== self.source.n_steps - 1) + self.assertTrue(torch.equal(exp[0].new_state, self.s1)) + + break + + def test_source_is_done_metrics(self): + """Test that when done and the history is full, return the full history""" + + n_steps = 3 + n_envs = 2 + + self.mock_env.step = Mock(return_value=(self.s1, 1, True, Mock)) + + self.env = [self.mock_env for _ in range(2)] + self.source = ExperienceSource(self.env, self.agent, n_steps=3) + + history = self.source.histories[0] + history += [self.exp1, self.exp2, self.exp2] + + for idx, exp in enumerate(self.source.runner(self.device)): + + if idx == n_steps - 1: + self.assertEqual(self.source._total_rewards[0], 1) + self.assertEqual(self.source.total_steps[0], 1) + self.assertEqual(self.source.cur_rewards[0], 0) + self.assertEqual(self.source.cur_steps[0], 0) + elif idx == (3 * n_envs) - 1: + self.assertEqual(self.source.iter_idx, 1) + break + + def test_pop_total_rewards(self): + """Test that pop rewards returns correct rewards""" + self.source._total_rewards = [10, 20, 30] + + rewards = self.source.pop_total_rewards() + + self.assertEqual(rewards, [10, 20, 30]) + + +class TestDiscountedExperienceSource(TestCase): + def setUp(self) -> None: + self.net = Mock() + self.agent = DummyAgent(net=self.net) + self.env = [gym.make("CartPole-v0") for _ in range(2)] + self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + self.n_steps = 3 + self.gamma = 0.9 + self.source = DiscountedExperienceSource( + self.env, self.agent, n_steps=self.n_steps, gamma=self.gamma + ) + + self.state = torch.ones(3) + self.next_state = torch.zeros(3) + self.reward = 1 + + self.exp1 = Experience( + state=self.state, + action=1, + reward=self.reward, + done=False, + new_state=self.next_state, + ) + self.exp2 = Experience( + state=self.next_state, + action=1, + reward=self.reward, + done=False, + new_state=self.state, + ) + + self.env1 = Mock() + self.env1.step = Mock( + return_value=(self.next_state, self.reward, True, self.state) + ) + + def test_init(self): + """Test that experience source is setup correctly""" + self.assertEqual(self.source.n_steps, self.n_steps + 1) + self.assertEqual(self.source.steps, self.n_steps) + self.assertEqual(self.source.gamma, self.gamma) + + def test_source_step(self): + """Tests that the source returns a single experience""" + + for idx, exp in enumerate(self.source.runner(self.device)): + self.assertTrue(isinstance(exp, Experience)) + break + + def test_source_step_done(self): + """Tests that the source returns a single experience""" + + self.source = DiscountedExperienceSource( + self.env1, self.agent, n_steps=self.n_steps + ) + + self.source.histories[0].append(self.exp1) + self.source.histories[0].append(self.exp2) + + for idx, exp in enumerate(self.source.runner(self.device)): + self.assertTrue(isinstance(exp, Experience)) + self.assertTrue(torch.all(torch.eq(exp.new_state, self.next_state))) + break + + def test_source_discounted_return(self): + """ + Tests that the source returns a single experience with discounted rewards + + discounted returns: G(t) = R(t+1) + γ*R(t+2) + γ^2*R(t+3) ... 
+ γ^N-1*R(t+N) + """ + + self.source = DiscountedExperienceSource( + self.env1, self.agent, n_steps=self.n_steps + ) + + self.source.histories[0] += [self.exp1, self.exp2] + + discounted_reward = ( + self.exp1.reward + + (self.source.gamma * self.exp2.reward) + + (self.source.gamma * self.reward) ** 2 + ) + + for idx, exp in enumerate(self.source.runner(self.device)): + self.assertTrue(isinstance(exp, Experience)) + self.assertEqual(exp.reward, discounted_reward) + break diff --git a/tests/datasets/__init__.py b/tests/datasets/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/datasets/test_datasets.py b/tests/datasets/test_datasets.py new file mode 100644 index 0000000000..c7adda3cda --- /dev/null +++ b/tests/datasets/test_datasets.py @@ -0,0 +1,34 @@ +from pl_bolts.datasets import DummyDataset, RandomDataset, RandomDictDataset, RandomDictStringDataset +from torch.utils.data import DataLoader + + +def test_dummy_ds(tmpdir): + ds = DummyDataset((1, 2), num_samples=100) + dl = DataLoader(ds) + + for b in dl: + pass + + +def test_rand_ds(tmpdir): + ds = RandomDataset(32, num_samples=100) + dl = DataLoader(ds) + + for b in dl: + pass + + +def test_rand_dict_ds(tmpdir): + ds = RandomDictDataset(32, num_samples=100) + dl = DataLoader(ds) + + for b in dl: + pass + + +def test_rand_str_dict_ds(tmpdir): + ds = RandomDictStringDataset(32, num_samples=100) + dl = DataLoader(ds) + + for b in dl: + pass diff --git a/tests/losses/test_rl_loss.py b/tests/losses/test_rl_loss.py new file mode 100644 index 0000000000..e02965f84c --- /dev/null +++ b/tests/losses/test_rl_loss.py @@ -0,0 +1,51 @@ +""" +Test RL Loss Functions +""" + +from unittest import TestCase + +import numpy as np +import torch + +from pl_bolts.losses.rl import dqn_loss, double_dqn_loss, per_dqn_loss +from pl_bolts.models.rl.common.networks import CNN +from pl_bolts.models.rl.common.gym_wrappers import make_environment + + +class TestRLLoss(TestCase): + + def setUp(self) -> None: + + self.state = torch.rand(32, 4, 84, 84) + self.next_state = torch.rand(32, 4, 84, 84) + self.action = torch.ones([32]) + self.reward = torch.ones([32]) + self.done = torch.zeros([32]).long() + + self.batch = (self.state, self.action, self.reward, self.done, self.next_state) + + self.env = make_environment("PongNoFrameskip-v4") + self.obs_shape = self.env.observation_space.shape + self.n_actions = self.env.action_space.n + self.net = CNN(self.obs_shape, self.n_actions) + self.target_net = CNN(self.obs_shape, self.n_actions) + + def test_dqn_loss(self): + """Test the dqn loss function""" + + loss = dqn_loss(self.batch, self.net, self.target_net) + self.assertIsInstance(loss, torch.Tensor) + + def test_double_dqn_loss(self): + """Test the double dqn loss function""" + + loss = double_dqn_loss(self.batch, self.net, self.target_net) + self.assertIsInstance(loss, torch.Tensor) + + def test_per_dqn_loss(self): + """Test the double dqn loss function""" + prios = torch.ones([32]) + + loss, batch_weights = per_dqn_loss(self.batch, prios, self.net, self.target_net) + self.assertIsInstance(loss, torch.Tensor) + self.assertIsInstance(batch_weights, np.ndarray) diff --git a/tests/models/rl/__init__.py b/tests/models/rl/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/models/rl/integration/__init__.py b/tests/models/rl/integration/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/models/rl/integration/test_policy_models.py b/tests/models/rl/integration/test_policy_models.py new file 
mode 100644 index 0000000000..3c65af9d2e --- /dev/null +++ b/tests/models/rl/integration/test_policy_models.py @@ -0,0 +1,41 @@ +import argparse +from unittest import TestCase + +import pytorch_lightning as pl + +from pl_bolts.models.rl.reinforce_model import Reinforce +from pl_bolts.models.rl.vanilla_policy_gradient_model import VanillaPolicyGradient + + +class TestPolicyModels(TestCase): + + def setUp(self) -> None: + parent_parser = argparse.ArgumentParser(add_help=False) + parent_parser = VanillaPolicyGradient.add_model_specific_args(parent_parser) + args_list = [ + "--env", "CartPole-v0" + ] + self.hparams = parent_parser.parse_args(args_list) + + self.trainer = pl.Trainer( + gpus=0, + max_steps=100, + max_epochs=100, # Set this as the same as max steps to ensure that it doesn't stop early + val_check_interval=1, # This just needs 'some' value, does not effect training right now + fast_dev_run=True + ) + + def test_reinforce(self): + """Smoke test that the reinforce model runs""" + + model = Reinforce(self.hparams.env) + result = self.trainer.fit(model) + + self.assertEqual(result, 1) + + def test_policy_gradient(self): + """Smoke test that the policy gradient model runs""" + model = VanillaPolicyGradient(self.hparams.env) + result = self.trainer.fit(model) + + self.assertEqual(result, 1) diff --git a/tests/models/rl/integration/test_value_models.py b/tests/models/rl/integration/test_value_models.py new file mode 100644 index 0000000000..f3cbad43ad --- /dev/null +++ b/tests/models/rl/integration/test_value_models.py @@ -0,0 +1,74 @@ +import argparse +from unittest import TestCase + +import pytorch_lightning as pl + +from pl_bolts.models.rl.double_dqn_model import DoubleDQN +from pl_bolts.models.rl.dqn_model import DQN +from pl_bolts.models.rl.dueling_dqn_model import DuelingDQN +from pl_bolts.models.rl.noisy_dqn_model import NoisyDQN +from pl_bolts.models.rl.per_dqn_model import PERDQN + + +class TestValueModels(TestCase): + + def setUp(self) -> None: + parent_parser = argparse.ArgumentParser(add_help=False) + parent_parser = pl.Trainer.add_argparse_args(parent_parser) + parent_parser = DQN.add_model_specific_args(parent_parser) + args_list = [ + "--warm_start_size", "100", + "--gpus", "0", + "--env", "PongNoFrameskip-v4", + ] + self.hparams = parent_parser.parse_args(args_list) + + self.trainer = pl.Trainer( + gpus=self.hparams.gpus, + max_steps=100, + max_epochs=100, # Set this as the same as max steps to ensure that it doesn't stop early + val_check_interval=1, # This just needs 'some' value, does not effect training right now + fast_dev_run=True + ) + + def test_dqn(self): + """Smoke test that the DQN model runs""" + model = DQN(self.hparams.env, num_envs=5) + result = self.trainer.fit(model) + + self.assertEqual(result, 1) + + def test_double_dqn(self): + """Smoke test that the Double DQN model runs""" + model = DoubleDQN(self.hparams.env) + result = self.trainer.fit(model) + + self.assertEqual(result, 1) + + def test_dueling_dqn(self): + """Smoke test that the Dueling DQN model runs""" + model = DuelingDQN(self.hparams.env) + result = self.trainer.fit(model) + + self.assertEqual(result, 1) + + def test_noisy_dqn(self): + """Smoke test that the Noisy DQN model runs""" + model = NoisyDQN(self.hparams.env) + result = self.trainer.fit(model) + + self.assertEqual(result, 1) + + def test_per_dqn(self): + """Smoke test that the PER DQN model runs""" + model = PERDQN(self.hparams.env) + result = self.trainer.fit(model) + + self.assertEqual(result, 1) + + # def test_n_step_dqn(self): + # 
"""Smoke test that the N Step DQN model runs""" + # model = DQN(self.hparams.env, n_steps=self.hparams.n_steps) + # result = self.trainer.fit(model) + # + # self.assertEqual(result, 1) diff --git a/tests/models/rl/test_scripts.py b/tests/models/rl/test_scripts.py new file mode 100644 index 0000000000..af1d703897 --- /dev/null +++ b/tests/models/rl/test_scripts.py @@ -0,0 +1,104 @@ +from unittest import mock + +import pytest + + +@pytest.mark.parametrize('cli_args', ['--env PongNoFrameskip-v4' + ' --max_steps 10' + ' --fast_dev_run' + ' --warm_start_size 10' + ' --n_steps 2' + ' --batch_size 10']) +def test_cli_run_rl_dqn(cli_args): + """Test running CLI for an example with default params.""" + from pl_bolts.models.rl.dqn_model import cli_main + + cli_args = cli_args.split(' ') if cli_args else [] + with mock.patch("argparse._sys.argv", ["any.py"] + cli_args): + cli_main() + + +@pytest.mark.parametrize('cli_args', ['--env PongNoFrameskip-v4' + ' --max_steps 10' + ' --fast_dev_run' + ' --warm_start_size 10' + ' --n_steps 2' + ' --batch_size 10']) +def test_cli_run_rl_double_dqn(cli_args): + """Test running CLI for an example with default params.""" + from pl_bolts.models.rl.double_dqn_model import cli_main + + cli_args = cli_args.split(' ') if cli_args else [] + with mock.patch("argparse._sys.argv", ["any.py"] + cli_args): + cli_main() + + +@pytest.mark.parametrize('cli_args', ['--env PongNoFrameskip-v4' + ' --max_steps 10' + ' --fast_dev_run' + ' --warm_start_size 10' + ' --n_steps 2' + ' --batch_size 10']) +def test_cli_run_rl_dueling_dqn(cli_args): + """Test running CLI for an example with default params.""" + from pl_bolts.models.rl.dueling_dqn_model import cli_main + + cli_args = cli_args.split(' ') if cli_args else [] + with mock.patch("argparse._sys.argv", ["any.py"] + cli_args): + cli_main() + + +@pytest.mark.parametrize('cli_args', ['--env PongNoFrameskip-v4' + ' --max_steps 10' + ' --fast_dev_run' + ' --warm_start_size 10' + ' --n_steps 2' + ' --batch_size 10']) +def test_cli_run_rl_noisy_dqn(cli_args): + """Test running CLI for an example with default params.""" + from pl_bolts.models.rl.noisy_dqn_model import cli_main + + cli_args = cli_args.split(' ') if cli_args else [] + with mock.patch("argparse._sys.argv", ["any.py"] + cli_args): + cli_main() + + +@pytest.mark.parametrize('cli_args', ['--env PongNoFrameskip-v4' + ' --max_steps 10' + ' --fast_dev_run' + ' --warm_start_size 10' + ' --n_steps 2' + ' --batch_size 10']) +def test_cli_run_rl_per_dqn(cli_args): + """Test running CLI for an example with default params.""" + from pl_bolts.models.rl.per_dqn_model import cli_main + + cli_args = cli_args.split(' ') if cli_args else [] + with mock.patch("argparse._sys.argv", ["any.py"] + cli_args): + cli_main() + + +@pytest.mark.parametrize('cli_args', ['--env CartPole-v0' + ' --max_steps 10' + ' --fast_dev_run' + ' --batch_size 10']) +def test_cli_run_rl_reinforce(cli_args): + """Test running CLI for an example with default params.""" + from pl_bolts.models.rl.reinforce_model import cli_main + + cli_args = cli_args.split(' ') if cli_args else [] + with mock.patch("argparse._sys.argv", ["any.py"] + cli_args): + cli_main() + + +@pytest.mark.parametrize('cli_args', ['--env CartPole-v0' + ' --max_steps 10' + ' --fast_dev_run' + ' --batch_size 10']) +def test_cli_run_rl_vanilla_policy_gradient(cli_args): + """Test running CLI for an example with default params.""" + from pl_bolts.models.rl.vanilla_policy_gradient_model import cli_main + + cli_args = cli_args.split(' ') if cli_args else [] + 
with mock.patch("argparse._sys.argv", ["any.py"] + cli_args): + cli_main() diff --git a/tests/models/rl/unit/__init__.py b/tests/models/rl/unit/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/models/rl/unit/test_agents.py b/tests/models/rl/unit/test_agents.py new file mode 100644 index 0000000000..5d4214e59b --- /dev/null +++ b/tests/models/rl/unit/test_agents.py @@ -0,0 +1,62 @@ +"""Tests that the agent module works correctly""" +from unittest import TestCase +from unittest.mock import Mock + +import gym +import numpy as np +import torch + +from pl_bolts.models.rl.common.agents import Agent, PolicyAgent, ValueAgent + + +class TestAgents(TestCase): + + def setUp(self) -> None: + self.env = gym.make("CartPole-v0") + self.state = self.env.reset() + self.net = Mock() + + def test_base_agent(self): + agent = Agent(self.net) + action = agent(self.state, 'cuda:0') + self.assertIsInstance(action, list) + + +class TestValueAgent(TestCase): + + def setUp(self) -> None: + self.env = gym.make("CartPole-v0") + self.net = Mock(return_value=torch.Tensor([[0.0, 100.0]])) + self.state = [self.env.reset()] + self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + self.value_agent = ValueAgent(self.net, self.env.action_space.n) + + def test_value_agent(self): + + action = self.value_agent(self.state, self.device) + self.assertIsInstance(action, list) + self.assertIsInstance(action[0], int) + + def test_value_agent_get_action(self): + action = self.value_agent.get_action(self.state, self.device) + self.assertIsInstance(action, np.ndarray) + self.assertEqual(action[0], 1) + + def test_value_agent_random(self): + action = self.value_agent.get_random_action(self.state) + self.assertIsInstance(action[0], int) + + +class TestPolicyAgent(TestCase): + + def setUp(self) -> None: + self.env = gym.make("CartPole-v0") + self.net = Mock(return_value=torch.Tensor([[0.0, 100.0]])) + self.states = [self.env.reset()] + self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + + def test_policy_agent(self): + policy_agent = PolicyAgent(self.net) + action = policy_agent(self.states, self.device) + self.assertIsInstance(action, list) + self.assertEqual(action[0], 1) diff --git a/tests/models/rl/unit/test_memory.py b/tests/models/rl/unit/test_memory.py new file mode 100644 index 0000000000..12b89b232e --- /dev/null +++ b/tests/models/rl/unit/test_memory.py @@ -0,0 +1,286 @@ +from unittest import TestCase +from unittest.mock import Mock + +import numpy as np +import torch + +from pl_bolts.models.rl.common.memory import ReplayBuffer, Experience, PERBuffer, MultiStepBuffer, Buffer + + +class TestBuffer(TestCase): + + def train_batch(self): + """Returns an iterator used for testing""" + return iter([i for i in range(100)]) + + def setUp(self) -> None: + self.state = np.random.rand(4, 84, 84) + self.next_state = np.random.rand(4, 84, 84) + self.action = np.ones([1]) + self.reward = np.ones([1]) + self.done = np.zeros([1]) + self.experience = Experience(self.state, self.action, self.reward, self.done, self.next_state) + self.source = Mock() + self.source.step = Mock(return_value=(self.experience, torch.tensor(0), False)) + self.batch_size = 8 + self.buffer = Buffer(8) + + for _ in range(self.batch_size): + self.buffer.append(self.experience) + + def test_sample_batch(self): + """check that a sinlge sample is returned""" + sample = self.buffer.sample() + self.assertEqual(len(sample), 5) + self.assertEqual(sample[0].shape, (self.batch_size, 4, 84, 84)) + 
self.assertEqual(sample[1].shape, (self.batch_size, 1)) + self.assertEqual(sample[2].shape, (self.batch_size, 1)) + self.assertEqual(sample[3].shape, (self.batch_size, 1)) + self.assertEqual(sample[4].shape, (self.batch_size, 4, 84, 84)) + + +class TestReplayBuffer(TestCase): + + def setUp(self) -> None: + self.state = np.random.rand(32, 32) + self.next_state = np.random.rand(32, 32) + self.action = np.ones([1]) + self.reward = np.ones([1]) + self.done = np.zeros([1]) + self.experience = Experience(self.state, self.action, self.reward, self.done, self.next_state) + + self.source = Mock() + self.source.step = Mock(return_value=(self.experience, torch.tensor(0), False)) + self.warm_start = 10 + self.buffer = ReplayBuffer(20) + for _ in range(self.warm_start): + self.buffer.append(self.experience) + + def test_replay_buffer_append(self): + """Test that you can append to the replay buffer""" + + self.assertEqual(len(self.buffer), self.warm_start) + + self.buffer.append(self.experience) + + self.assertEqual(len(self.buffer), self.warm_start + 1) + + def test_replay_buffer_populate(self): + """Tests that the buffer is populated correctly with warm_start""" + self.assertEqual(len(self.buffer.buffer), self.warm_start) + + def test_replay_buffer_update(self): + """Tests that buffer append works correctly""" + batch_size = 3 + self.assertEqual(len(self.buffer.buffer), self.warm_start) + for i in range(batch_size): + self.buffer.append(self.experience) + self.assertEqual(len(self.buffer.buffer), self.warm_start + batch_size) + + def test_replay_buffer_sample(self): + """Test that you can sample from the buffer and the outputs are the correct shape""" + batch_size = 3 + + for i in range(10): + self.buffer.append(self.experience) + + batch = self.buffer.sample(batch_size) + + self.assertEqual(len(batch), 5) + + # states + states = batch[0] + self.assertEqual(states.shape, (batch_size, 32, 32)) + # action + actions = batch[1] + self.assertEqual(actions.shape, (batch_size, 1)) + # reward + rewards = batch[2] + self.assertEqual(rewards.shape, (batch_size, 1)) + # dones + dones = batch[3] + self.assertEqual(dones.shape, (batch_size, 1)) + # next states + next_states = batch[4] + self.assertEqual(next_states.shape, (batch_size, 32, 32)) + + +class TestPrioReplayBuffer(TestCase): + + def setUp(self) -> None: + self.buffer = PERBuffer(10) + + self.state = np.random.rand(32, 32) + self.next_state = np.random.rand(32, 32) + self.action = np.ones([1]) + self.reward = np.ones([1]) + self.done = np.zeros([1]) + self.experience = Experience(self.state, self.action, self.reward, self.done, self.next_state) + + def test_replay_buffer_append(self): + """Test that you can append to the replay buffer and the latest experience has max priority""" + + self.assertEqual(len(self.buffer), 0) + + self.buffer.append(self.experience) + + self.assertEqual(len(self.buffer), 1) + self.assertEqual(self.buffer.priorities[0], 1.0) + + def test_replay_buffer_sample(self): + """Test that you can sample from the buffer and the outputs are the correct shape""" + batch_size = 3 + + for i in range(10): + self.buffer.append(self.experience) + + batch, indices, weights = self.buffer.sample(batch_size) + + self.assertEqual(len(batch), 5) + self.assertEqual(len(indices), batch_size) + self.assertEqual(len(weights), batch_size) + + # states + states = batch[0] + self.assertEqual(states.shape, (batch_size, 32, 32)) + # action + actions = batch[1] + self.assertEqual(actions.shape, (batch_size, 1)) + # reward + rewards = batch[2] + 
self.assertEqual(rewards.shape, (batch_size, 1)) + # dones + dones = batch[3] + self.assertEqual(dones.shape, (batch_size, 1)) + # next states + next_states = batch[4] + self.assertEqual(next_states.shape, (batch_size, 32, 32)) + + +class TestMultiStepReplayBuffer(TestCase): + + def setUp(self) -> None: + self.gamma = 0.9 + self.buffer = MultiStepBuffer(capacity=10, n_steps=2, gamma=self.gamma) + + self.state = np.zeros([32, 32]) + self.state_02 = np.ones([32, 32]) + self.next_state = np.zeros([32, 32]) + self.next_state_02 = np.ones([32, 32]) + self.action = np.zeros([1]) + self.action_02 = np.ones([1]) + self.reward = np.zeros([1]) + self.reward_02 = np.ones([1]) + self.done = np.zeros([1]) + self.done_02 = np.zeros([1]) + + self.experience01 = Experience(self.state, self.action, self.reward, self.done, self.next_state) + self.experience02 = Experience(self.state_02, self.action_02, self.reward_02, self.done_02, self.next_state_02) + self.experience03 = Experience(self.state_02, self.action_02, self.reward_02, self.done_02, self.next_state_02) + + def test_append_single_experience_less_than_n(self): + """ + If a single experience is added and n > 1, nothing should be added to the buffer, as it is waiting for the + number of collected experiences to equal n + """ + self.assertEqual(len(self.buffer), 0) + + self.buffer.append(self.experience01) + + self.assertEqual(len(self.buffer), 0) + + def test_append_single_experience(self): + """ + If a single experience is added and n > 1, the experience should be kept in the short-term history but not + yet added to the full buffer + """ + self.assertEqual(len(self.buffer), 0) + + self.buffer.append(self.experience01) + + self.assertEqual(len(self.buffer.exp_history_queue), 0) + self.assertEqual(len(self.buffer.history), 1) + + def test_append_single_experience2(self): + """ + If a single experience is added and the number of experiences collected >= n, the multi-step experience should + be added to the full buffer.
+ """ + self.assertEqual(len(self.buffer), 0) + + self.buffer.append(self.experience01) + self.buffer.append(self.experience02) + + self.assertEqual(len(self.buffer.buffer), 1) + self.assertEqual(len(self.buffer.history), 2) + + def test_sample_single_experience(self): + """if there is only a single experience added, sample should return nothing""" + self.buffer.append(self.experience01) + + with self.assertRaises(Exception) as context: + _ = self.buffer.sample(batch_size=1) + + self.assertIsInstance(context.exception, Exception) + + def test_sample_multi_experience(self): + """if there is only a single experience added, sample should return nothing""" + self.buffer.append(self.experience01) + self.buffer.append(self.experience02) + + batch = self.buffer.sample(batch_size=1) + + next_state = batch[4] + self.assertEqual(next_state.all(), self.next_state_02.all()) + + def test_get_transition_info_2_step(self): + """Test that the accumulated experience is correct and""" + self.buffer.append(self.experience01) + self.buffer.append(self.experience02) + + reward = self.buffer.buffer[0].reward + next_state = self.buffer.buffer[0].new_state + done = self.buffer.buffer[0].done + + reward_gt = self.experience01.reward + (self.gamma * self.experience02.reward) * (1 - done) + + self.assertEqual(reward, reward_gt) + self.assertEqual(next_state.all(), self.next_state_02.all()) + self.assertEqual(self.experience02.done, done) + + def test_get_transition_info_3_step(self): + """Test that the accumulated experience is correct with multi step""" + self.buffer = MultiStepBuffer(capacity=10, n_steps=3, gamma=self.gamma) + + self.buffer.append(self.experience01) + self.buffer.append(self.experience02) + self.buffer.append(self.experience02) + + reward = self.buffer.buffer[0].reward + next_state = self.buffer.buffer[0].new_state + done = self.buffer.buffer[0].done + + reward_01 = self.experience02.reward + self.gamma * self.experience03.reward * (1 - done) + reward_gt = self.experience01.reward + self.gamma * reward_01 * (1 - done) + + self.assertEqual(reward, reward_gt) + self.assertEqual(next_state.all(), self.next_state_02.all()) + self.assertEqual(self.experience03.done, done) + + def test_sample_3_step(self): + """Test that final output of the 3 step sample is correct""" + self.buffer = MultiStepBuffer(capacity=10, n_steps=3, gamma=self.gamma) + + self.buffer.append(self.experience01) + self.buffer.append(self.experience02) + self.buffer.append(self.experience02) + + reward_gt = 1.71 + + batch = self.buffer.sample(1) + + self.assertEqual(batch[0].all(), self.experience01.state.all()) + self.assertEqual(batch[1], self.experience01.action) + self.assertEqual(batch[2], reward_gt) + self.assertEqual(batch[3], self.experience02.done) + self.assertEqual(batch[4].all(), self.experience02.new_state.all()) diff --git a/tests/models/rl/unit/test_reinforce.py b/tests/models/rl/unit/test_reinforce.py new file mode 100644 index 0000000000..655dc2bd54 --- /dev/null +++ b/tests/models/rl/unit/test_reinforce.py @@ -0,0 +1,65 @@ +import argparse +from unittest import TestCase + +import gym +import numpy as np +import torch + +from pl_bolts.datamodules.experience_source import DiscountedExperienceSource +from pl_bolts.models.rl.common.agents import Agent +from pl_bolts.models.rl.common.networks import MLP +from pl_bolts.models.rl.common.gym_wrappers import ToTensor +from pl_bolts.models.rl.reinforce_model import Reinforce + + +class TestReinforce(TestCase): + + def setUp(self) -> None: + self.env = 
ToTensor(gym.make("CartPole-v0")) + self.obs_shape = self.env.observation_space.shape + self.n_actions = self.env.action_space.n + self.net = MLP(self.obs_shape, self.n_actions) + self.agent = Agent(self.net) + self.exp_source = DiscountedExperienceSource(self.env, self.agent) + + parent_parser = argparse.ArgumentParser(add_help=False) + parent_parser = Reinforce.add_model_specific_args(parent_parser) + args_list = [ + "--env", "CartPole-v0", + "--batch_size", "32", + "--gamma", "0.99" + ] + self.hparams = parent_parser.parse_args(args_list) + self.model = Reinforce(**vars(self.hparams)) + + self.rl_dataloader = self.model.train_dataloader() + + def test_loss(self): + """Test the reinforce loss function""" + + batch_states = torch.rand(32, 4) + batch_actions = torch.rand(32).long() + batch_qvals = torch.rand(32) + + loss = self.model.loss(batch_states, batch_actions, batch_qvals) + + self.assertIsInstance(loss, torch.Tensor) + + def test_get_qvals(self): + """Test that given an batch of episodes that it will return a list of qvals for each episode""" + + batch_qvals = [] + rewards = np.ones(32) + out = self.model.calc_qvals(rewards) + batch_qvals.append(out) + + self.assertIsInstance(batch_qvals[0][0], float) + self.assertEqual(batch_qvals[0][0], (batch_qvals[0][1] * self.hparams.gamma) + 1.0) + + def test_calc_q_vals(self): + rewards = np.ones(4) + gt_qvals = [3.9403989999999998, 2.9701, 1.99, 1.0] + + qvals = self.model.calc_qvals(rewards) + + self.assertEqual(gt_qvals, qvals) diff --git a/tests/models/rl/unit/test_vpg.py b/tests/models/rl/unit/test_vpg.py new file mode 100644 index 0000000000..0cbdb5a7c8 --- /dev/null +++ b/tests/models/rl/unit/test_vpg.py @@ -0,0 +1,56 @@ +import argparse +from unittest import TestCase + +import gym +import torch + +from pl_bolts.models.rl.common.agents import Agent +from pl_bolts.models.rl.common.networks import MLP +from pl_bolts.models.rl.common.gym_wrappers import ToTensor +from pl_bolts.models.rl.vanilla_policy_gradient_model import VanillaPolicyGradient + + +class TestPolicyGradient(TestCase): + + def setUp(self) -> None: + self.env = ToTensor(gym.make("CartPole-v0")) + self.obs_shape = self.env.observation_space.shape + self.n_actions = self.env.action_space.n + self.net = MLP(self.obs_shape, self.n_actions) + self.agent = Agent(self.net) + + parent_parser = argparse.ArgumentParser(add_help=False) + parent_parser = VanillaPolicyGradient.add_model_specific_args(parent_parser) + args_list = [ + "--env", "CartPole-v0", + "--batch_size", "32" + ] + self.hparams = parent_parser.parse_args(args_list) + self.model = VanillaPolicyGradient(**vars(self.hparams)) + + def test_loss(self): + """Test the reinforce loss function""" + + batch_states = torch.rand(32, 4) + batch_actions = torch.rand(32).long() + batch_qvals = torch.rand(32) + + loss = self.model.loss(batch_states, batch_actions, batch_qvals) + + self.assertIsInstance(loss, torch.Tensor) + + def test_train_batch(self): + """Tests that a single batch generates correctly""" + + self.model.n_steps = 4 + self.model.batch_size = 1 + xp_dataloader = self.model.train_dataloader() + + batch = next(iter(xp_dataloader)) + self.assertEqual(len(batch), 3) + self.assertEqual(len(batch[0]), self.model.batch_size) + self.assertTrue(isinstance(batch, list)) + self.assertIsInstance(batch[0], torch.Tensor) + self.assertIsInstance(batch[1], list) + self.assertIsInstance(batch[1][0], torch.Tensor) + self.assertIsInstance(batch[2], torch.Tensor) diff --git a/tests/models/rl/unit/test_wrappers.py 
b/tests/models/rl/unit/test_wrappers.py new file mode 100644 index 0000000000..31e84ada49 --- /dev/null +++ b/tests/models/rl/unit/test_wrappers.py @@ -0,0 +1,19 @@ +from unittest import TestCase + +import gym +import torch + +from pl_bolts.models.rl.common.gym_wrappers import ToTensor + + +class TestToTensor(TestCase): + + def setUp(self) -> None: + self.env = ToTensor(gym.make("CartPole-v0")) + + def test_wrapper(self): + state = self.env.reset() + self.assertIsInstance(state, torch.Tensor) + + new_state, _, _, _ = self.env.step(1) + self.assertIsInstance(new_state, torch.Tensor) diff --git a/tests/models/test_detection.py b/tests/models/test_detection.py index 61edf2e875..a312fbc9d7 100644 --- a/tests/models/test_detection.py +++ b/tests/models/test_detection.py @@ -3,7 +3,7 @@ import torch from torch.utils.data import DataLoader -from pl_bolts.datamodules import DummyDetectionDataset +from pl_bolts.datasets import DummyDetectionDataset from pl_bolts.models.detection import FasterRCNN diff --git a/tests/models/test_mnist_templates.py b/tests/models/test_mnist_templates.py index 7099212cb2..0c8867eb03 100644 --- a/tests/models/test_mnist_templates.py +++ b/tests/models/test_mnist_templates.py @@ -7,11 +7,11 @@ def test_mnist(tmpdir): seed_everything() - model = LitMNIST(data_dir=tmpdir) + model = LitMNIST(data_dir=tmpdir, num_workers=0) trainer = pl.Trainer(limit_train_batches=0.01, limit_val_batches=0.01, max_epochs=1, limit_test_batches=0.01, default_root_dir=tmpdir) trainer.fit(model) trainer.test(model) - loss = trainer.callback_metrics['loss'] + loss = trainer.callback_metrics['train_loss'] - assert loss <= 2.0, 'mnist failed' + assert loss <= 2.2, 'mnist failed' diff --git a/tests/models/test_vision_models.py b/tests/models/test_vision_models.py index 0455a76320..73af207f1a 100644 --- a/tests/models/test_vision_models.py +++ b/tests/models/test_vision_models.py @@ -4,6 +4,7 @@ from pl_bolts.datamodules import MNISTDataModule, FashionMNISTDataModule from pl_bolts.models import GPT2, ImageGPT, UNet + def test_igpt(tmpdir): pl.seed_everything(0) dm = MNISTDataModule(tmpdir, normalize=False) @@ -53,4 +54,3 @@ def test_unet(tmpdir): model = UNet(num_classes=2) y = model(x) assert y.shape == torch.Size([10, 2, 28, 28]) -
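For reference, the expected values asserted in the multi-step buffer and Reinforce q-value tests above follow from the discounted-return formula r_0 + gamma * r_1 + gamma^2 * r_2 + ... The standalone sketch below (helper names are illustrative and not part of the pl_bolts API) reproduces 1.71 from test_sample_3_step and the gt_qvals list from test_calc_q_vals, assuming the buffer and calc_qvals accumulate rewards exactly this way.

# Standalone sketch, not part of the diff: illustrative helpers for the
# discounted-return arithmetic asserted in the tests above.
from typing import List


def n_step_return(rewards: List[float], gamma: float) -> float:
    """Single n-step return over a window of rewards: r_0 + gamma*r_1 + gamma^2*r_2 + ..."""
    return sum((gamma ** step) * reward for step, reward in enumerate(rewards))


def rewards_to_go(rewards: List[float], gamma: float) -> List[float]:
    """Discounted reward-to-go for every step of an episode, computed from the end backwards."""
    returns, running = [], 0.0
    for reward in reversed(rewards):
        running = reward + gamma * running
        returns.append(running)
    return list(reversed(returns))


print(n_step_return([0.0, 1.0, 1.0], gamma=0.9))  # 1.71, as in test_sample_3_step
print(rewards_to_go([1.0] * 4, gamma=0.99))       # [3.940399..., 2.9701, 1.99, 1.0], as in test_calc_q_vals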