diff --git a/.github/workflows/ci_test-base.yml b/.github/workflows/ci_test-base.yml index 285d97b0e5..a6d808a816 100644 --- a/.github/workflows/ci_test-base.yml +++ b/.github/workflows/ci_test-base.yml @@ -41,14 +41,14 @@ jobs: uses: actions/cache@v2 with: path: ${{ steps.pip-cache.outputs.dir }} - key: ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}-pip-${{ hashFiles('requirements/base.txt') }} + key: ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}-pip-${{ hashFiles('requirements.txt') }} restore-keys: | ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}-pip- - name: Install dependencies run: | python -m pip install --upgrade --user pip - pip install --requirement ./requirements/base.txt --quiet --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade + pip install --requirement ./requirements.txt --quiet --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade pip install --requirement ./requirements/test.txt --quiet --upgrade-strategy only-if-needed # pip install tox coverage python --version @@ -66,7 +66,7 @@ jobs: - name: Test Package [only] run: | # NOTE: run coverage on tests does not propagare faler status for Win, https://github.com/nedbat/coveragepy/issues/1003 - coverage run --source pl_bolts -m pytest pl_bolts -v --junitxml=junit/test-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}.xml --ignore=pl_bolts/datamodules --ignore=pl_bolts/models/self_supervised/amdim/transforms.py + coverage run --source pl_bolts -m pytest pl_bolts -v --junitxml=junit/test-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}.xml --ignore=pl_bolts/datamodules --ignore=pl_bolts/models/self_supervised/amdim/transforms.py --ignore=pl_bolts/models/rl - name: Upload pytest test results uses: actions/upload-artifact@master diff --git a/.github/workflows/ci_test-full.yml b/.github/workflows/ci_test-full.yml index f4b8f72d88..ba7a661a69 100644 --- a/.github/workflows/ci_test-full.yml +++ b/.github/workflows/ci_test-full.yml @@ -45,7 +45,7 @@ jobs: - name: Set min. 
dependencies if: matrix.requires == 'minimal' run: | - python -c "fpath = 'requirements/base.txt' ; req = open(fpath).read().replace('>=', '==') ; open(fpath, 'w').write(req)" + python -c "fpath = 'requirements.txt' ; req = open(fpath).read().replace('>=', '==') ; open(fpath, 'w').write(req)" python -c "fpath = 'requirements/models.txt' ; req = open(fpath).read().replace('>=', '==') ; open(fpath, 'w').write(req)" python -c "fpath = 'requirements/loggers.txt' ; req = open(fpath).read().replace('>=', '==') ; open(fpath, 'w').write(req)" python -c "fpath = 'requirements/test.txt' ; req = open(fpath).read().replace('>=', '==') ; open(fpath, 'w').write(req)" @@ -61,7 +61,7 @@ jobs: uses: actions/cache@v2 with: path: ${{ steps.pip-cache.outputs.dir }} - key: ${{ runner.os }}-pip-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ hashFiles('requirements/base.txt') }}-${{ hashFiles('requirements/modules.txt') }} + key: ${{ runner.os }}-pip-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements/modules.txt') }} restore-keys: | ${{ runner.os }}-pip-py${{ matrix.python-version }}-${{ matrix.requires }}- diff --git a/.github/workflows/code-format.yml b/.github/workflows/code-format.yml index 62f19d3001..813ee1b862 100644 --- a/.github/workflows/code-format.yml +++ b/.github/workflows/code-format.yml @@ -23,14 +23,14 @@ jobs: uses: actions/cache@v2 with: path: ~/.cache/pip - key: ${{ runner.os }}-pip-${{ hashFiles('requirements/base.txt') }} + key: ${{ runner.os }}-pip-${{ hashFiles('requirements.txt') }} restore-keys: | ${{ runner.os }}-pip- - name: Install dependencies run: | # python -m pip install --upgrade --user pip - pip install -r requirements/base.txt -U -f https://download.pytorch.org/whl/torch_stable.html -q + pip install -r requirements.txt -U -f https://download.pytorch.org/whl/torch_stable.html -q pip install flake8 python --version pip --version diff --git a/.github/workflows/docs-check.yml b/.github/workflows/docs-check.yml index 4de81bdc27..4aaac0d41e 100644 --- a/.github/workflows/docs-check.yml +++ b/.github/workflows/docs-check.yml @@ -36,7 +36,7 @@ jobs: # uses: actions/cache@v2 # with: # path: ~/.cache/pip -# key: ${{ runner.os }}-pip-${{ hashFiles('requirements/base.txt') }} +# key: ${{ runner.os }}-pip-${{ hashFiles('requirements.txt') }} # restore-keys: | # ${{ runner.os }}-pip- # @@ -75,13 +75,13 @@ jobs: uses: actions/cache@v2 with: path: ~/.cache/pip - key: ${{ runner.os }}-pip-${{ hashFiles('requirements/base.txt') }} + key: ${{ runner.os }}-pip-${{ hashFiles('requirements.txt') }} restore-keys: | ${{ runner.os }}-pip- - name: Install dependencies run: | - pip install --requirement requirements/base.txt --upgrade-strategy only-if-needed --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --quiet + pip install --requirement requirements.txt --upgrade-strategy only-if-needed --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --quiet pip install --requirement docs/requirements.txt # install Texlive, see https://linuxconfig.org/how-to-install-latex-on-ubuntu-20-04-focal-fossa-linux sudo apt-get update && sudo apt-get install -y texlive-latex-extra dvipng texlive-pictures diff --git a/.gitignore b/.gitignore index 2c179cd36b..96f7417d80 100644 --- a/.gitignore +++ b/.gitignore @@ -138,7 +138,6 @@ MNIST # Lightning logs lightning_logs -datasets *.gz *-batches-py simclr.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 4d28abc99a..fea424b9d0 100644 --- a/CHANGELOG.md +++ 
b/CHANGELOG.md @@ -31,6 +31,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added Linear Regression - Added Moco2g - Added simclr +- Added RL module - Added Loggers - Added Transforms - Added Tiny Datasets @@ -42,12 +43,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed +- Device is no longer set in the DQN model init +- Moved RL loss function to the losses module +- Moved rl.common.experience to datamodules - train_batch function to VPG model to generate batch of data at each step (POC) - Experience source no longer gets initialized with a device, instead the device is passed at each step() - Refactored ExperienceSource classes to be handle multiple environments. ### Removed +- Removed N-Step DQN as the latest version of the DQN supports N-Step by setting the `n_step` arg to n - Deprecated common.experience ### Fixed diff --git a/MANIFEST.in b/MANIFEST.in index d3f4c4f33d..e306b2618d 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -25,6 +25,7 @@ recursive-exclude docs * exclude docs # Include the Requirements +include requirements.txt recursive-include requirements *.txt # Exclude build configs diff --git a/README.md b/README.md index 5e905ce783..28f398f7cc 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,7 @@ Install bleeding-edge (no guarantees) pip install git+https://github.com/PytorchLightning/pytorch-lightning-bolts.git@master --upgrade ``` -In case you wan to have full experience you can install all optional packages at once +In case you want to have full experience you can install all optional packages at once ```bash pip install pytorch-lightning-bolts["extra"] ``` diff --git a/docs/source/classic_ml.rst b/docs/source/classic_ml.rst index d3b3c39712..8a1f8b3aa7 100644 --- a/docs/source/classic_ml.rst +++ b/docs/source/classic_ml.rst @@ -9,7 +9,7 @@ half-precision training. Linear Regression ----------------- Linear regression fits a linear model between a real-valued target variable :math:`y` and one or more features :math:`X`. We -estimate the regression coefficients that minimizes the mean squared error between the predicted and true target +estimate the regression coefficients that minimize the mean squared error between the predicted and true target values. We formulate the linear regression model as a single-layer neural network. By default we include only one neuron in @@ -69,7 +69,7 @@ Add either L1 or L2 regularization, or both, by specifying the regularization st trainer.test(test_dataloaders=dm.test_dataloader(batch_size=12)) -Any input will be flattened across all dimensions except the firs one (batch). +Any input will be flattened across all dimensions except the first one (batch). This means images, sound, etc... work out of the box. .. 
code-block:: python diff --git a/docs/source/conf.py b/docs/source/conf.py index f2dc1442d9..7114016ba1 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -328,7 +328,7 @@ def package_list_from_file(file): MOCK_PACKAGES = [] if SPHINX_MOCK_REQUIREMENTS: # mock also base packages when we are on RTD since we don't install them there - MOCK_PACKAGES += package_list_from_file(os.path.join(PATH_ROOT, 'requirements', 'base.txt')) + MOCK_PACKAGES += package_list_from_file(os.path.join(PATH_ROOT, 'requirements.txt')) MOCK_PACKAGES += package_list_from_file(os.path.join(PATH_ROOT, 'requirements', 'models.txt')) MOCK_PACKAGES += package_list_from_file(os.path.join(PATH_ROOT, 'requirements', 'loggers.txt')) diff --git a/docs/source/dataloaders.rst b/docs/source/dataloaders.rst index efe932027b..4101003ba7 100644 --- a/docs/source/dataloaders.rst +++ b/docs/source/dataloaders.rst @@ -3,7 +3,10 @@ AsynchronousLoader This dataloader behaves identically to the standard pytorch dataloader, but will transfer data asynchronously to the GPU with training. You can also use it to wrap an existing dataloader. -Example:: +Example: + +.. code-block:: python + dataloader = AsynchronousLoader(DataLoader(ds, batch_size=16), device=device) for b in dataloader: @@ -11,11 +14,3 @@ Example:: .. autoclass:: pl_bolts.datamodules.async_dataloader.AsynchronousLoader :noindex: - ------------------- - -DummyDataset ------------- - -.. autoclass:: pl_bolts.datamodules.dummy_dataset.DummyDataset - :noindex: diff --git a/docs/source/datamodules.rst b/docs/source/datamodules.rst index 6468326e5c..94c7fc28d8 100644 --- a/docs/source/datamodules.rst +++ b/docs/source/datamodules.rst @@ -7,9 +7,9 @@ DataModules (introduced in PyTorch Lightning 0.9.0) decouple the data from a mod is simply a collection of a training dataloder, val dataloader and test dataloader. In addition, it specifies how to: -- Downloading/preparing data. +- Download/prepare data. - Train/val/test splits. -- Transforms +- Transform Then you can use it like this: diff --git a/docs/source/datasets.rst b/docs/source/datasets.rst new file mode 100644 index 0000000000..4e54095022 --- /dev/null +++ b/docs/source/datasets.rst @@ -0,0 +1,41 @@ +######## +Datasets +######## +Collection of useful datasets + +-------- + +********* +Debugging +********* +Use these datasets to debug + +DummyDataset +============ + +.. autoclass:: pl_bolts.datasets.dummy_dataset.DummyDataset + :noindex: + +DummyDetectionDataset +===================== + +.. autoclass:: pl_bolts.datasets.dummy_dataset.DummyDetectionDataset + :noindex: + +RandomDataset +============= + +.. autoclass:: pl_bolts.datasets.dummy_dataset.RandomDataset + :noindex: + +RandomDictDataset +================= + +.. autoclass:: pl_bolts.datasets.dummy_dataset.RandomDictDataset + :noindex: + +RandomDictStringDataset +======================= + +.. autoclass:: pl_bolts.datasets.dummy_dataset.RandomDictStringDataset + :noindex: diff --git a/docs/source/index.rst b/docs/source/index.rst index 001990c191..cf45a1d243 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -33,6 +33,13 @@ PyTorch-Lightning-Bolts documentation sklearn_datamodule vision_datamodules +.. toctree:: + :maxdepth: 2 + :name: datasets + :caption: Datasets + + datasets + .. toctree:: :maxdepth: 2 :name: dataloaders @@ -53,10 +60,17 @@ PyTorch-Lightning-Bolts documentation :caption: Models models_howto - autoencoders classic_ml + +.. 
toctree:: + :maxdepth: 2 + :name: vision + :caption: Vision models + + autoencoders convolutional gans + reinforce_learn self_supervised_models .. toctree:: :maxdepth: 2 @@ -90,6 +104,7 @@ Indices and tables readme api/pl_bolts.callbacks api/pl_bolts.datamodules + api/pl_bolts.datasets api/pl_bolts.metrics api/pl_bolts.models api/pl_bolts.callbacks diff --git a/docs/source/introduction_guide.rst b/docs/source/introduction_guide.rst index 2ff923a911..a16ba08818 100644 --- a/docs/source/introduction_guide.rst +++ b/docs/source/introduction_guide.rst @@ -10,7 +10,7 @@ Bolts is a Deep learning research and production toolbox of: - Losses. - Datasets. -**The Main goal of bolts is to enable trying new ideas as fast as possible!** +**The main goal of Bolts is to enable trying new ideas as fast as possible!** All models are tested (daily), benchmarked, documented and work on CPUs, TPUs, GPUs and 16-bit precision. @@ -90,11 +90,11 @@ All models are tested (daily), benchmarked, documented and work on CPUs, TPUs, G Community Built --------------- -Bolts are built-by the Lightning community and contributed to bolts. +The Lightning community builds bolts and contributes them to Bolts. The lightning team guarantees that contributions are: -1. Rigorously Tested (CPUs, GPUs, TPUs). -2. Rigorously Documented. +1. Rigorously tested (CPUs, GPUs, TPUs). +2. Rigorously documented. 3. Standardized via PyTorch Lightning. 4. Optimized for speed. 5. Checked for correctness. @@ -351,7 +351,7 @@ In case your job or research doesn't need a "hammer", we offer implementations o which benefit from lightning's multi-GPU and TPU support. So, now you can run huge workloads scalably, without needing to do any engineering. -For instance, here we can run Logistic Regression on Imagenet (each epoch takes about 3 minutes)! +For instance, here we can run logistic regression on Imagenet (each epoch takes about 3 minutes)! .. code-block:: python @@ -414,7 +414,7 @@ But more importantly, you can scale up to many GPUs, TPUs or even CPUs Logistic Regression ^^^^^^^^^^^^^^^^^^^ -Here's an example for Logistic regression +Here's an example for logistic regression .. code-block:: python @@ -436,7 +436,7 @@ Here's an example for Logistic regression trainer.test(test_dataloaders=dm.test_dataloader(batch_size=12)) -Any input will be flattened across all dimensions except the firs one (batch). +Any input will be flattened across all dimensions except the first one (batch). This means images, sound, etc... work out of the box. .. code-block:: python diff --git a/docs/source/losses.rst b/docs/source/losses.rst index 3f2b120fee..44b401dfcc 100644 --- a/docs/source/losses.rst +++ b/docs/source/losses.rst @@ -10,3 +10,33 @@ This package lists common losses across research domains Your Loss --------- We're cleaning up many of our losses, but in the meantime, submit a PR to add your loss here! + +------------- + +Reinforcement Learning +====================== +These are common losses used in RL. + +--------------- + +DQN Loss +-------- + +.. autofunction:: pl_bolts.losses.rl.dqn_loss + :noindex: + +--------------- + +Double DQN Loss +--------------- + +.. autofunction:: pl_bolts.losses.rl.double_dqn_loss + :noindex: + +--------------- + +Per DQN Loss +------------ + +..
autofunction:: pl_bolts.losses.rl.per_dqn_loss + :noindex: diff --git a/docs/source/models.rst b/docs/source/models.rst index 09ae1888c5..924b39de4d 100644 --- a/docs/source/models.rst +++ b/docs/source/models.rst @@ -15,7 +15,7 @@ by adding your contribution to bolts you get these **additional** benefits! 6. We'll pretrain expensive models for you and host weights. 7. We will improve the speed of your models! 8. Eligible for invited talks to discuss your implementation. - 9. Lightning Swag + involvement in the broader contributor community :) + 9. Lightning swag + involvement in the broader contributor community :) .. note:: You still get to keep your attribution and be recognized for your work! @@ -98,7 +98,7 @@ We request that each contribution have: - Your name and your team's name as the implementation authors. - Your team's affiliation - Any generated examples, or result plots. - - Hyperparameters configurations for the results. + - Hyperparameter configurations for the results. Thank you for all your amazing contributions! diff --git a/docs/source/reinforce_learn.rst b/docs/source/reinforce_learn.rst new file mode 100644 index 0000000000..4737b60764 --- /dev/null +++ b/docs/source/reinforce_learn.rst @@ -0,0 +1,668 @@ +Reinforcement Learning +====================== + +This module is a collection of common RL approaches implemented in Lightning. + +----------------- + +Module authors +-------------- + +Contributions by: `Donal Byrne `_ + +- DQN +- Double DQN +- Dueling DQN +- Noisy DQN +- NStep DQN +- Prioritized Experience Replay DQN +- Reinforce +- Vanilla Policy Gradient + +------------ + +.. note:: + RL models currently only support CPU and single GPU training with `distributed_backend=dp`. + Full GPU support will be added in later updates. + + +DQN Models +---------- + +The following models are based on DQN. DQN uses value-based learning, deciding what action to take based +on the model's current learned value (V), or the state action value (Q), of the current state. These values are defined +as the discounted total reward of the agent's state or state action pair. + +--------------- + +Deep-Q-Network (DQN) +^^^^^^^^^^^^^^^^^^^^ + +DQN model introduced in `Playing Atari with Deep Reinforcement Learning `_. +Paper authors: Volodymyr Mnih, Koray Kavukcuoglu, David Silver, Alex Graves, Ioannis Antonoglou, Daan Wierstra, Martin Riedmiller. + +Original implementation by: `Donal Byrne `_ + +The DQN was introduced in `Playing Atari with Deep Reinforcement Learning `_ by +researchers at DeepMind. This took the concept of tabular Q learning and scaled it to much larger problems by +approximating the Q function using a deep neural network. + +The goal behind DQN was to take the simple control method of Q learning and scale it up in order to solve complicated +tasks. As well as this, the method needed to be stable. The DQN solves these issues with the following additions. + +**Approximated Q Function** + +Storing Q values in a table works well in theory, but is completely unscalable. Instead, the authors approximate the +Q function using a deep neural network. This allows the DQN to be used for much more complicated tasks. + +**Replay Buffer** + +Similar to supervised learning, the DQN learns on randomly sampled batches of previous data stored in an +Experience Replay Buffer. The 'target' is calculated using the Bellman equation + +.. math:: + + Q(s,a) \leftarrow r+{\gamma}\max_{a'{\in}A}Q(s',a') + +and then we optimize using SGD just like a standard supervised learning problem. + +..
math:: + + L=(Q(s,a)-(r+{\gamma}\max_{a'{\in}A}Q(s',a')))^2 + +DQN Results +~~~~~~~~~~~ + +**DQN: Pong** + +.. image:: _images/rl_benchmark/pong_dqn_baseline_results.jpg + :width: 800 + :alt: DQN Baseline Results + +Example:: + + from pl_bolts.models.rl import DQN + dqn = DQN("PongNoFrameskip-v4") + trainer = Trainer() + trainer.fit(dqn) + +.. autoclass:: pl_bolts.models.rl.dqn_model.DQN + :noindex: + +--------------- + +Double DQN +^^^^^^^^^^ + +Double DQN model introduced in `Deep Reinforcement Learning with Double Q-learning `_ +Paper authors: Hado van Hasselt, Arthur Guez, David Silver + +Original implementation by: `Donal Byrne `_ + +The original DQN tends to overestimate Q values during the Bellman update, leading to instability that is harmful to +training. This is due to the max operation in the Bellman equation. + +We are constantly taking the max of our agent's estimates +during our update. This would seem reasonable if we could trust these estimates. However, during the early stages of +training, the estimates for these values will be off-center and can lead to instability in training until +our estimates become more reliable. + +The Double DQN fixes this overestimation by choosing actions for the next state using the main trained network, +but uses the values of these actions from the more stable target network. So we are still going to take the greedy +action, but the value will be less "optimistic" because it is chosen by the target network. + +**DQN expected return** + + +.. math:: + + Q(s_t, a_t) = r_t + \gamma * \max_{a}Q'(S_{t+1}, a) + +**Double DQN expected return** + +.. math:: + + Q(s_t, a_t) = r_t + \gamma * Q'(S_{t+1}, \arg\max_a Q(S_{t+1}, a)) + +Double DQN Results +~~~~~~~~~~~~~~~~~~ + +**Double DQN: Pong** + +.. image:: _images/rl_benchmark/pong_double_dqn_baseline_results.jpg + :width: 800 + :alt: Double DQN Result + +**DQN vs Double DQN: Pong** + +orange: DQN + +blue: Double DQN + +.. image:: _images/rl_benchmark/dqn_ddqn_comparison.jpg + :width: 800 + :alt: Double DQN Comparison Result + +Example:: + + from pl_bolts.models.rl import DoubleDQN + ddqn = DoubleDQN("PongNoFrameskip-v4") + trainer = Trainer() + trainer.fit(ddqn) + +.. autoclass:: pl_bolts.models.rl.double_dqn_model.DoubleDQN + :noindex: + +--------------- + +Dueling DQN +^^^^^^^^^^^ + +Dueling DQN model introduced in `Dueling Network Architectures for Deep Reinforcement Learning `_ +Paper authors: Ziyu Wang, Tom Schaul, Matteo Hessel, Hado van Hasselt, Marc Lanctot, Nando de Freitas + +Original implementation by: `Donal Byrne `_ + +The Q value that we are trying to approximate can be divided into two parts, the state value V(s) and the 'advantage' +of actions in that state A(s, a). Instead of having one full network estimate the entire Q value, Dueling DQN uses two +estimator heads in order to separate the estimation of the two parts. + +The value is the same as in value iteration. It is the discounted expected reward achieved from state s. Think of the +value as the 'base reward' from being in state s. + +The advantage tells us how much 'extra' reward we get from taking action a while in state s. The advantage bridges the +gap between Q(s, a) and V(s) as Q(s, a) = V(s) + A(s, a). + +In the paper `Dueling Network Architectures for Deep Reinforcement Learning ` the +network uses two heads, one outputs the state value and the other outputs the advantage. This leads to better +training stability, faster convergence and overall better results.
The V head outputs a single scalar +(the state value), while the advantage head outputs a tensor equal to the size of the action space, containing +an advantage value for each action in state s. + +Changing the network architecture is not enough; we also need to ensure that the advantage mean is 0. This is done +by subtracting the mean advantage from the Q value. This essentially pulls the mean advantage to 0. + +.. math:: + + Q(s, a) = V(s) + A(s, a) - 1/N * \sum_k A(s, k) + +Dueling DQN Benefits +~~~~~~~~~~~~~~~~~~~~ + +- Ability to efficiently learn the state value function. In the dueling network, every Q update also updates the value + stream, whereas in DQN only the value of the chosen action is updated. This provides a better approximation of the + values. +- The differences between total Q values for a given state are quite small in relation to the magnitude of Q. The + difference in the Q values between the best action and the second best action can be very small, while the average + state value can be much larger. The differences in scale can introduce noise, which may lead to the greedy policy + switching the priority of these actions. The separate estimators for state value and advantage make the Dueling + DQN robust to this type of scenario. + +Dueling DQN Results +~~~~~~~~~~~~~~~~~~~ + +The results below show a noticeable improvement over the original DQN network. + + +**Dueling DQN baseline: Pong** + +Similar to the results of the DQN baseline, the agent has a period where the number of steps per episode increases as +it begins to hold its own against the heuristic opponent, but then the steps per episode quickly begin to drop +as it gets better and starts to beat its opponent faster and faster. There is a noticeable point at step ~250k +where the agent goes from losing to winning. + +As you can see by the total rewards, the dueling network's training progression is very stable and continues to trend +upward until it finally plateaus. + +.. image:: _images/rl_benchmark/pong_dueling_dqn_results.jpg + :width: 800 + :alt: Dueling DQN Result + +**DQN vs Dueling DQN: Pong** + +In comparison to the base DQN, we see that the Dueling network's training is much more stable and is able to reach a +score in the high teens faster than the DQN agent. Even though the Dueling network is more stable and outperforms DQN +early in training, by the end of training the two networks end up at the same point. + +This could very well be due to the simplicity of the Pong environment. + + - Orange: DQN + - Red: Dueling DQN + +.. image:: _images/rl_benchmark/pong_dueling_dqn_comparison.jpg + :width: 800 + :alt: Dueling DQN Comparison Result + +Example:: + + from pl_bolts.models.rl import DuelingDQN + dueling_dqn = DuelingDQN("PongNoFrameskip-v4") + trainer = Trainer() + trainer.fit(dueling_dqn) + +.. autoclass:: pl_bolts.models.rl.dueling_dqn_model.DuelingDQN + :noindex: + +-------------- + +Noisy DQN +^^^^^^^^^ + +Noisy DQN model introduced in `Noisy Networks for Exploration `_ +Paper authors: Meire Fortunato, Mohammad Gheshlaghi Azar, Bilal Piot, Jacob Menick, Ian Osband, Alex Graves, +Vlad Mnih, Remi Munos, Demis Hassabis, Olivier Pietquin, Charles Blundell, Shane Legg + +Original implementation by: `Donal Byrne `_ + +Up until now the DQN agent has used a separate exploration policy, generally epsilon-greedy, where start and end values +are set for its exploration.
`Noisy Networks For Exploration ` introduces +a new exploration strategy by adding noise parameters to the weights of the fully connected layers, which get updated +during backpropagation of the network. The noise parameters drive +the exploration of the network instead of simply taking random actions more frequently at the start of training and +less frequently towards the end. The authors +propose two ways of doing this. + +During the optimization step a new set of noisy parameters is sampled. During training the agent acts according to +the fixed set of parameters. At the next optimization step, the parameters are updated with a new sample. This ensures +the agent always acts based on the parameters that are drawn from the current noise +distribution. + +The authors propose two methods of injecting noise into the network. + +1) Independent Gaussian Noise: This injects noise per weight. For each weight a random value is taken from + the distribution. Noise parameters are stored inside the layer and are updated during backpropagation. + The output of the layer is calculated as normal. +2) Factorized Gaussian Noise: This injects noise per input/output. In order to minimize the number of random values + this method stores two random vectors, one with the size of the input and the other with the size of the output. + Using these two vectors, a random matrix is generated for the layer by calculating the outer product of the vectors. + + +Noisy DQN Benefits +~~~~~~~~~~~~~~~~~~ + +- Improved exploration function. Instead of just performing completely random actions, we add a decreasing amount of noise + and uncertainty to our policy, allowing the agent to explore while still utilising its policy. +- The fact that this method is automatically tuned means that we do not have to tune hyperparameters for + epsilon-greedy! + +.. note:: + For now I have just implemented the Independent Gaussian noise as it has been reported there isn't much difference + in results for these benchmark environments. + +In order to update the basic DQN to a Noisy DQN, the fully connected layers of the network are swapped for noisy linear layers. + +Noisy DQN Results +~~~~~~~~~~~~~~~~~ + +The results below show improved stability and faster performance growth. + +**Noisy DQN baseline: Pong** + + +Similar to the other improvements, the average score of the agent reaches positive numbers around the 250k mark and +steadily increases till convergence. + +.. image:: _images/rl_benchmark/pong_noisy_dqn_results.jpg + :width: 800 + :alt: Noisy DQN Result + +**DQN vs Noisy DQN: Pong** + +In comparison to the base DQN, the Noisy DQN is more stable and is able to converge on an optimal policy much faster +than the original. It seems that the replacement of the epsilon-greedy strategy with network noise provides a better +form of exploration. + +- Orange: DQN +- Red: Noisy DQN + +.. image:: _images/rl_benchmark/pong_noisy_dqn_comparison.jpg + :width: 800 + :alt: Noisy DQN Comparison Result + +Example:: + + from pl_bolts.models.rl import NoisyDQN + noisy_dqn = NoisyDQN("PongNoFrameskip-v4") + trainer = Trainer() + trainer.fit(noisy_dqn) + +.. autoclass:: pl_bolts.models.rl.noisy_dqn_model.NoisyDQN + :noindex: + +-------------- + +N-Step DQN +^^^^^^^^^^ + +N-Step DQN model introduced in `Learning to Predict by the Methods of Temporal Differences `_ +Paper authors: Richard S. Sutton + +Original implementation by: `Donal Byrne `_ + +N-Step DQN was introduced in `Learning to Predict by the Methods of Temporal Differences +`_.
+This method improves upon the original DQN by updating our Q values with the expected reward from multiple steps in the +future, as opposed to the expected reward from the immediate next state. Getting the Q value for a state action +pair using a single step looks like this + +.. math:: + + Q(s_t,a_t)=r_t+{\gamma}\max_{a'}Q(s_{t+1},a') + +but because the Q function is recursive we can continue to roll this out into multiple steps, looking at the expected +return for each step into the future. + +.. math:: + + Q(s_t,a_t)=r_t+{\gamma}r_{t+1}+{\gamma}^2\max_{a'}Q(s_{t+2},a') + +The above example shows a 2-step look ahead, but this could be rolled out to the end of the episode, which is just +Monte Carlo learning. Although we could just do a Monte Carlo update and look forward to the end of the episode, it +wouldn't be a good idea. Every time we take another step into the future, we are basing our approximation off our +current policy. For a large portion of training, our policy is going to be less than optimal. For example, at the start +of training, our policy will be in a state of high exploration, and will be little better than random. + +.. note:: + For each rollout step you must scale the discount factor accordingly by the number of steps. As you can see from the + equation above, the second gamma value is to the power of 2. If we rolled this out one step further, we would use + gamma to the power of 3 and so on. + +So if we are approximating future rewards off a bad policy, chances are those approximations are going to be pretty +bad and every time we unroll our update equation, the worse it will get. The fact that we are using an off-policy +method like DQN with a large replay buffer will make this even worse, as there is a high chance that we will be +training on experiences using an old policy that was worse than our current policy. + +So we need to strike a balance between looking far enough ahead to improve the convergence of our agent, but not so far +that our updates become unstable. In general, small values of 2-4 work best. + +N-Step Benefits +~~~~~~~~~~~~~~~ + +- Multi-step learning is capable of learning faster than typical 1-step learning methods. +- Note that this method introduces a new hyperparameter n, although n=4 is generally a good starting point and provides + good results across the board. + +N-Step Results +~~~~~~~~~~~~~~ + +As expected, the N-Step DQN converges much faster than the standard DQN; however, it also adds more instability to the +loss of the agent. This can be seen in the following experiments. + + +**N-Step DQN: Pong** + +The N-Step DQN shows the greatest increase in performance with respect to the other DQN variations. +After less than 150k steps the agent begins to consistently win games and achieves the top score after ~170K steps. +This is reflected in the sharp peak of the total episode steps and of course, the total episode rewards. + +.. image:: _images/rl_benchmark/pong_nstep_dqn_1.jpg + :width: 800 + :alt: N-Step DQN Result + +**DQN vs N-Step DQN: Pong** + +This improvement is shown in stark contrast to the base DQN, which only begins to win games after 250k steps and +requires over twice as many steps (450k) as the N-Step agent to achieve the high score of 21. One important thing to +notice is the large increase in the loss of the N-Step agent. This is expected as the agent is building +its expected reward off approximations of the future states. The larger the size of N, the greater the instability.
+Previous literature, listed below, shows the best results for the Pong environment with an N step between 3-5. +For these experiments I opted for an N step of 4. + + +.. image:: _images/rl_benchmark/pong_nstep_dqn_2.jpg + :width: 800 + :alt: N-Step DQN Comparison Results + +Example:: + + from pl_bolts.models.rl import DQN + n_step_dqn = DQN("PongNoFrameskip-v4", n_steps=4) + trainer = Trainer() + trainer.fit(n_step_dqn) + +-------------- + +Prioritized Experience Replay DQN +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +PER DQN model introduced in `Prioritized Experience Replay `_ +Paper authors: Tom Schaul, John Quan, Ioannis Antonoglou, David Silver + +Original implementation by: `Donal Byrne `_ + +The standard DQN uses a buffer to break up the correlation between experiences, drawing uniform random samples for each +batch. Instead of just randomly sampling from the buffer, prioritized experience replay (PER) prioritizes these samples +based on training loss. This concept was introduced in the paper +`Prioritized Experience Replay `__. + +Essentially we want to train more on the samples that surprise the agent. + +The priority of each sample is defined below + + +.. math:: + + P(i) = p^\alpha_i / \sum_k p_k^\alpha + + +where p_i is the priority of the ith sample in the buffer and +𝛼 is the number that shows how much emphasis we give to the priority. If 𝛼 = 0, our +sampling will become uniform as in the classic DQN method. Larger values for 𝛼 put +more stress on samples with higher priority. + +It's important that new samples are set to the highest priority so that they are sampled soon. This, however, introduces +bias to new samples in our dataset. In order to compensate for this bias, the value of the weight is defined as + +.. math:: + + w_i=(N \cdot P(i))^{-\beta} + +where beta is a hyperparameter between 0 and 1. When beta is 1 the bias is fully compensated. However, the authors noted that +in practice it is better to start beta with a small value near 0 and slowly increase it to 1. + +PER Benefits +~~~~~~~~~~~~ + +- The benefits of this technique are that the agent sees more samples that it struggled with and gets more + chances to improve upon it. + +**Memory Buffer** + + +The first step is to replace the standard experience replay buffer with the prioritized experience replay buffer. This +is pretty large (100+ lines) so I won't go through it here. There are two buffers implemented. The first is a naive +list-based buffer found in memory.PERBuffer and the second is a more efficient buffer using a Sum Tree data structure. + +The list-based version is simpler, but has a sample complexity of O(N). The Sum Tree in comparison has a complexity +of O(1) for sampling and O(log N) for updating priorities. + +**Update loss function** + +The next thing we do is to use the sample weights that we get from PER. Add the following code to the end of the +loss function. This applies the weights of our sample to the batch loss. Then we return the mean loss and weighted loss +for each datum, with the addition of a small epsilon value. + + +PER Results +~~~~~~~~~~~ + +The results below show improved stability and faster performance growth. + +**PER DQN: Pong** + +Similar to the other improvements, we see that PER improves the stability of the agent's training and converges +on an optimal policy faster. + +..
image:: _images/rl_benchmark/pong_per_dqn_baseline_v1_results.jpg + :width: 800 + :alt: PER DQN Results + +**DQN vs PER DQN: Pong** + +In comparison to the base DQN, the PER DQN does show improved stability and performance. As expected, the loss +of the PER DQN is significantly lower. This is the main objective of PER: focusing training on the experiences with high loss. + +It is important to note that loss is not the only metric we should be looking at. Although the agent may have very +low loss during training, it may still perform poorly due to lack of exploration. + +.. image:: _images/rl_benchmark/pong_per_dqn_baseline_v1_results_comp.jpg + :width: 800 + :alt: PER DQN Results + +- Orange: DQN +- Pink: PER DQN + +Example:: + + from pl_bolts.models.rl import PERDQN + per_dqn = PERDQN("PongNoFrameskip-v4") + trainer = Trainer() + trainer.fit(per_dqn) + +.. autoclass:: pl_bolts.models.rl.per_dqn_model.PERDQN + :noindex: + + +-------------- + +Policy Gradient Models +---------------------- +The following models are based on Policy Gradients. Unlike the Q learning models shown before, Policy-based models +do not try to learn the specific values of states or state action pairs. Instead, they cut out the middle man and +directly learn the policy distribution. In Policy Gradient models we update our network parameters in the direction +suggested by our policy gradient in order to find a policy that produces the best results. + +Policy Gradient Key Points: + - Outputs a distribution of actions instead of discrete Q values + - Optimizes the policy directly, instead of indirectly through the optimization of Q values + - The policy distribution of actions allows the model to handle more complex action spaces, such as continuous actions + - The policy distribution introduces stochasticity, providing natural exploration to the model + - The policy distribution provides a more stable update, as a change in weights will only change the total distribution + slightly, whereas changing weights based on the Q value of state S will change all Q values of similar states. + - Policy gradients tend to converge faster, however they are not as sample efficient and generally require more + interactions with the environment. + + +-------------- + +REINFORCE +^^^^^^^^^ + +REINFORCE model introduced in `Policy Gradient Methods For Reinforcement Learning With Function Approximation `_ +Paper authors: Richard S. Sutton, David McAllester, Satinder Singh, Yishay Mansour + +Original implementation by: `Donal Byrne `_ + +REINFORCE is one of the simplest forms of the Policy Gradient method of RL. This method uses a Monte Carlo rollout, +where it steps through entire episodes of the environment to build up trajectories, computing the total rewards. The +algorithm is as follows: + +1. Initialize our network. +2. Play N full episodes, saving the transitions through the environment. +3. For every step `t` in each episode `k` we calculate the discounted reward of the subsequent steps. + +.. math:: + + Q_{k,t} = \sum_{i=0}\gamma^i r_i + +4. Calculate the loss for all transitions. + +.. math:: + + L = - \sum_{k,t} Q_{k,t} \log(\pi(S_{k,t}, A_{k,t})) + +5. Perform SGD on the loss and repeat. + +What this loss function is saying is simply that we want to take the log probability of action A at state S given +our policy (network output). This is then scaled by the discounted reward that we calculated in the previous step. +We then take the negative of our sum.
This is because the loss is minimized during SGD, but we want to +maximize our policy. + +.. note:: + The current implementation does not actually wait for the batch of episodes to complete every time, as we pass in a + fixed batch size. For the time being we simply use a large batch size to accommodate this. This approach still works + well for simple tasks as it still manages to get an accurate Q value by using a large batch size, but it is not + as accurate or completely correct. This will be updated in a later version. + + +REINFORCE Benefits +~~~~~~~~~~~~~~~~~~~~~~~~ + +- Simple and straightforward + +- Computationally more efficient for simple tasks such as Cartpole than the Value Based methods. + +REINFORCE Results +~~~~~~~~~~~~~~~~~~~~~ + +Hyperparameters: + +- Batch Size: 800 +- Learning Rate: 0.01 +- Episodes Per Batch: 4 +- Gamma: 0.99 + +TODO: Add results graph + +Example:: + + from pl_bolts.models.rl import Reinforce + reinforce = Reinforce("CartPole-v0") + trainer = Trainer() + trainer.fit(reinforce) + +.. autoclass:: pl_bolts.models.rl.reinforce_model.Reinforce + :noindex: + +-------------- + +Vanilla Policy Gradient +^^^^^^^^^^^^^^^^^^^^^^^ + +Vanilla Policy Gradient model introduced in `Policy Gradient Methods For Reinforcement Learning With Function Approximation `_ +Paper authors: Richard S. Sutton, David McAllester, Satinder Singh, Yishay Mansour + +Original implementation by: `Donal Byrne `_ + +Vanilla Policy Gradient (VPG) expands upon the REINFORCE algorithm and improves some of its major issues. The major +issue with REINFORCE is that it has high variance. This can be improved by subtracting a baseline value from the +Q values. For this implementation we use the average reward as our baseline. + +Although Policy Gradients are able to explore naturally due to the stochastic nature of the model, the agent can still +frequently get stuck in a local optimum. In order to improve this, VPG adds an entropy term to improve exploration. + +.. math:: + + H(\pi) = - \sum \pi (a | s) \log \pi (a | s) + +To further control the amount of additional entropy in our model we scale the entropy term by a small beta value. The +scaled entropy is then subtracted from the policy loss. + +VPG Benefits +~~~~~~~~~~~~~~~ + +- Addition of the baseline reduces variance in the model + +- Improved exploration due to entropy bonus + +VPG Results +~~~~~~~~~~~~~~~~ + +Hyperparameters: + +- Batch Size: 8 +- Learning Rate: 0.001 +- N Steps: 10 +- N environments: 4 +- Entropy Beta: 0.01 +- Gamma: 0.99 + +Example:: + + from pl_bolts.models.rl import VanillaPolicyGradient + vpg = VanillaPolicyGradient("CartPole-v0") + trainer = Trainer() + trainer.fit(vpg) + +.. autoclass:: pl_bolts.models.rl.vanilla_policy_gradient_model.VanillaPolicyGradient + :noindex: diff --git a/pl_bolts/__init__.py b/pl_bolts/__init__.py index 271653f015..25f14672bd 100644 --- a/pl_bolts/__init__.py +++ b/pl_bolts/__init__.py @@ -2,7 +2,7 @@ import os -__version__ = '0.2.2' +__version__ = '0.2.5' __author__ = 'PyTorchLightning et al.'
__author_email__ = 'name@pytorchlightning.ai' __license__ = 'Apache-2.0' @@ -45,12 +45,13 @@ else: # from pl_bolts.models.mnist_module import LitMNIST - from pl_bolts import models, metrics, callbacks, datamodules, transforms + from pl_bolts import models, metrics, callbacks, datamodules, transforms, datasets __all__ = [ # 'LitMNIST', 'models', 'metrics', 'callbacks', - 'datamodules' + 'datamodules', + 'datasets', ] diff --git a/pl_bolts/datamodules/__init__.py b/pl_bolts/datamodules/__init__.py index 1dd2e7c9aa..2e3447d2ac 100644 --- a/pl_bolts/datamodules/__init__.py +++ b/pl_bolts/datamodules/__init__.py @@ -1,5 +1,4 @@ from pl_bolts.datamodules.async_dataloader import AsynchronousLoader -from pl_bolts.datamodules.dummy_dataset import DummyDataset, DummyDetectionDataset try: from pl_bolts.datamodules.binary_mnist_datamodule import BinaryMNISTDataModule @@ -7,6 +6,11 @@ CIFAR10DataModule, TinyCIFAR10DataModule, ) + from pl_bolts.datamodules.experience_source import ( + ExperienceSourceDataset, + ExperienceSource, + DiscountedExperienceSource, + ) from pl_bolts.datamodules.fashion_mnist_datamodule import FashionMNISTDataModule from pl_bolts.datamodules.imagenet_datamodule import ImagenetDataModule from pl_bolts.datamodules.mnist_datamodule import MNISTDataModule diff --git a/pl_bolts/datamodules/cifar10_dataset.py b/pl_bolts/datamodules/cifar10_dataset.py index 63d2f1f744..5ddb44ab36 100644 --- a/pl_bolts/datamodules/cifar10_dataset.py +++ b/pl_bolts/datamodules/cifar10_dataset.py @@ -87,6 +87,9 @@ def __init__( self.train = train # training set or test set self.transform = transform + if not _PIL_AVAILABLE: + raise ImportError('You want to use PIL.Image for loading but it is not installed yet.') + os.makedirs(self.cached_folder_path, exist_ok=True) self.prepare_data(download) diff --git a/pl_bolts/datamodules/dummy_dataset.py b/pl_bolts/datamodules/dummy_dataset.py deleted file mode 100644 index 771a7fdbb7..0000000000 --- a/pl_bolts/datamodules/dummy_dataset.py +++ /dev/null @@ -1,68 +0,0 @@ -import torch -from torch.utils.data import Dataset, DataLoader - - -class DummyDataset(Dataset): - def __init__(self, *shapes, num_samples=10000): - """ - Generate a dummy dataset - - Args: - *shapes: list of shapes - num_samples: how many samples to use in this dataset - - Example:: - - from pl_bolts.datamodules import DummyDataset - - # mnist dims - >>> ds = DummyDataset((1, 28, 28), (1,)) - >>> dl = DataLoader(ds, batch_size=7) - ... 
- >>> batch = next(iter(dl)) - >>> x, y = batch - >>> x.size() - torch.Size([7, 1, 28, 28]) - >>> y.size() - torch.Size([7, 1]) - """ - super().__init__() - self.shapes = shapes - self.num_samples = num_samples - - def __len__(self): - return self.num_samples - - def __getitem__(self, idx): - samples = [] - for shape in self.shapes: - sample = torch.rand(*shape) - samples.append(sample) - - return samples - - -class DummyDetectionDataset(Dataset): - def __init__( - self, img_shape=(3, 256, 256), num_boxes=1, num_classes=2, num_samples=10000 - ): - super().__init__() - self.img_shape = img_shape - self.num_samples = num_samples - self.num_boxes = num_boxes - self.num_classes = num_classes - - def __len__(self): - return self.num_samples - - def _random_bbox(self): - c, h, w = self.img_shape - xs = torch.randint(w, (2,)) - ys = torch.randint(h, (2,)) - return [min(xs), min(ys), max(xs), max(ys)] - - def __getitem__(self, idx): - img = torch.rand(self.img_shape) - boxes = torch.tensor([self._random_bbox() for _ in range(self.num_boxes)]) - labels = torch.randint(self.num_classes, (self.num_boxes,)) - return img, {"boxes": boxes, "labels": labels} diff --git a/pl_bolts/datamodules/experience_source.py b/pl_bolts/datamodules/experience_source.py new file mode 100644 index 0000000000..6a4671234f --- /dev/null +++ b/pl_bolts/datamodules/experience_source.py @@ -0,0 +1,278 @@ +""" +Datamodules for RL models that rely on experiences generated during training +Based on implementations found here: https://github.com/Shmuma/ptan/blob/master/ptan/experience.py +""" +from abc import ABC +from collections import deque, namedtuple +from typing import Iterable, Callable, Tuple, List + +import torch +from gym import Env +from torch.utils.data import IterableDataset + +# Datasets + +Experience = namedtuple( + "Experience", field_names=["state", "action", "reward", "done", "new_state"] +) + + +class ExperienceSourceDataset(IterableDataset): + """ + Basic experience source dataset. Takes a generate_batch function that returns an iterator. 
+ The logic for the experience source and how the batch is generated is defined the Lightning model itself + """ + + def __init__(self, generate_batch: Callable): + self.generate_batch = generate_batch + + def __iter__(self) -> Iterable: + iterator = self.generate_batch() + return iterator + + +# Experience Sources +class BaseExperienceSource(ABC): + """ + Simplest form of the experience source + Args: + env: Environment that is being used + agent: Agent being used to make decisions + """ + + def __init__(self, env, agent) -> None: + self.env = env + self.agent = agent + + def runner(self) -> Experience: + """Iterable method that yields steps from the experience source""" + raise NotImplementedError("ExperienceSource has no stepper method implemented") + + +class ExperienceSource(BaseExperienceSource): + """ + Experience source class handling single and multiple environment steps + Args: + env: Environment that is being used + agent: Agent being used to make decisions + n_steps: Number of steps to return from each environment at once + """ + + def __init__(self, env, agent, n_steps: int = 1) -> None: + super().__init__(env, agent) + + self.pool = env if isinstance(env, (list, tuple)) else [env] + self.exp_history_queue = deque() + + self.n_steps = n_steps + self.total_steps = [] + self.states = [] + self.histories = [] + self.cur_rewards = [] + self.cur_steps = [] + self.iter_idx = 0 + + self._total_rewards = [] + + self.init_environments() + + def runner(self, device: torch.device) -> Tuple[Experience]: + """Experience Source iterator yielding Tuple of experiences for n_steps. These come from the pool + of environments provided by the user. + Args: + device: current device to be used for executing experience steps + Returns: + Tuple of Experiences + """ + while True: + # get actions for all envs + actions = self.env_actions(device) + + # step through each env + for env_idx, (env, action) in enumerate(zip(self.pool, actions)): + + exp = self.env_step(env_idx, env, action) + history = self.histories[env_idx] + history.append(exp) + self.states[env_idx] = exp.new_state + + self.update_history_queue(env_idx, exp, history) + + # Yield all accumulated history tuples to model + while self.exp_history_queue: + yield self.exp_history_queue.popleft() + + self.iter_idx += 1 + + def update_history_queue(self, env_idx, exp, history) -> None: + """ + Updates the experience history queue with the lastest experiences. In the event of an experience step is in + the done state, the history will be incrementally appended to the queue, removing the tail of the history + each time. 
+ Args: + env_idx: index of the environment + exp: the current experience + history: history of experience steps for this environment + """ + # If there is a full history of step, append history to queue + if len(history) == self.n_steps: + self.exp_history_queue.append(tuple(history)) + + if exp.done: + if 0 < len(history) < self.n_steps: + self.exp_history_queue.append(tuple(history)) + + # generate tail of history, incrementally append history to queue + while len(history) > 2: + history.popleft() + self.exp_history_queue.append(tuple(history)) + + # when there are only 2 experiences left in the history, + # append to the queue then update the env stats and reset the environment + if len(history) > 1: + self.update_env_stats(env_idx) + + history.popleft() + self.exp_history_queue.append(tuple(history)) + + # Clear that last tail in the history once all others have been added to the queue + history.clear() + + def init_environments(self) -> None: + """ + For each environment in the pool setups lists for tracking history of size n, state, current reward and + current step + """ + for env in self.pool: + self.states.append(env.reset()) + self.histories.append(deque(maxlen=self.n_steps)) + self.cur_rewards.append(0.0) + self.cur_steps.append(0) + + def env_actions(self, device) -> List[List[int]]: + """ + For each environment in the pool, get the correct action + Returns: + List of actions for each env, with size (num_envs, action_size) + """ + actions = [] + states_actions = self.agent(self.states, device) + + assert len(self.states) == len(states_actions) + + for idx, action in enumerate(states_actions): + actions.append(action if isinstance(action, list) else [action]) + + return actions + + def env_step(self, env_idx: int, env: Env, action: List[int]) -> Experience: + """ + Carries out a step through the given environment using the given action + Args: + env_idx: index of the current environment + env: env at index env_idx + action: action for this environment step + Returns: + Experience tuple + """ + next_state, r, is_done, _ = env.step(action[0]) + + self.cur_rewards[env_idx] += r + self.cur_steps[env_idx] += 1 + + exp = Experience(state=self.states[env_idx], action=action[0], reward=r, done=is_done, new_state=next_state) + + return exp + + def update_env_stats(self, env_idx: int) -> None: + """ + To be called at the end of the history tail generation during the termination state. 
Updates the stats + tracked for all environments + Args: + env_idx: index of the environment used to update stats + """ + self._total_rewards.append(self.cur_rewards[env_idx]) + self.total_steps.append(self.cur_steps[env_idx]) + self.cur_rewards[env_idx] = 0 + self.cur_steps[env_idx] = 0 + self.states[env_idx] = self.pool[env_idx].reset() + + def pop_total_rewards(self) -> List[float]: + """ + Returns the list of the current total rewards collected + Returns: + list of total rewards for all completed episodes for each environment since last pop + """ + rewards = self._total_rewards + + if rewards: + self._total_rewards = [] + self.total_steps = [] + + return rewards + + def pop_rewards_steps(self): + """ + Returns the list of the current total rewards and steps collected + Returns: + list of total rewards and steps for all completed episodes for each environment since last pop + """ + res = list(zip(self._total_rewards, self.total_steps)) + if res: + self._total_rewards, self.total_steps = [], [] + return res + + +class DiscountedExperienceSource(ExperienceSource): + """Outputs experiences with a discounted reward over N steps""" + + def __init__(self, env: Env, agent, n_steps: int = 1, gamma: float = 0.99): + super().__init__(env, agent, (n_steps + 1)) + self.gamma = gamma + self.steps = n_steps + + def runner(self, device: torch.device) -> Experience: + """ + Iterates through experience tuple and calculate discounted experience + Args: + device: current device to be used for executing experience steps + Yields: + Discounted Experience + """ + for experiences in super().runner(device): + last_exp_state, tail_experiences = self.split_head_tail_exp(experiences) + + total_reward = self.discount_rewards(tail_experiences) + + yield Experience(state=experiences[0].state, action=experiences[0].action, + reward=total_reward, done=experiences[0].done, new_state=last_exp_state) + + def split_head_tail_exp(self, experiences: Tuple[Experience]) -> Tuple[List, Tuple[Experience]]: + """ + Takes in a tuple of experiences and returns the last state and tail experiences based on + if the last state is the end of an episode + Args: + experiences: Tuple of N Experience + Returns: + last state (Array or None) and remaining Experience + """ + if experiences[-1].done and len(experiences) <= self.steps: + last_exp_state = experiences[-1].new_state + tail_experiences = experiences + else: + last_exp_state = experiences[-1].state + tail_experiences = experiences[:-1] + return last_exp_state, tail_experiences + + def discount_rewards(self, experiences: Tuple[Experience]) -> float: + """ + Calculates the discounted reward over N experiences + Args: + experiences: Tuple of Experience + Returns: + total discounted reward + """ + total_reward = 0.0 + for exp in reversed(experiences): + total_reward = (self.gamma * total_reward) + exp.reward + return total_reward diff --git a/pl_bolts/datasets/__init__.py b/pl_bolts/datasets/__init__.py new file mode 100644 index 0000000000..e2d319ce2f --- /dev/null +++ b/pl_bolts/datasets/__init__.py @@ -0,0 +1,7 @@ +from pl_bolts.datasets.dummy_dataset import ( + RandomDictStringDataset, + RandomDictDataset, + RandomDataset, + DummyDataset, + DummyDetectionDataset +) diff --git a/pl_bolts/datasets/dummy_dataset.py b/pl_bolts/datasets/dummy_dataset.py new file mode 100644 index 0000000000..44b728422e --- /dev/null +++ b/pl_bolts/datasets/dummy_dataset.py @@ -0,0 +1,161 @@ +import torch +from torch.utils.data import Dataset, DataLoader + + +class DummyDataset(Dataset): + """ + Generate 
a dummy dataset + + Args: + *shapes: list of shapes + num_samples: how many samples to use in this dataset + + Example:: + + from pl_bolts.datasets import DummyDataset + + >>> # mnist dims + >>> ds = DummyDataset((1, 28, 28), (1, )) + >>> dl = DataLoader(ds, batch_size=7) + >>> # get first batch + >>> batch = next(iter(dl)) + >>> x, y = batch + >>> x.size() + torch.Size([7, 1, 28, 28]) + >>> y.size() + torch.Size([7, 1]) + """ + def __init__(self, *shapes, num_samples: int = 10000): + super().__init__() + self.shapes = shapes + self.num_samples = num_samples + + def __len__(self): + return self.num_samples + + def __getitem__(self, idx: int): + sample = [] + for shape in self.shapes: + spl = torch.rand(*shape) + sample.append(spl) + return sample + + +class DummyDetectionDataset(Dataset): + """ + Generate a dummy dataset for detection + + Args: + *shapes: list of shapes + num_samples: how many samples to use in this dataset + + Example:: + + from pl_bolts.datasets import DummyDetectionDataset + + >>> ds = DummyDetectionDataset() + >>> dl = DataLoader(ds, batch_size=7) + """ + def __init__( + self, img_shape: tuple = (3, 256, 256), num_boxes: int = 1, num_classes: int = 2, num_samples: int = 10000 + ): + super().__init__() + self.img_shape = img_shape + self.num_samples = num_samples + self.num_boxes = num_boxes + self.num_classes = num_classes + + def __len__(self): + return self.num_samples + + def _random_bbox(self): + c, h, w = self.img_shape + xs = torch.randint(w, (2,)) + ys = torch.randint(h, (2,)) + return [min(xs), min(ys), max(xs), max(ys)] + + def __getitem__(self, idx: int): + img = torch.rand(self.img_shape) + boxes = torch.tensor([self._random_bbox() for _ in range(self.num_boxes)]) + labels = torch.randint(self.num_classes, (self.num_boxes,)) + return img, {"boxes": boxes, "labels": labels} + + +class RandomDictDataset(Dataset): + """ + Generate a dummy dataset with a dict structure + + Args: + size: tuple + num_samples: number of samples + + Example:: + + from pl_bolts.datasets import RandomDictDataset + + >>> ds = RandomDictDataset(10) + >>> dl = DataLoader(ds, batch_size=7) + """ + def __init__(self, size: int, num_samples: int = 250): + self.len = num_samples + self.data = torch.randn(num_samples, size) + + def __getitem__(self, index): + a = self.data[index] + b = a + 2 + return {'a': a, 'b': b} + + def __len__(self): + return self.len + + +class RandomDictStringDataset(Dataset): + """ + Generate a dummy dataset with strings + + Args: + size: tuple + num_samples: number of samples + + Example:: + + from pl_bolts.datasets import RandomDictStringDataset + + >>> ds = RandomDictStringDataset(10) + >>> dl = DataLoader(ds, batch_size=7) + """ + def __init__(self, size: int, num_samples: int = 250): + self.len = num_samples + self.data = torch.randn(num_samples, size) + + def __getitem__(self, index): + return {"id": str(index), "x": self.data[index]} + + def __len__(self): + return self.len + + +class RandomDataset(Dataset): + """ + Generate a dummy dataset + + Args: + size: tuple + num_samples: number of samples + + Example:: + + from pl_bolts.datasets import RandomDataset + + >>> ds = RandomDataset(10) + >>> dl = DataLoader(ds, batch_size=7) + """ + def __init__(self, size: int, num_samples: int = 250): + self.len = num_samples + self.data = torch.randn(num_samples, size) + + def __getitem__(self, index): + return self.data[index] + + def __len__(self): + return self.len diff --git a/pl_bolts/losses/rl.py b/pl_bolts/losses/rl.py new file mode 100644 index 
0000000000..a4a974f7c6 --- /dev/null +++ b/pl_bolts/losses/rl.py @@ -0,0 +1,118 @@ +""" +Loss functions for the RL models +""" + +from typing import Tuple, List + +import numpy as np +import torch +from torch import nn + + +def dqn_loss(batch: Tuple[torch.Tensor, torch.Tensor], net: nn.Module, + target_net: nn.Module, gamma: float = 0.99) -> torch.Tensor: + """ + Calculates the mse loss using a mini batch from the replay buffer + Args: + batch: current mini batch of replay data + net: main training network + target_net: target network of the main training network + gamma: discount factor + Returns: + loss + """ + states, actions, rewards, dones, next_states = batch + + actions = actions.long().squeeze(-1) + + state_action_values = ( + net(states).gather(1, actions.unsqueeze(-1)).squeeze(-1) + ) + + with torch.no_grad(): + next_state_values = target_net(next_states).max(1)[0] + next_state_values[dones] = 0.0 + next_state_values = next_state_values.detach() + + expected_state_action_values = next_state_values * gamma + rewards + + return nn.MSELoss()(state_action_values, expected_state_action_values) + + +def double_dqn_loss(batch: Tuple[torch.Tensor, torch.Tensor], net: nn.Module, + target_net: nn.Module, gamma: float = 0.99) -> torch.Tensor: + """ + Calculates the mse loss using a mini batch from the replay buffer. This uses an improvement to the original + DQN loss by using the double dqn. This is shown by using the actions of the train network to pick the + value from the target network. This code is heavily commented in order to explain the process clearly + Args: + batch: current mini batch of replay data + net: main training network + target_net: target network of the main training network + gamma: discount factor + Returns: + loss + """ + states, actions, rewards, dones, next_states = batch # batch of experiences, batch_size = 16 + + actions = actions.long().squeeze(-1) + + state_action_values = ( + net(states).gather(1, actions.unsqueeze(-1)).squeeze(-1) + ) + + # dont want to mess with gradients when using the target network + with torch.no_grad(): + next_outputs = net(next_states) # [16, 2], [batch, action_space] + + next_state_acts = next_outputs.max(1)[1].unsqueeze( + -1 + ) # take action at the index with the highest value + next_tgt_out = target_net(next_states) + + # Take the value of the action chosen by the train network + next_state_values = next_tgt_out.gather(1, next_state_acts).squeeze(-1) + next_state_values[dones] = 0.0 # any steps flagged as done get a 0 value + next_state_values = ( + next_state_values.detach() + ) # remove values from the graph, no grads needed + + # calc expected discounted return of next_state_values + expected_state_action_values = next_state_values * gamma + rewards + + # Standard MSE loss between the state action values of the current state and the + # expected state action values of the next state + return nn.MSELoss()(state_action_values, expected_state_action_values) + + +def per_dqn_loss(batch: Tuple[torch.Tensor, torch.Tensor], batch_weights: List, net: nn.Module, + target_net: nn.Module, gamma: float = 0.99) -> Tuple[torch.Tensor, np.ndarray]: + """ + Calculates the mse loss with the priority weights of the batch from the PER buffer + Args: + batch: current mini batch of replay data + batch_weights: how each of these samples are weighted in terms of priority + net: main training network + target_net: target network of the main training network + gamma: discount factor + Returns: + loss and batch_weights + """ + states, actions, rewards, 
dones, next_states = batch + + actions = actions.long() + + batch_weights = torch.tensor(batch_weights) + + actions_v = actions.unsqueeze(-1) + outputs = net(states) + state_action_vals = outputs.gather(1, actions_v) + state_action_vals = state_action_vals.squeeze(-1) + + with torch.no_grad(): + next_s_vals = target_net(next_states).max(1)[0] + next_s_vals[dones] = 0.0 + exp_sa_vals = next_s_vals.detach() * gamma + rewards + loss = (state_action_vals - exp_sa_vals) ** 2 + losses_v = batch_weights * loss + return losses_v.mean(), (losses_v + 1e-5).data.cpu().numpy() diff --git a/pl_bolts/models/autoencoders/basic_ae/basic_ae_module.py b/pl_bolts/models/autoencoders/basic_ae/basic_ae_module.py index d554769b7e..66c3a3a113 100644 --- a/pl_bolts/models/autoencoders/basic_ae/basic_ae_module.py +++ b/pl_bolts/models/autoencoders/basic_ae/basic_ae_module.py @@ -106,17 +106,15 @@ def step(self, batch, batch_idx): def training_step(self, batch, batch_idx): loss, logs = self.step(batch, batch_idx) - result = pl.TrainResult(minimize=loss) - result.log_dict( + self.log_dict( {f"train_{k}": v for k, v in logs.items()}, on_step=True, on_epoch=False ) - return result + return loss def validation_step(self, batch, batch_idx): loss, logs = self.step(batch, batch_idx) - result = pl.EvalResult(checkpoint_on=loss) - result.log_dict({f"val_{k}": v for k, v in logs.items()}) - return result + self.log_dict({f"val_{k}": v for k, v in logs.items()}) + return loss def configure_optimizers(self): return torch.optim.Adam(self.parameters(), lr=self.lr) diff --git a/pl_bolts/models/autoencoders/basic_vae/basic_vae_module.py b/pl_bolts/models/autoencoders/basic_vae/basic_vae_module.py index 40ba1be428..ab6671d000 100644 --- a/pl_bolts/models/autoencoders/basic_vae/basic_vae_module.py +++ b/pl_bolts/models/autoencoders/basic_vae/basic_vae_module.py @@ -139,17 +139,15 @@ def step(self, batch, batch_idx): def training_step(self, batch, batch_idx): loss, logs = self.step(batch, batch_idx) - result = pl.TrainResult(minimize=loss) - result.log_dict( + self.log_dict( {f"train_{k}": v for k, v in logs.items()}, on_step=True, on_epoch=False ) - return result + return loss def validation_step(self, batch, batch_idx): loss, logs = self.step(batch, batch_idx) - result = pl.EvalResult(checkpoint_on=loss) - result.log_dict({f"val_{k}": v for k, v in logs.items()}) - return result + self.log_dict({f"val_{k}": v for k, v in logs.items()}) + return loss def configure_optimizers(self): return torch.optim.Adam(self.parameters(), lr=self.lr) diff --git a/pl_bolts/models/gans/basic/basic_gan_module.py b/pl_bolts/models/gans/basic/basic_gan_module.py index 1459327daf..7311cb260b 100644 --- a/pl_bolts/models/gans/basic/basic_gan_module.py +++ b/pl_bolts/models/gans/basic/basic_gan_module.py @@ -136,18 +136,16 @@ def generator_step(self, x): # log to prog bar on each step AND for the full epoch # use the generator loss for checkpointing - result = pl.TrainResult(minimize=g_loss, checkpoint_on=g_loss) - result.log('g_loss', g_loss, on_epoch=True, prog_bar=True) - return result + self.log('g_loss', g_loss, on_epoch=True, prog_bar=True) + return g_loss def discriminator_step(self, x): # Measure discriminator's ability to classify real from generated samples d_loss = self.discriminator_loss(x) # log to prog bar on each step AND for the full epoch - result = pl.TrainResult(minimize=d_loss) - result.log('d_loss', d_loss, on_epoch=True, prog_bar=True) - return result + self.log('d_loss', d_loss, on_epoch=True, prog_bar=True) + return d_loss def 
configure_optimizers(self): lr = self.hparams.learning_rate diff --git a/pl_bolts/models/mnist_module.py b/pl_bolts/models/mnist_module.py index 365b481437..3dc71b22b1 100644 --- a/pl_bolts/models/mnist_module.py +++ b/pl_bolts/models/mnist_module.py @@ -36,43 +36,20 @@ def training_step(self, batch, batch_idx): x, y = batch y_hat = self(x) loss = F.cross_entropy(y_hat, y) - tensorboard_logs = {'train_loss': loss} - progress_bar_metrics = tensorboard_logs - return { - 'loss': loss, - 'log': tensorboard_logs, - 'progress_bar': progress_bar_metrics - } + self.log('train_loss', loss) + return loss def validation_step(self, batch, batch_idx): x, y = batch y_hat = self(x) - return {'val_loss': F.cross_entropy(y_hat, y)} - - def validation_epoch_end(self, outputs): - avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean() - tensorboard_logs = {'val_loss': avg_loss} - progress_bar_metrics = tensorboard_logs - return { - 'val_loss': avg_loss, - 'log': tensorboard_logs, - 'progress_bar': progress_bar_metrics - } + loss = F.cross_entropy(y_hat, y) + self.log('val_loss', loss) def test_step(self, batch, batch_idx): x, y = batch y_hat = self(x) - return {'test_loss': F.cross_entropy(y_hat, y)} - - def test_epoch_end(self, outputs): - avg_loss = torch.stack([x['test_loss'] for x in outputs]).mean() - tensorboard_logs = {'test_loss': avg_loss} - progress_bar_metrics = tensorboard_logs - return { - 'test_loss': avg_loss, - 'log': tensorboard_logs, - 'progress_bar': progress_bar_metrics - } + loss = F.cross_entropy(y_hat, y) + self.log('test_loss', loss) def configure_optimizers(self): return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate) diff --git a/pl_bolts/models/regression/logistic_regression.py b/pl_bolts/models/regression/logistic_regression.py index 650d12d4b0..b5df3f1c00 100644 --- a/pl_bolts/models/regression/logistic_regression.py +++ b/pl_bolts/models/regression/logistic_regression.py @@ -2,7 +2,7 @@ import pytorch_lightning as pl import torch -from pytorch_lightning.metrics.classification import accuracy +from pytorch_lightning.metrics.functional import accuracy from torch import nn from torch.nn import functional as F from torch.optim import Adam diff --git a/pl_bolts/models/rl/__init__.py b/pl_bolts/models/rl/__init__.py new file mode 100644 index 0000000000..cec3f871c8 --- /dev/null +++ b/pl_bolts/models/rl/__init__.py @@ -0,0 +1,10 @@ +try: + from pl_bolts.models.rl.double_dqn_model import DoubleDQN + from pl_bolts.models.rl.dqn_model import DQN + from pl_bolts.models.rl.dueling_dqn_model import DuelingDQN + from pl_bolts.models.rl.noisy_dqn_model import NoisyDQN + from pl_bolts.models.rl.per_dqn_model import PERDQN + from pl_bolts.models.rl.reinforce_model import Reinforce + from pl_bolts.models.rl.vanilla_policy_gradient_model import VanillaPolicyGradient +except ModuleNotFoundError: + pass diff --git a/pl_bolts/models/rl/common/__init__.py b/pl_bolts/models/rl/common/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/pl_bolts/models/rl/common/agents.py b/pl_bolts/models/rl/common/agents.py new file mode 100644 index 0000000000..92c5fbb8fa --- /dev/null +++ b/pl_bolts/models/rl/common/agents.py @@ -0,0 +1,131 @@ +""" +Agent module containing classes for Agent logic +Based on the implementations found here: https://github.com/Shmuma/ptan/blob/master/ptan/agent.py +""" +from abc import ABC +from typing import List + +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn + + +class Agent(ABC): + """Basic 
agent that always returns 0""" + + def __init__(self, net: nn.Module): + self.net = net + + def __call__(self, state: torch.Tensor, device: str, *args, **kwargs) -> List[int]: + """ + Using the given network, decide what action to carry + Args: + state: current state of the environment + device: device used for current batch + Returns: + action + """ + return [0] + + +class ValueAgent(Agent): + """Value based agent that returns an action based on the Q values from the network""" + + def __init__( + self, + net: nn.Module, + action_space: int, + eps_start: float = 1.0, + eps_end: float = 0.2, + eps_frames: float = 1000, + ): + super().__init__(net) + self.action_space = action_space + self.eps_start = eps_start + self.epsilon = eps_start + self.eps_end = eps_end + self.eps_frames = eps_frames + + @torch.no_grad() + def __call__(self, state: torch.Tensor, device: str) -> List[int]: + """ + Takes in the current state and returns the action based on the agents policy + Args: + state: current state of the environment + device: the device used for the current batch + Returns: + action defined by policy + """ + if not isinstance(state, list): + state = [state] + + if np.random.random() < self.epsilon: + action = self.get_random_action(state) + else: + action = self.get_action(state, device) + + return action + + def get_random_action(self, state: torch.Tensor) -> int: + """returns a random action""" + actions = [] + + for i in range(len(state)): + action = np.random.randint(0, self.action_space) + actions.append(action) + + return actions + + def get_action(self, state: torch.Tensor, device: torch.device): + """ + Returns the best action based on the Q values of the network + Args: + state: current state of the environment + device: the device used for the current batch + Returns: + action defined by Q values + """ + if not isinstance(state, torch.Tensor): + state = torch.tensor(state, device=device) + + q_values = self.net(state) + _, actions = torch.max(q_values, dim=1) + return actions.detach().cpu().numpy() + + def update_epsilon(self, step: int) -> None: + """ + Updates the epsilon value based on the current step + Args: + step: current global step + """ + self.epsilon = max(self.eps_end, self.eps_start - (step + 1) / self.eps_frames) + + +class PolicyAgent(Agent): + """Policy based agent that returns an action based on the networks policy""" + + @torch.no_grad() + def __call__(self, states: torch.Tensor, device: str) -> List[int]: + """ + Takes in the current state and returns the action based on the agents policy + Args: + states: current state of the environment + device: the device used for the current batch + Returns: + action defined by policy + """ + if not isinstance(states, list): + states = [states] + + if not isinstance(states, torch.Tensor): + states = torch.tensor(states, device=device) + + # get the logits and pass through softmax for probability distribution + probabilities = F.softmax(self.net(states)).squeeze(dim=-1) + prob_np = probabilities.data.cpu().numpy() + + # take the numpy values and randomly select action based on prob distribution + actions = [np.random.choice(len(prob), p=prob) for prob in prob_np] + + return actions diff --git a/pl_bolts/models/rl/common/cli.py b/pl_bolts/models/rl/common/cli.py new file mode 100644 index 0000000000..a663c8acd8 --- /dev/null +++ b/pl_bolts/models/rl/common/cli.py @@ -0,0 +1,34 @@ +"""Contains generic arguments used for all models""" + +import argparse + + +def add_base_args(parent) -> argparse.ArgumentParser: + """ + Adds 
the base arguments shared by the RL models + + Note: these params are fine-tuned for Pong env + + Args: + parent: parent argument parser + """ + arg_parser = argparse.ArgumentParser(parents=[parent]) + + arg_parser.add_argument("--algo", type=str, default="dqn", help="algorithm to use for training") + arg_parser.add_argument("--batch_size", type=int, default=32, help="size of the batches") + arg_parser.add_argument("--lr", type=float, default=1e-4, help="learning rate") + + arg_parser.add_argument("--env", type=str, required=True, help="gym environment tag") + arg_parser.add_argument("--gamma", type=float, default=0.99, help="discount factor") + + arg_parser.add_argument("--episode_length", type=int, default=500, help="max length of an episode") + arg_parser.add_argument("--max_episode_reward", type=int, default=18, help="max episode reward in the environment") + arg_parser.add_argument("--n_steps", type=int, default=4, help="how many steps to unroll for each update",) + arg_parser.add_argument("--seed", type=int, default=123, help="seed for training run") + arg_parser.add_argument("--epoch_len", type=int, default=1000, help="how many batches per epoch") + arg_parser.add_argument("--num_envs", type=int, default=1, help="number of environments to run at once") + arg_parser.add_argument("--avg_reward_len", type=int, default=100, + help="how many episodes to include in avg reward") + + return arg_parser diff --git a/pl_bolts/models/rl/common/gym_wrappers.py b/pl_bolts/models/rl/common/gym_wrappers.py new file mode 100644 index 0000000000..8f492a27c1 --- /dev/null +++ b/pl_bolts/models/rl/common/gym_wrappers.py @@ -0,0 +1,207 @@ +""" +Set of wrapper functions for gym environments taken from +https://github.com/Shmuma/ptan/blob/master/ptan/common/wrappers.py +""" +import collections +from warnings import warn + +import gym +import gym.spaces +import numpy as np +import torch +try: + import cv2 +except ModuleNotFoundError: + warn('You want to use `openCV` which is not installed yet,' # pragma: no-cover + ' install it with `pip install opencv-python`.') + _OPENCV_AVAILABLE = False +else: + _OPENCV_AVAILABLE = True + + +class ToTensor(gym.Wrapper): + """Converts environment observations and rewards to torch tensors.""" + + def __init__(self, env=None): + super(ToTensor, self).__init__(env) + + def step(self, action): + """Take 1 step and cast to tensor""" + state, reward, done, info = self.env.step(action) + return torch.tensor(state), torch.tensor(reward), done, info + + def reset(self): + """reset the env and cast to tensor""" + return torch.tensor(self.env.reset()) + + +class FireResetEnv(gym.Wrapper): + """For environments where the user needs to press FIRE for the game to start.""" + + def __init__(self, env=None): + super(FireResetEnv, self).__init__(env) + assert env.unwrapped.get_action_meanings()[1] == "FIRE" + assert len(env.unwrapped.get_action_meanings()) >= 3 + + def step(self, action): + """Take 1 step""" + return self.env.step(action) + + def reset(self): + """reset the env""" + self.env.reset() + obs, _, done, _ = self.env.step(1) + if done: + self.env.reset() + obs, _, done, _ = self.env.step(2) + if done: + self.env.reset() + return obs + + +class MaxAndSkipEnv(gym.Wrapper): + """Return only every `skip`-th frame""" + + def __init__(self, env=None, skip=4): + super(MaxAndSkipEnv, self).__init__(env) + # most recent raw observations (for max pooling across time steps) + self._obs_buffer = collections.deque(maxlen=2) +
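# A minimal illustration (not part of the diff above, dummy frame values only): the deque
# is capped at two entries so that MaxAndSkipEnv.step() below can take a pixel-wise maximum
# over the two most recent raw frames, which removes the sprite flicker some Atari games show.
import numpy as np

frame_a = np.array([[0, 255], [0, 0]], dtype=np.uint8)    # sprite visible only in frame A
frame_b = np.array([[0, 0], [255, 0]], dtype=np.uint8)    # sprite visible only in frame B
max_frame = np.max(np.stack([frame_a, frame_b]), axis=0)  # both sprites survive the merge
# max_frame == [[0, 255], [255, 0]], matching the np.max(np.stack(...), axis=0) call in step()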
self._skip = skip + + def step(self, action): + """take 1 step""" + total_reward = 0.0 + done = None + for _ in range(self._skip): + obs, reward, done, info = self.env.step(action) + self._obs_buffer.append(obs) + total_reward += reward + if done: + break + max_frame = np.max(np.stack(self._obs_buffer), axis=0) + return max_frame, total_reward, done, info + + def reset(self): + """Clear past frame buffer and init. to first obs. from inner env.""" + self._obs_buffer.clear() + obs = self.env.reset() + self._obs_buffer.append(obs) + return obs + + +class ProcessFrame84(gym.ObservationWrapper): + """preprocessing images from env""" + + def __init__(self, env=None): + + if not _OPENCV_AVAILABLE: + raise ModuleNotFoundError('This class uses OpenCV which it is not installed yet.') + + super(ProcessFrame84, self).__init__(env) + self.observation_space = gym.spaces.Box( + low=0, high=255, shape=(84, 84, 1), dtype=np.uint8 + ) + + def observation(self, obs): + """preprocess the obs""" + return ProcessFrame84.process(obs) + + @staticmethod + def process(frame): + """image preprocessing, formats to 84x84""" + if frame.size == 210 * 160 * 3: + img = np.reshape(frame, [210, 160, 3]).astype(np.float32) + elif frame.size == 250 * 160 * 3: + img = np.reshape(frame, [250, 160, 3]).astype(np.float32) + else: + assert False, "Unknown resolution." + img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114 + resized_screen = cv2.resize(img, (84, 110), interpolation=cv2.INTER_AREA) + x_t = resized_screen[18:102, :] + x_t = np.reshape(x_t, [84, 84, 1]) + return x_t.astype(np.uint8) + + +class ImageToPyTorch(gym.ObservationWrapper): + """converts image to pytorch format""" + + def __init__(self, env): + + if not _OPENCV_AVAILABLE: + raise ModuleNotFoundError('This class uses OpenCV which it is not installed yet.') + + super(ImageToPyTorch, self).__init__(env) + old_shape = self.observation_space.shape + new_shape = (old_shape[-1], old_shape[0], old_shape[1]) + self.observation_space = gym.spaces.Box( + low=0.0, high=1.0, shape=new_shape, dtype=np.float32 + ) + + @staticmethod + def observation(observation): + """convert observation""" + return np.moveaxis(observation, 2, 0) + + +class ScaledFloatFrame(gym.ObservationWrapper): + """scales the pixels""" + + @staticmethod + def observation(obs): + return np.array(obs).astype(np.float32) / 255.0 + + +class BufferWrapper(gym.ObservationWrapper): + """"Wrapper for image stacking""" + + def __init__(self, env, n_steps, dtype=np.float32): + super(BufferWrapper, self).__init__(env) + self.dtype = dtype + self.buffer = None + old_space = env.observation_space + self.observation_space = gym.spaces.Box( + old_space.low.repeat(n_steps, axis=0), + old_space.high.repeat(n_steps, axis=0), + dtype=dtype, + ) + + def reset(self): + """reset env""" + self.buffer = np.zeros_like(self.observation_space.low, dtype=self.dtype) + return self.observation(self.env.reset()) + + def observation(self, observation): + """convert observation""" + self.buffer[:-1] = self.buffer[1:] + self.buffer[-1] = observation + return self.buffer + + +class DataAugmentation(gym.ObservationWrapper): + """ + Carries out basic data augmentation on the env observations + - ToTensor + - GrayScale + - RandomCrop + """ + + def __init__(self, env=None): + super().__init__(env) + self.observation_space = gym.spaces.Box( + low=0, high=255, shape=(84, 84, 1), dtype=np.uint8 + ) + + def observation(self, obs): + """preprocess the obs""" + return ProcessFrame84.process(obs) + + +def 
make_environment(env_name): + """Convert environment with wrappers""" + env = gym.make(env_name) + env = MaxAndSkipEnv(env) + env = FireResetEnv(env) + env = ProcessFrame84(env) + env = ImageToPyTorch(env) + env = BufferWrapper(env, 4) + return ScaledFloatFrame(env) diff --git a/pl_bolts/models/rl/common/memory.py b/pl_bolts/models/rl/common/memory.py new file mode 100644 index 0000000000..0cd058ee43 --- /dev/null +++ b/pl_bolts/models/rl/common/memory.py @@ -0,0 +1,313 @@ +"""Series of memory buffers sued""" + +# Named tuple for storing experience steps gathered in training +import collections +from collections import deque, namedtuple +from typing import Tuple, List, Union + +import numpy as np + +Experience = namedtuple( + "Experience", field_names=["state", "action", "reward", "done", "new_state"] +) + + +class Buffer: + """ + Basic Buffer for storing a single experience at a time + Args: + capacity: size of the buffer + """ + + def __init__(self, capacity: int) -> None: + self.buffer = deque(maxlen=capacity) + + def __len__(self) -> None: + return len(self.buffer) + + def append(self, experience: Experience) -> None: + """ + Add experience to the buffer + Args: + experience: tuple (state, action, reward, done, new_state) + """ + self.buffer.append(experience) + + # pylint: disable=unused-argument + def sample(self, *args) -> Union[Tuple, List[Tuple]]: + """ + returns everything in the buffer so far it is then reset + Returns: + a batch of tuple np arrays of state, action, reward, done, next_state + """ + states, actions, rewards, dones, next_states = zip( + *[self.buffer[idx] for idx in range(self.__len__())] + ) + + self.buffer.clear() + + return ( + np.array(states), + np.array(actions), + np.array(rewards, dtype=np.float32), + np.array(dones, dtype=np.bool), + np.array(next_states), + ) + + +class ReplayBuffer(Buffer): + """ + Replay Buffer for storing past experiences allowing the agent to learn from them + """ + + def sample(self, batch_size: int) -> Tuple: + """ + Takes a sample of the buffer + Args: + batch_size: current batch_size + Returns: + a batch of tuple np arrays of state, action, reward, done, next_state + """ + + indices = np.random.choice(len(self.buffer), batch_size, replace=False) + states, actions, rewards, dones, next_states = zip( + *[self.buffer[idx] for idx in indices] + ) + + return ( + np.array(states), + np.array(actions), + np.array(rewards, dtype=np.float32), + np.array(dones, dtype=np.bool), + np.array(next_states), + ) + + +class MultiStepBuffer(ReplayBuffer): + """ + N Step Replay Buffer + + Args: + capacity: max number of experiences that will be stored in the buffer + n_steps: number of steps used for calculating discounted reward/experience + gamma: discount factor when calculating n_step discounted reward of the experience being stored in buffer + """ + + def __init__(self, capacity: int, n_steps: int = 1, gamma: float = 0.99) -> None: + super().__init__(capacity) + + self.n_steps = n_steps + self.gamma = gamma + self.history = deque(maxlen=self.n_steps) + self.exp_history_queue = deque() + + def append(self, exp: Experience) -> None: + """ + Add experience to the buffer + Args: + exp: tuple (state, action, reward, done, new_state) + """ + self.update_history_queue(exp) # add single step experience to history + while self.exp_history_queue: # go through all the n_steps that have been queued + experiences = self.exp_history_queue.popleft() # get the latest n_step experience from queue + + last_exp_state, tail_experiences = 
self.split_head_tail_exp(experiences) + + total_reward = self.discount_rewards(tail_experiences) + + n_step_exp = Experience(state=experiences[0].state, action=experiences[0].action, reward=total_reward, + done=experiences[0].done, new_state=last_exp_state) + + self.buffer.append(n_step_exp) # add n_step experience to buffer + + def update_history_queue(self, exp) -> None: + """ + Updates the experience history queue with the lastest experiences. In the event of an experience step is in + the done state, the history will be incrementally appended to the queue, removing the tail of the history + each time. + Args: + env_idx: index of the environment + exp: the current experience + history: history of experience steps for this environment + """ + self.history.append(exp) + + # If there is a full history of step, append history to queue + if len(self.history) == self.n_steps: + self.exp_history_queue.append(list(self.history)) + + if exp.done: + if 0 < len(self.history) < self.n_steps: + self.exp_history_queue.append(list(self.history)) + + # generate tail of history, incrementally append history to queue + while len(self.history) > 2: + self.history.popleft() + self.exp_history_queue.append(list(self.history)) + + # when there are only 2 experiences left in the history, + # append to the queue then update the env stats and reset the environment + if len(self.history) > 1: + self.history.popleft() + self.exp_history_queue.append(list(self.history)) + + # Clear that last tail in the history once all others have been added to the queue + self.history.clear() + + def split_head_tail_exp(self, experiences: Tuple[Experience]) -> Tuple[List, Tuple[Experience]]: + """ + Takes in a tuple of experiences and returns the last state and tail experiences based on + if the last state is the end of an episode + Args: + experiences: Tuple of N Experience + Returns: + last state (Array or None) and remaining Experience + """ + last_exp_state = experiences[-1].new_state + tail_experiences = experiences + + if experiences[-1].done and len(experiences) <= self.n_steps: + tail_experiences = experiences + + return last_exp_state, tail_experiences + + def discount_rewards(self, experiences: Tuple[Experience]) -> float: + """ + Calculates the discounted reward over N experiences + Args: + experiences: Tuple of Experience + Returns: + total discounted reward + """ + total_reward = 0.0 + for exp in reversed(experiences): + total_reward = (self.gamma * total_reward) + exp.reward + return total_reward + + +class MeanBuffer: + """ + Stores a deque of items and calculates the mean + """ + + def __init__(self, capacity): + self.capacity = capacity + self.deque = collections.deque(maxlen=capacity) + self.sum = 0.0 + + def add(self, val: float) -> None: + """Add to the buffer""" + if len(self.deque) == self.capacity: + self.sum -= self.deque[0] + self.deque.append(val) + self.sum += val + + def mean(self) -> float: + """Retrieve the mean""" + if not self.deque: + return 0.0 + return self.sum / len(self.deque) + + +class PERBuffer(ReplayBuffer): + """ + simple list based Prioritized Experience Replay Buffer + Based on implementation found here: + https://github.com/Shmuma/ptan/blob/master/ptan/experience.py#L371 + """ + + def __init__(self, buffer_size, prob_alpha=0.6, beta_start=0.4, beta_frames=100000): + super().__init__(capacity=buffer_size) + self.beta_start = beta_start + self.beta = beta_start + self.beta_frames = beta_frames + self.prob_alpha = prob_alpha + self.capacity = buffer_size + self.pos = 0 + self.buffer = [] 
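# A small worked example (illustration only, not part of the diff): the discount_rewards()
# helpers above fold the n-step return from the newest experience backwards, i.e. for
# per-step rewards [r0, r1, r2] they return r0 + gamma * r1 + gamma**2 * r2.
gamma = 0.99
step_rewards = [1.0, 1.0, 1.0]
n_step_return = 0.0
for r in reversed(step_rewards):
    n_step_return = gamma * n_step_return + r
# n_step_return == 1.0 + 0.99 + 0.99 ** 2 == 2.9701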
+ self.priorities = np.zeros((buffer_size,), dtype=np.float32) + + def update_beta(self, step) -> float: + """ + Update the beta value which accounts for the bias in the PER + Args: + step: current global step + Returns: + beta value for this indexed experience + """ + beta_val = self.beta_start + step * (1.0 - self.beta_start) / self.beta_frames + self.beta = min(1.0, beta_val) + + return self.beta + + def append(self, exp) -> None: + """ + Adds experiences from exp_source to the PER buffer + Args: + exp: experience tuple being added to the buffer + """ + # what is the max priority for new sample + max_prio = self.priorities.max() if self.buffer else 1.0 + + if len(self.buffer) < self.capacity: + self.buffer.append(exp) + else: + self.buffer[self.pos] = exp + + # the priority for the latest sample is set to max priority so it will be resampled soon + self.priorities[self.pos] = max_prio + + # update position, loop back if it reaches the end + self.pos = (self.pos + 1) % self.capacity + + def sample(self, batch_size=32) -> Tuple: + """ + Takes a prioritized sample from the buffer + Args: + batch_size: size of sample + Returns: + sample of experiences chosen with ranked probability + """ + # get list of priority rankings + if len(self.buffer) == self.capacity: + prios = self.priorities + else: + prios = self.priorities[: self.pos] + + # probability to the power of alpha to weight how important that probability it, 0 = normal distirbution + probs = prios ** self.prob_alpha + probs /= probs.sum() + + # choise sample of indices based on the priority prob distribution + indices = np.random.choice(len(self.buffer), batch_size, p=probs) + # samples = [self.buffer[idx] for idx in indices] + states, actions, rewards, dones, next_states = zip( + *[self.buffer[idx] for idx in indices] + ) + + samples = ( + np.array(states), + np.array(actions), + np.array(rewards, dtype=np.float32), + np.array(dones, dtype=np.bool), + np.array(next_states), + ) + total = len(self.buffer) + + # weight of each sample datum to compensate for the bias added in with prioritising samples + weights = (total * probs[indices]) ** (-self.beta) + weights /= weights.max() + + # return the samples, the indices chosen and the weight of each datum in the sample + return samples, indices, np.array(weights, dtype=np.float32) + + def update_priorities(self, batch_indices: List, batch_priorities: List) -> None: + """ + Update the priorities from the last batch, this should be called after the loss for this batch has been + calculated. 
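# A rough usage sketch (assumptions: `buffer` is an already populated PERBuffer and `net`,
# `target_net` are Q-networks such as the CNNs defined later in this diff; in the actual
# Lightning module the DataLoader performs the numpy-to-tensor collation shown here).
# It illustrates the intended round trip between sample(), per_dqn_loss() and update_priorities():
import torch
from pl_bolts.losses.rl import per_dqn_loss

samples, indices, weights = buffer.sample(batch_size=32)
batch = tuple(torch.tensor(x) for x in samples)    # states, actions, rewards, dones, next_states
loss, new_priorities = per_dqn_loss(batch, weights, net, target_net, gamma=0.99)
loss.backward()                                    # optimiser step omitted for brevity
buffer.update_priorities(indices, new_priorities)  # feed the per-sample losses back as priorities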
+ Args: + batch_indices: index of each datum in the batch + batch_priorities: priority of each datum in the batch + """ + for idx, prio in zip(batch_indices, batch_priorities): + self.priorities[idx] = prio diff --git a/pl_bolts/models/rl/common/networks.py b/pl_bolts/models/rl/common/networks.py new file mode 100644 index 0000000000..4776424d39 --- /dev/null +++ b/pl_bolts/models/rl/common/networks.py @@ -0,0 +1,317 @@ +"""Series of networks used +Based on implementations found here: +""" +import math +from typing import Tuple + +import numpy as np +import torch +from torch import Tensor +from torch import nn +from torch.nn import functional as F + + +class CNN(nn.Module): + """ + Simple MLP network + Args: + input_shape: observation shape of the environment + n_actions: number of discrete actions available in the environment + """ + + def __init__(self, input_shape, n_actions): + super(CNN, self).__init__() + + self.conv = nn.Sequential( + nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), + nn.ReLU(), + nn.Conv2d(32, 64, kernel_size=4, stride=2), + nn.ReLU(), + nn.Conv2d(64, 64, kernel_size=3, stride=1), + nn.ReLU(), + ) + + conv_out_size = self._get_conv_out(input_shape) + self.head = nn.Sequential( + nn.Linear(conv_out_size, 512), nn.ReLU(), nn.Linear(512, n_actions) + ) + + def _get_conv_out(self, shape) -> int: + """ + Calculates the output size of the last conv layer + Args: + shape: input dimensions + Returns: + size of the conv output + """ + conv_out = self.conv(torch.zeros(1, *shape)) + return int(np.prod(conv_out.size())) + + def forward(self, input_x) -> Tensor: + """ + Forward pass through network + Args: + x: input to network + Returns: + output of network + """ + conv_out = self.conv(input_x).view(input_x.size()[0], -1) + return self.head(conv_out) + + +class MLP(nn.Module): + """ + Simple MLP network + Args: + input_shape: observation shape of the environment + n_actions: number of discrete actions available in the environment + hidden_size: size of hidden layers + """ + + def __init__(self, input_shape: Tuple, n_actions: int, hidden_size: int = 128): + super(MLP, self).__init__() + self.net = nn.Sequential( + nn.Linear(input_shape[0], hidden_size), + nn.ReLU(), + nn.Linear(hidden_size, n_actions), + ) + + def forward(self, input_x): + """ + Forward pass through network + Args: + x: input to network + Returns: + output of network + """ + return self.net(input_x.float()) + + +class DuelingMLP(nn.Module): + """ + MLP network with duel heads for val and advantage + Args: + input_shape: observation shape of the environment + n_actions: number of discrete actions available in the environment + hidden_size: size of hidden layers + """ + + def __init__(self, input_shape: Tuple, n_actions: int, hidden_size: int = 128): + super(DuelingMLP, self).__init__() + + self.net = nn.Sequential( + nn.Linear(input_shape[0], hidden_size), + nn.ReLU(), + nn.Linear(hidden_size, hidden_size), + ) + + self.head_adv = nn.Sequential( + nn.Linear(hidden_size, hidden_size), + nn.ReLU(), + nn.Linear(hidden_size, n_actions), + ) + self.head_val = nn.Sequential( + nn.Linear(hidden_size, 256), nn.ReLU(), nn.Linear(256, 1) + ) + + def forward(self, input_x): + """ + Forward pass through network. 
Calculates the Q using the value and advantage + Args: + x: input to network + Returns: + Q value + """ + adv, val = self.adv_val(input_x) + q_val = val + (adv - adv.mean(dim=1, keepdim=True)) + return q_val + + def adv_val(self, input_x) -> Tuple[Tensor, Tensor]: + """ + Gets the advantage and value by passing out of the base network through the + value and advantage heads + Args: + input_x: input to network + Returns: + advantage, value + """ + float_x = input_x.float() + base_out = self.net(float_x) + return self.fc_adv(base_out), self.fc_val(base_out) + + +class DuelingCNN(nn.Module): + """ + CNN network with duel heads for val and advantage + Args: + input_shape: observation shape of the environment + n_actions: number of discrete actions available in the environment + hidden_size: size of hidden layers + """ + + def __init__(self, input_shape: Tuple, n_actions: int, _: int = 128): + + super().__init__() + + self.conv = nn.Sequential( + nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), + nn.ReLU(), + nn.Conv2d(32, 64, kernel_size=4, stride=2), + nn.ReLU(), + nn.Conv2d(64, 64, kernel_size=3, stride=1), + nn.ReLU(), + ) + + conv_out_size = self._get_conv_out(input_shape) + + # advantage head + self.head_adv = nn.Sequential( + nn.Linear(conv_out_size, 256), nn.ReLU(), nn.Linear(256, n_actions) + ) + + # value head + self.head_val = nn.Sequential( + nn.Linear(conv_out_size, 256), nn.ReLU(), nn.Linear(256, 1) + ) + + def _get_conv_out(self, shape) -> int: + """ + Calculates the output size of the last conv layer + Args: + shape: input dimensions + Returns: + size of the conv output + """ + conv_out = self.conv(torch.zeros(1, *shape)) + return int(np.prod(conv_out.size())) + + def forward(self, input_x): + """ + Forward pass through network. Calculates the Q using the value and advantage + Args: + input_x: input to network + Returns: + Q value + """ + adv, val = self.adv_val(input_x) + q_val = val + (adv - adv.mean(dim=1, keepdim=True)) + return q_val + + def adv_val(self, input_x): + """ + Gets the advantage and value by passing out of the base network through the + value and advantage heads + Args: + input_x: input to network + Returns: + advantage, value + """ + float_x = input_x.float() + base_out = self.conv(input_x).view(float_x.size()[0], -1) + return self.head_adv(base_out), self.head_val(base_out) + + +class NoisyCNN(nn.Module): + """ + CNN with Noisy Linear layers for exploration + Args: + input_shape: observation shape of the environment + n_actions: number of discrete actions available in the environment + """ + + def __init__(self, input_shape, n_actions): + super().__init__() + + self.conv = nn.Sequential( + nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), + nn.ReLU(), + nn.Conv2d(32, 64, kernel_size=4, stride=2), + nn.ReLU(), + nn.Conv2d(64, 64, kernel_size=3, stride=1), + nn.ReLU(), + ) + + conv_out_size = self._get_conv_out(input_shape) + self.head = nn.Sequential( + NoisyLinear(conv_out_size, 512), nn.ReLU(), NoisyLinear(512, n_actions) + ) + + def _get_conv_out(self, shape) -> int: + """ + Calculates the output size of the last conv layer + Args: + shape: input dimensions + Returns: + size of the conv output + """ + conv_out = self.conv(torch.zeros(1, *shape)) + return int(np.prod(conv_out.size())) + + def forward(self, input_x) -> Tensor: + """ + Forward pass through network + Args: + x: input to network + Returns: + output of network + """ + conv_out = self.conv(input_x).view(input_x.size()[0], -1) + return self.head(conv_out) + + +################### +# 
Custom Layers # +################### + + +class NoisyLinear(nn.Linear): + """ + Noisy Layer using Independent Gaussian Noise. + based on https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Second-Edition/blob/master/ + Chapter08/lib/dqn_extra.py#L19 + Args: + in_features: number of inputs + out_features: number of outputs + sigma_init: initial fill value of noisy weights + bias: flag to include bias to linear layer + """ + + def __init__(self, in_features, out_features, sigma_init=0.017, bias=True): + super(NoisyLinear, self).__init__(in_features, out_features, bias=bias) + + weights = torch.full((out_features, in_features), sigma_init) + self.sigma_weight = nn.Parameter(weights) + epsilon_weight = torch.zeros(out_features, in_features) + self.register_buffer("epsilon_weight", epsilon_weight) + + if bias: + bias = torch.full((out_features,), sigma_init) + self.sigma_bias = nn.Parameter(bias) + epsilon_bias = torch.zeros(out_features) + self.register_buffer("epsilon_bias", epsilon_bias) + + self.reset_parameters() + + def reset_parameters(self) -> None: + """initializes or resets the paramseter of the layer""" + std = math.sqrt(3 / self.in_features) + self.weight.data.uniform_(-std, std) + self.bias.data.uniform_(-std, std) + + def forward(self, input_x: Tensor) -> Tensor: + """ + Forward pass of the layer + Args: + input_x: input tensor + Returns: + output of the layer + """ + self.epsilon_weight.normal_() + bias = self.bias + if bias is not None: + self.epsilon_bias.normal_() + bias = bias + self.sigma_bias * self.epsilon_bias.data + + noisy_weights = self.sigma_weight * self.epsilon_weight.data + self.weight + + return F.linear(input_x, noisy_weights, bias) diff --git a/pl_bolts/models/rl/double_dqn_model.py b/pl_bolts/models/rl/double_dqn_model.py new file mode 100644 index 0000000000..f31ae16c6d --- /dev/null +++ b/pl_bolts/models/rl/double_dqn_model.py @@ -0,0 +1,123 @@ +""" +Double DQN +""" +import argparse +from collections import OrderedDict +from typing import Tuple + +import pytorch_lightning as pl +import torch + +from pl_bolts.losses.rl import double_dqn_loss +from pl_bolts.models.rl.dqn_model import DQN + + +class DoubleDQN(DQN): + """ + Double Deep Q-network (DDQN) + PyTorch Lightning implementation of `Double DQN `_ + + Paper authors: Hado van Hasselt, Arthur Guez, David Silver + + Model implemented by: + + - `Donal Byrne ` + + Example: + + >>> from pl_bolts.models.rl.double_dqn_model import DoubleDQN + ... + >>> model = DoubleDQN("PongNoFrameskip-v4") + + Train:: + + trainer = Trainer() + trainer.fit(model) + + Args: + env: gym environment tag + gpus: number of gpus being used + eps_start: starting value of epsilon for the epsilon-greedy exploration + eps_end: final value of epsilon for the epsilon-greedy exploration + eps_last_frame: the final frame in for the decrease of epsilon. 
At this frame espilon = eps_end + sync_rate: the number of iterations between syncing up the target network with the train network + gamma: discount factor + lr: learning rate + batch_size: size of minibatch pulled from the DataLoader + replay_size: total capacity of the replay buffer + warm_start_size: how many random steps through the environment to be carried out at the start of + training to fill the buffer with a starting point + sample_len: the number of samples to pull from the dataset iterator and feed to the DataLoader + + Note: + This example is based on + https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Second-Edition/blob/master/Chapter08/03_dqn_double.py + + Note: + Currently only supports CPU and single GPU training with `distributed_backend=dp` + """ + + def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], _) -> OrderedDict: + """ + Carries out a single step through the environment to update the replay buffer. + Then calculates loss based on the minibatch recieved + Args: + batch: current mini batch of replay data + _: batch number, not used + Returns: + Training loss and log metrics + """ + + # calculates training loss + loss = double_dqn_loss(batch, self.net, self.target_net) + + if self.trainer.use_dp or self.trainer.use_ddp2: + loss = loss.unsqueeze(0) + + # Soft update of target network + if self.global_step % self.sync_rate == 0: + self.target_net.load_state_dict(self.net.state_dict()) + + log = { + "total_reward": self.total_rewards[-1], + "avg_reward": self.avg_rewards, + "train_loss": loss, + # "episodes": self.total_episode_steps, + } + status = { + "steps": self.global_step, + "avg_reward": self.avg_rewards, + "total_reward": self.total_rewards[-1], + "episodes": self.done_episodes, + # "episode_steps": self.episode_steps, + "epsilon": self.agent.epsilon, + } + + return OrderedDict( + { + "loss": loss, + "avg_reward": self.avg_rewards, + "log": log, + "progress_bar": status, + } + ) + + +def cli_main(): + parser = argparse.ArgumentParser(add_help=False) + + # trainer args + parser = pl.Trainer.add_argparse_args(parser) + + # model args + parser = DoubleDQN.add_model_specific_args(parser) + args = parser.parse_args() + + model = DoubleDQN(**args.__dict__) + + trainer = pl.Trainer.from_argparse_args(args) + trainer.fit(model) + + +if __name__ == '__main__': + cli_main() diff --git a/pl_bolts/models/rl/dqn_model.py b/pl_bolts/models/rl/dqn_model.py new file mode 100644 index 0000000000..01b4a68277 --- /dev/null +++ b/pl_bolts/models/rl/dqn_model.py @@ -0,0 +1,443 @@ +""" +Deep Q Network +""" + +import argparse +from collections import OrderedDict +from typing import Tuple, List, Dict +from warnings import warn + +import numpy as np +import pytorch_lightning as pl +import torch +import torch.optim as optim +from pytorch_lightning import seed_everything +from pytorch_lightning.callbacks import ModelCheckpoint +from torch.optim.optimizer import Optimizer +from torch.utils.data import DataLoader + +from pl_bolts.datamodules.experience_source import ExperienceSourceDataset, Experience +from pl_bolts.losses.rl import dqn_loss +from pl_bolts.models.rl.common.agents import ValueAgent +from pl_bolts.models.rl.common.memory import MultiStepBuffer +from pl_bolts.models.rl.common.networks import CNN +try: + from pl_bolts.models.rl.common.gym_wrappers import gym, make_environment +except ModuleNotFoundError: + warn('You want to use `gym` which is not installed yet,' # pragma: no-cover + ' install it with `pip install gym`.') + _GYM_AVAILABLE = 
False +else: + _GYM_AVAILABLE = True + + +class DQN(pl.LightningModule): + """ Basic DQN Model """ + + def __init__( + self, + env: str, + eps_start: float = 1.0, + eps_end: float = 0.02, + eps_last_frame: int = 150000, + sync_rate: int = 1000, + gamma: float = 0.99, + learning_rate: float = 1e-4, + batch_size: int = 32, + replay_size: int = 100000, + warm_start_size: int = 10000, + avg_reward_len: int = 100, + min_episode_reward: int = -21, + seed: int = 123, + batches_per_epoch: int = 1000, + n_steps: int = 1, + **kwargs, + ): + """ + PyTorch Lightning implementation of `DQN `_ + Paper authors: Volodymyr Mnih, Koray Kavukcuoglu, David Silver, Alex Graves, + Ioannis Antonoglou, Daan Wierstra, Martin Riedmiller. + Model implemented by: + + - `Donal Byrne ` + + Example: + >>> from pl_bolts.models.rl.dqn_model import DQN + ... + >>> model = DQN("PongNoFrameskip-v4") + + Train:: + + trainer = Trainer() + trainer.fit(model) + + Args: + env: gym environment tag + eps_start: starting value of epsilon for the epsilon-greedy exploration + eps_end: final value of epsilon for the epsilon-greedy exploration + eps_last_frame: the final frame in for the decrease of epsilon. At this frame espilon = eps_end + sync_rate: the number of iterations between syncing up the target network with the train network + gamma: discount factor + learning_rate: learning rate + batch_size: size of minibatch pulled from the DataLoader + replay_size: total capacity of the replay buffer + warm_start_size: how many random steps through the environment to be carried out at the start of + training to fill the buffer with a starting point + avg_reward_len: how many episodes to take into account when calculating the avg reward + min_episode_reward: the minimum score that can be achieved in an episode. 
Used for filling the avg buffer + before training begins + seed: seed value for all RNG used + batches_per_epoch: number of batches per epoch + n_steps: size of n step look ahead + + Note: + This example is based on: + https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Second-Edition/blob/master/Chapter06/02_dqn_pong.py + + Note: + Currently only supports CPU and single GPU training with `distributed_backend=dp` + """ + super().__init__() + + # Environment + self.exp = None + self.env = self.make_environment(env, seed) + self.test_env = self.make_environment(env) + + self.obs_shape = self.env.observation_space.shape + self.n_actions = self.env.action_space.n + + # Model Attributes + self.buffer = None + self.dataset = None + + self.net = None + self.target_net = None + self.build_networks() + + self.agent = ValueAgent( + self.net, + self.n_actions, + eps_start=eps_start, + eps_end=eps_end, + eps_frames=eps_last_frame, + ) + + # Hyperparameters + self.sync_rate = sync_rate + self.gamma = gamma + self.lr = learning_rate + self.batch_size = batch_size + self.replay_size = replay_size + self.warm_start_size = warm_start_size + self.batches_per_epoch = batches_per_epoch + self.n_steps = n_steps + + self.save_hyperparameters() + + # Metrics + self.total_episode_steps = [0] + self.total_rewards = [0] + self.done_episodes = 0 + self.total_steps = 0 + + # Average Rewards + self.avg_reward_len = avg_reward_len + + for _ in range(avg_reward_len): + self.total_rewards.append( + torch.tensor(min_episode_reward, device=self.device) + ) + + self.avg_rewards = float( + np.mean(self.total_rewards[-self.avg_reward_len:]) + ) + + self.state = self.env.reset() + + def run_n_episodes(self, env, n_epsiodes: int = 1, epsilon: float = 1.0) -> List[int]: + """ + Carries out N episodes of the environment with the current agent + Args: + env: environment to use, either train environment or test environment + n_epsiodes: number of episodes to run + epsilon: epsilon value for DQN agent + """ + total_rewards = [] + + for _ in range(n_epsiodes): + episode_state = env.reset() + done = False + episode_reward = 0 + + while not done: + self.agent.epsilon = epsilon + action = self.agent(episode_state, self.device) + next_state, reward, done, _ = self.env.step(action[0]) + episode_state = next_state + episode_reward += reward + + total_rewards.append(episode_reward) + + return total_rewards + + def populate(self, warm_start: int) -> None: + """Populates the buffer with initial experience""" + if warm_start > 0: + self.state = self.env.reset() + + for _ in range(warm_start): + self.agent.epsilon = 1.0 + action = self.agent(self.state, self.device) + next_state, reward, done, _ = self.env.step(action[0]) + exp = Experience(state=self.state, action=action[0], reward=reward, done=done, new_state=next_state) + self.buffer.append(exp) + self.state = next_state + + if done: + self.state = self.env.reset() + + def build_networks(self) -> None: + """Initializes the DQN train and target networks""" + self.net = CNN(self.obs_shape, self.n_actions) + self.target_net = CNN(self.obs_shape, self.n_actions) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Passes in a state x through the network and gets the q_values of each action as an output + Args: + x: environment state + Returns: + q values + """ + output = self.net(x) + return output + + def train_batch( + self, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Contains the logic for generating a new batch of 
data to be passed to the DataLoader + Returns: + yields a Experience tuple containing the state, action, reward, done and next_state. + """ + episode_reward = 0 + episode_steps = 0 + + while True: + self.total_steps += 1 + action = self.agent(self.state, self.device) + + next_state, r, is_done, _ = self.env.step(action[0]) + + episode_reward += r + episode_steps += 1 + + exp = Experience(state=self.state, action=action[0], reward=r, done=is_done, new_state=next_state) + + self.agent.update_epsilon(self.global_step) + self.buffer.append(exp) + self.state = next_state + + if is_done: + self.done_episodes += 1 + self.total_rewards.append(episode_reward) + self.total_episode_steps.append(episode_steps) + self.avg_rewards = float( + np.mean(self.total_rewards[-self.avg_reward_len:]) + ) + self.state = self.env.reset() + episode_steps = 0 + episode_reward = 0 + + states, actions, rewards, dones, new_states = self.buffer.sample(self.batch_size) + + for idx, _ in enumerate(dones): + yield states[idx], actions[idx], rewards[idx], dones[idx], new_states[idx] + + # Simulates epochs + if self.total_steps % self.batches_per_epoch == 0: + break + + def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], _) -> OrderedDict: + """ + Carries out a single step through the environment to update the replay buffer. + Then calculates loss based on the minibatch recieved + Args: + batch: current mini batch of replay data + _: batch number, not used + Returns: + Training loss and log metrics + """ + + # calculates training loss + loss = dqn_loss(batch, self.net, self.target_net) + + if self.trainer.use_dp or self.trainer.use_ddp2: + loss = loss.unsqueeze(0) + + # Soft update of target network + if self.global_step % self.sync_rate == 0: + self.target_net.load_state_dict(self.net.state_dict()) + + log = { + "total_reward": self.total_rewards[-1], + "avg_reward": self.avg_rewards, + "train_loss": loss, + "episodes": self.done_episodes, + "episode_steps": self.total_episode_steps[-1] + } + status = { + "steps": self.global_step, + "avg_reward": self.avg_rewards, + "total_reward": self.total_rewards[-1], + "episodes": self.done_episodes, + "episode_steps": self.total_episode_steps[-1], + "epsilon": self.agent.epsilon, + } + + return OrderedDict( + { + "loss": loss, + "avg_reward": self.avg_rewards, + "log": log, + "progress_bar": status, + } + ) + + def test_step(self, *args, **kwargs) -> Dict[str, torch.Tensor]: + """Evaluate the agent for 10 episodes""" + test_reward = self.run_n_episodes(self.test_env, 1, 0) + avg_reward = sum(test_reward) / len(test_reward) + return {"test_reward": avg_reward} + + def test_epoch_end(self, outputs) -> Dict[str, torch.Tensor]: + """Log the avg of the test results""" + rewards = [x["test_reward"] for x in outputs] + avg_reward = sum(rewards) / len(rewards) + tensorboard_logs = {"avg_test_reward": avg_reward} + return {"avg_test_reward": avg_reward, "log": tensorboard_logs} + + def configure_optimizers(self) -> List[Optimizer]: + """ Initialize Adam optimizer""" + optimizer = optim.Adam(self.net.parameters(), lr=self.lr) + return [optimizer] + + def _dataloader(self) -> DataLoader: + """Initialize the Replay Buffer dataset used for retrieving experiences""" + self.buffer = MultiStepBuffer(self.replay_size, self.n_steps) + self.populate(self.warm_start_size) + + self.dataset = ExperienceSourceDataset(self.train_batch) + return DataLoader(dataset=self.dataset, batch_size=self.batch_size) + + def train_dataloader(self) -> DataLoader: + """Get train loader""" + return 
self._dataloader() + + def test_dataloader(self) -> DataLoader: + """Get test loader""" + return self._dataloader() + + @staticmethod + def make_environment(env_name: str, seed: int = None) -> gym.Env: + """ + Initialise gym environment + Args: + env_name: environment name or tag + seed: value to seed the environment RNG for reproducibility + Returns: + gym environment + """ + env = make_environment(env_name) + + if seed: + env.seed(seed) + + return env + + @staticmethod + def add_model_specific_args( + arg_parser: argparse.ArgumentParser, + ) -> argparse.ArgumentParser: + """ + Adds arguments for DQN model + Note: these params are fine tuned for Pong env + Args: + arg_parser: parent parser + """ + arg_parser.add_argument( + "--sync_rate", + type=int, + default=1000, + help="how many frames do we update the target network", + ) + arg_parser.add_argument( + "--replay_size", + type=int, + default=100000, + help="capacity of the replay buffer", + ) + arg_parser.add_argument( + "--warm_start_size", + type=int, + default=10000, + help="how many samples do we use to fill our buffer at the start of training", + ) + arg_parser.add_argument( + "--eps_last_frame", + type=int, + default=150000, + help="what frame should epsilon stop decaying", + ) + arg_parser.add_argument("--eps_start", type=float, default=1.0, help="starting value of epsilon") + arg_parser.add_argument("--eps_end", type=float, default=0.02, help="final value of epsilon") + arg_parser.add_argument("--batches_per_epoch", type=int, default=10000, help="number of batches in an epoch") + arg_parser.add_argument("--batch_size", type=int, default=32, help="size of the batches") + arg_parser.add_argument("--lr", type=float, default=1e-4, help="learning rate") + + arg_parser.add_argument("--env", type=str, required=True, help="gym environment tag") + arg_parser.add_argument("--gamma", type=float, default=0.99, help="discount factor") + + arg_parser.add_argument( + "--avg_reward_len", + type=int, + default=100, + help="how many episodes to include in avg reward", + ) + arg_parser.add_argument( + "--n_steps", + type=int, + default=1, + help="how many frames do we update the target network", + ) + + return arg_parser + + +def cli_main(): + parser = argparse.ArgumentParser(add_help=False) + + # trainer args + parser = pl.Trainer.add_argparse_args(parser) + + # model args + parser = DQN.add_model_specific_args(parser) + args = parser.parse_args() + + model = DQN(**args.__dict__) + + # save checkpoints based on avg_reward + checkpoint_callback = ModelCheckpoint( + save_top_k=1, monitor="avg_reward", mode="max", period=1, verbose=True + ) + + seed_everything(123) + trainer = pl.Trainer.from_argparse_args( + args, deterministic=True, checkpoint_callback=checkpoint_callback) + + trainer.fit(model) + + +if __name__ == '__main__': + cli_main() diff --git a/pl_bolts/models/rl/dueling_dqn_model.py b/pl_bolts/models/rl/dueling_dqn_model.py new file mode 100644 index 0000000000..79afca2fc7 --- /dev/null +++ b/pl_bolts/models/rl/dueling_dqn_model.py @@ -0,0 +1,75 @@ +""" +Dueling DQN +""" +import argparse + +import pytorch_lightning as pl + +from pl_bolts.models.rl.common.networks import DuelingCNN +from pl_bolts.models.rl.dqn_model import DQN + + +class DuelingDQN(DQN): + """ + PyTorch Lightning implementation of `Dueling DQN `_ + + Paper authors: Ziyu Wang, Tom Schaul, Matteo Hessel, Hado van Hasselt, Marc Lanctot, Nando de Freitas + + Model implemented by: + + - `Donal Byrne ` + + Example: + + >>> from pl_bolts.models.rl.dueling_dqn_model import 
DuelingDQN + ... + >>> model = DuelingDQN("PongNoFrameskip-v4") + + Train:: + + trainer = Trainer() + trainer.fit(model) + + Args: + env: gym environment tag + gpus: number of gpus being used + eps_start: starting value of epsilon for the epsilon-greedy exploration + eps_end: final value of epsilon for the epsilon-greedy exploration + eps_last_frame: the final frame in for the decrease of epsilon. At this frame espilon = eps_end + sync_rate: the number of iterations between syncing up the target network with the train network + gamma: discount factor + lr: learning rate + batch_size: size of minibatch pulled from the DataLoader + replay_size: total capacity of the replay buffer + warm_start_size: how many random steps through the environment to be carried out at the start of + training to fill the buffer with a starting point + sample_len: the number of samples to pull from the dataset iterator and feed to the DataLoader + + .. note:: Currently only supports CPU and single GPU training with `distributed_backend=dp` + + """ + + def build_networks(self) -> None: + """Initializes the Dueling DQN train and target networks""" + self.net = DuelingCNN(self.obs_shape, self.n_actions) + self.target_net = DuelingCNN(self.obs_shape, self.n_actions) + + +def cli_main(): + parser = argparse.ArgumentParser(add_help=False) + + # trainer args + parser = pl.Trainer.add_argparse_args(parser) + + # model args + parser = DuelingDQN.add_model_specific_args(parser) + args = parser.parse_args() + + model = DuelingDQN(**args.__dict__) + + trainer = pl.Trainer.from_argparse_args(args) + trainer.fit(model) + + +if __name__ == '__main__': + cli_main() diff --git a/pl_bolts/models/rl/noisy_dqn_model.py b/pl_bolts/models/rl/noisy_dqn_model.py new file mode 100644 index 0000000000..26f960c117 --- /dev/null +++ b/pl_bolts/models/rl/noisy_dqn_model.py @@ -0,0 +1,130 @@ +""" +Noisy DQN +""" +import argparse +from typing import Tuple + +import numpy as np +import pytorch_lightning as pl +import torch + +from pl_bolts.datamodules.experience_source import Experience +from pl_bolts.models.rl.common.networks import NoisyCNN +from pl_bolts.models.rl.dqn_model import DQN + + +class NoisyDQN(DQN): + """ + PyTorch Lightning implementation of `Noisy DQN `_ + + Paper authors: Meire Fortunato, Mohammad Gheshlaghi Azar, Bilal Piot, Jacob Menick, Ian Osband, Alex Graves, + Vlad Mnih, Remi Munos, Demis Hassabis, Olivier Pietquin, Charles Blundell, Shane Legg + + Model implemented by: + + - `Donal Byrne ` + + Example: + >>> from pl_bolts.models.rl.noisy_dqn_model import NoisyDQN + ... + >>> model = NoisyDQN("PongNoFrameskip-v4") + + Train:: + + trainer = Trainer() + trainer.fit(model) + + Args: + env: gym environment tag + gpus: number of gpus being used + eps_start: starting value of epsilon for the epsilon-greedy exploration + eps_end: final value of epsilon for the epsilon-greedy exploration + eps_last_frame: the final frame in for the decrease of epsilon. At this frame espilon = eps_end + sync_rate: the number of iterations between syncing up the target network with the train network + gamma: discount factor + lr: learning rate + batch_size: size of minibatch pulled from the DataLoader + replay_size: total capacity of the replay buffer + warm_start_size: how many random steps through the environment to be carried out at the start of + training to fill the buffer with a starting point + sample_len: the number of samples to pull from the dataset iterator and feed to the DataLoader + + .. 
note:: Currently only supports CPU and single GPU training with `distributed_backend=dp` + + """ + + def build_networks(self) -> None: + """Initializes the Noisy DQN train and target networks""" + self.net = NoisyCNN(self.obs_shape, self.n_actions) + self.target_net = NoisyCNN(self.obs_shape, self.n_actions) + + def on_train_start(self) -> None: + """Set the agents epsilon to 0 as the exploration comes from the network""" + self.agent.epsilon = 0.0 + + def train_batch( + self, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Contains the logic for generating a new batch of data to be passed to the DataLoader. + This is the same function as the standard DQN except that we dont update epsilon as it is always 0. The + exploration comes from the noisy network. + Returns: + yields a Experience tuple containing the state, action, reward, done and next_state. + """ + episode_reward = 0 + episode_steps = 0 + + while True: + self.total_steps += 1 + action = self.agent(self.state, self.device) + + next_state, r, is_done, _ = self.env.step(action[0]) + + episode_reward += r + episode_steps += 1 + + exp = Experience(state=self.state, action=action[0], reward=r, done=is_done, new_state=next_state) + + self.buffer.append(exp) + self.state = next_state + + if is_done: + self.done_episodes += 1 + self.total_rewards.append(episode_reward) + self.total_episode_steps.append(episode_steps) + self.avg_rewards = float( + np.mean(self.total_rewards[-self.avg_reward_len:]) + ) + self.state = self.env.reset() + episode_steps = 0 + episode_reward = 0 + + states, actions, rewards, dones, new_states = self.buffer.sample(self.batch_size) + + for idx, _ in enumerate(dones): + yield states[idx], actions[idx], rewards[idx], dones[idx], new_states[idx] + + # Simulates epochs + if self.total_steps % self.batches_per_epoch == 0: + break + + +def cli_main(): + parser = argparse.ArgumentParser(add_help=False) + + # trainer args + parser = pl.Trainer.add_argparse_args(parser) + + # model args + parser = NoisyDQN.add_model_specific_args(parser) + args = parser.parse_args() + + model = NoisyDQN(**args.__dict__) + + trainer = pl.Trainer.from_argparse_args(args) + trainer.fit(model) + + +if __name__ == '__main__': + cli_main() diff --git a/pl_bolts/models/rl/per_dqn_model.py b/pl_bolts/models/rl/per_dqn_model.py new file mode 100644 index 0000000000..07ad80d564 --- /dev/null +++ b/pl_bolts/models/rl/per_dqn_model.py @@ -0,0 +1,196 @@ +""" +Prioritized Experience Replay DQN +""" +import argparse +from collections import OrderedDict +from typing import Tuple + +import numpy as np +import pytorch_lightning as pl +import torch +from torch.utils.data import DataLoader + +from pl_bolts.datamodules import ExperienceSourceDataset +from pl_bolts.losses.rl import per_dqn_loss +from pl_bolts.models.rl.common.memory import PERBuffer, Experience +from pl_bolts.models.rl.dqn_model import DQN + + +class PERDQN(DQN): + """ + PyTorch Lightning implementation of `DQN With Prioritized Experience Replay `_ + + Paper authors: Tom Schaul, John Quan, Ioannis Antonoglou, David Silver + + Model implemented by: + + - `Donal Byrne ` + + Example: + + >>> from pl_bolts.models.rl.per_dqn_model import PERDQN + ... 
+ >>> model = PERDQN("PongNoFrameskip-v4") + + Train:: + + trainer = Trainer() + trainer.fit(model) + + Args: + env: gym environment tag + gpus: number of gpus being used + eps_start: starting value of epsilon for the epsilon-greedy exploration + eps_end: final value of epsilon for the epsilon-greedy exploration + eps_last_frame: the final frame in for the decrease of epsilon. At this frame espilon = eps_end + sync_rate: the number of iterations between syncing up the target network with the train network + gamma: discount factor + learning_rate: learning rate + batch_size: size of minibatch pulled from the DataLoader + replay_size: total capacity of the replay buffer + warm_start_size: how many random steps through the environment to be carried out at the start of + training to fill the buffer with a starting point + num_samples: the number of samples to pull from the dataset iterator and feed to the DataLoader + + .. note:: + This example is based on: + https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Second-Edition/blob/master/Chapter08/05_dqn_prio_replay.py + + .. note:: Currently only supports CPU and single GPU training with `distributed_backend=dp` + + """ + + def train_batch( + self, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Contains the logic for generating a new batch of data to be passed to the DataLoader + Returns: + yields a Experience tuple containing the state, action, reward, done and next_state. + """ + + episode_reward = 0 + episode_steps = 0 + + while True: + self.total_steps += 1 + action = self.agent(self.state, self.device) + + next_state, r, is_done, _ = self.env.step(action[0]) + + episode_reward += r + episode_steps += 1 + + exp = Experience( + state=self.state, + action=action[0], + reward=r, + done=is_done, + new_state=next_state, + ) + + self.agent.update_epsilon(self.global_step) + self.buffer.append(exp) + self.state = next_state + + if is_done: + self.done_episodes += 1 + self.total_rewards.append(episode_reward) + self.total_episode_steps.append(episode_steps) + self.avg_rewards = float( + np.mean(self.total_rewards[-self.avg_reward_len:]) + ) + self.state = self.env.reset() + episode_steps = 0 + episode_reward = 0 + + samples, indices, weights = self.buffer.sample(self.batch_size) + + states, actions, rewards, dones, new_states = samples + + for idx, _ in enumerate(dones): + yield ( + states[idx], + actions[idx], + rewards[idx], + dones[idx], + new_states[idx], + ), indices[idx], weights[idx] + + def training_step(self, batch, _) -> OrderedDict: + """ + Carries out a single step through the environment to update the replay buffer. 
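# --- Editor's note: illustrative sketch, not part of the diff above. ---
# The eps_start / eps_end / eps_last_frame arguments documented for the DQN family,
# together with PERDQN.train_batch calling self.agent.update_epsilon(self.global_step),
# imply an annealing schedule for the exploration rate. The actual ValueAgent code is
# not shown in this diff, so the linear decay below is only an assumed sketch of it.

def linear_epsilon(step: int, eps_start: float = 1.0, eps_end: float = 0.02,
                   eps_last_frame: int = 150_000) -> float:
    """Linearly anneal epsilon from eps_start down to eps_end over eps_last_frame steps."""
    slope = (eps_start - eps_end) / eps_last_frame
    return max(eps_end, eps_start - slope * step)

# e.g. linear_epsilon(0) -> 1.0, linear_epsilon(75_000) -> 0.51, linear_epsilon(150_000) -> 0.02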
+ Then calculates loss based on the minibatch recieved + Args: + batch: current mini batch of replay data + _: batch number, not used + Returns: + Training loss and log metrics + """ + samples, indices, weights = batch + indices = indices.cpu().numpy() + + # calculates training loss + loss, batch_weights = per_dqn_loss(samples, weights, self.net, self.target_net) + + if self.trainer.use_dp or self.trainer.use_ddp2: + loss = loss.unsqueeze(0) + + # update priorities in buffer + self.buffer.update_priorities(indices, batch_weights) + + # update of target network + if self.global_step % self.sync_rate == 0: + self.target_net.load_state_dict(self.net.state_dict()) + + log = { + "total_reward": self.total_rewards[-1], + "avg_reward": self.avg_rewards, + "train_loss": loss, + # "episodes": self.total_episode_steps, + } + status = { + "steps": self.global_step, + "avg_reward": self.avg_rewards, + "total_reward": self.total_rewards[-1], + "episodes": self.done_episodes, + # "episode_steps": self.episode_steps, + "epsilon": self.agent.epsilon, + } + + return OrderedDict( + { + "loss": loss, + "avg_reward": self.avg_rewards, + "log": log, + "progress_bar": status, + } + ) + + def _dataloader(self) -> DataLoader: + """Initialize the Replay Buffer dataset used for retrieving experiences""" + self.buffer = PERBuffer(self.replay_size) + self.populate(self.warm_start_size) + + self.dataset = ExperienceSourceDataset(self.train_batch) + return DataLoader(dataset=self.dataset, batch_size=self.batch_size) + + +def cli_main(): + parser = argparse.ArgumentParser(add_help=False) + + # trainer args + parser = pl.Trainer.add_argparse_args(parser) + + # model args + parser = PERDQN.add_model_specific_args(parser) + args = parser.parse_args() + + model = PERDQN(**args.__dict__) + + trainer = pl.Trainer.from_argparse_args(args) + trainer.fit(model) + + +if __name__ == "__main__": + cli_main() diff --git a/pl_bolts/models/rl/reinforce_model.py b/pl_bolts/models/rl/reinforce_model.py new file mode 100644 index 0000000000..55535a91e7 --- /dev/null +++ b/pl_bolts/models/rl/reinforce_model.py @@ -0,0 +1,318 @@ +import argparse +from collections import OrderedDict +from typing import Tuple, List +from warnings import warn + +import numpy as np +import pytorch_lightning as pl +import torch +import torch.optim as optim +from pytorch_lightning import seed_everything +from pytorch_lightning.callbacks import ModelCheckpoint +from torch.nn.functional import log_softmax +from torch.optim.optimizer import Optimizer +from torch.utils.data import DataLoader + +from pl_bolts.datamodules import ExperienceSourceDataset +from pl_bolts.datamodules.experience_source import Experience +from pl_bolts.models.rl.common.agents import PolicyAgent +from pl_bolts.models.rl.common.networks import MLP +try: + import gym +except ModuleNotFoundError: + warn('You want to use `gym` which is not installed yet, install it with `pip install gym`.') # pragma: no-cover + _GYM_AVAILABLE = False +else: + _GYM_AVAILABLE = True + + +class Reinforce(pl.LightningModule): + def __init__( + self, + env: str, + gamma: float = 0.99, + lr: float = 0.01, + batch_size: int = 8, + n_steps: int = 10, + avg_reward_len: int = 100, + entropy_beta: float = 0.01, + epoch_len: int = 1000, + num_batch_episodes: int = 4, + **kwargs + ) -> None: + """ + PyTorch Lightning implementation of `REINFORCE + `_ + Paper authors: Richard S. 
Sutton, David McAllester, Satinder Singh, Yishay Mansour + Model implemented by: + + - `Donal Byrne ` + + Example: + >>> from pl_bolts.models.rl.reinforce_model import Reinforce + ... + >>> model = Reinforce("CartPole-v0") + + Train:: + + trainer = Trainer() + trainer.fit(model) + + Args: + env: gym environment tag + gamma: discount factor + lr: learning rate + batch_size: size of minibatch pulled from the DataLoader + n_steps: number of stakes per discounted experience + entropy_beta: entropy coefficient + epoch_len: how many batches before pseudo epoch + num_batch_episodes: how many episodes to rollout for each batch of training + avg_reward_len: how many episodes to take into account when calculating the avg reward + + Note: + This example is based on: + https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Second-Edition/blob/master/Chapter11/02_cartpole_reinforce.py + + Note: + Currently only supports CPU and single GPU training with `distributed_backend=dp` + """ + super().__init__() + + if not _GYM_AVAILABLE: + raise ModuleNotFoundError('This Module requires gym environment which is not installed yet.') + + # Hyperparameters + self.lr = lr + self.batch_size = batch_size + self.batches_per_epoch = self.batch_size * epoch_len + self.entropy_beta = entropy_beta + self.gamma = gamma + self.n_steps = n_steps + self.num_batch_episodes = num_batch_episodes + + self.save_hyperparameters() + + # Model components + self.env = gym.make(env) + self.net = MLP(self.env.observation_space.shape, self.env.action_space.n) + self.agent = PolicyAgent(self.net) + + # Tracking metrics + self.total_steps = 0 + self.total_rewards = [0] + self.done_episodes = 0 + self.avg_rewards = 0 + self.reward_sum = 0.0 + self.batch_episodes = 0 + self.avg_reward_len = avg_reward_len + + self.batch_states = [] + self.batch_actions = [] + self.batch_qvals = [] + self.cur_rewards = [] + + self.state = self.env.reset() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Passes in a state x through the network and gets the q_values of each action as an output + Args: + x: environment state + Returns: + q values + """ + output = self.net(x) + return output + + def calc_qvals(self, rewards: List[float]) -> List[float]: + """Calculate the discounted rewards of all rewards in list + Args: + rewards: list of rewards from latest batch + Returns: + list of discounted rewards + """ + assert isinstance(rewards[0], float) + + cumul_reward = [] + sum_r = 0.0 + + for r in reversed(rewards): + sum_r = (sum_r * self.gamma) + r + cumul_reward.append(sum_r) + + return list(reversed(cumul_reward)) + + def discount_rewards(self, experiences: Tuple[Experience]) -> float: + """ + Calculates the discounted reward over N experiences + Args: + experiences: Tuple of Experience + Returns: + total discounted reward + """ + total_reward = 0.0 + for exp in reversed(experiences): + total_reward = (self.gamma * total_reward) + exp.reward + return total_reward + + def train_batch( + self, + ) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[torch.Tensor]]: + """ + Contains the logic for generating a new batch of data to be passed to the DataLoader + Yield: + yields a tuple of Lists containing tensors for states, actions and rewards of the batch. 
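# --- Editor's note: illustrative sketch, not part of the diff above. ---
# calc_qvals walks the episode rewards backwards, carrying a running discounted sum,
# so each entry ends up as the return G_t = r_t + gamma * G_{t+1}. A standalone
# version with a tiny worked example (the helper name here is hypothetical):

from typing import List

def discounted_returns(rewards: List[float], gamma: float = 0.99) -> List[float]:
    returns, running = [], 0.0
    for r in reversed(rewards):          # start from the final reward of the episode
        running = r + gamma * running    # accumulate the discounted tail
        returns.append(running)
    return list(reversed(returns))       # put the returns back into time order

# discounted_returns([1.0, 1.0, 1.0], gamma=0.9) -> [2.71, 1.9, 1.0]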
+ """ + + while True: + + action = self.agent(self.state, self.device) + + next_state, reward, done, _ = self.env.step(action[0]) + + self.batch_states.append(self.state) + self.batch_actions.append(action[0]) + self.cur_rewards.append(reward) + + self.state = next_state + self.total_steps += 1 + + if done: + self.batch_qvals.extend(self.calc_qvals(self.cur_rewards)) + self.batch_episodes += 1 + self.done_episodes += 1 + self.total_rewards.append(sum(self.cur_rewards)) + self.avg_rewards = float( + np.mean(self.total_rewards[-self.avg_reward_len:]) + ) + self.cur_rewards = [] + self.state = self.env.reset() + + if self.batch_episodes >= self.num_batch_episodes: + for state, action, qval in zip( + self.batch_states, self.batch_actions, self.batch_qvals + ): + yield state, action, qval + + self.batch_episodes = 0 + + # Simulates epochs + if self.total_steps % self.batches_per_epoch == 0: + break + + def loss(self, states, actions, scaled_rewards) -> torch.Tensor: + logits = self.net(states) + + # policy loss + log_prob = log_softmax(logits, dim=1) + log_prob_actions = scaled_rewards * log_prob[range(self.batch_size), actions] + loss = -log_prob_actions.mean() + + return loss + + def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], _) -> OrderedDict: + """ + Carries out a single step through the environment to update the replay buffer. + Then calculates loss based on the minibatch recieved + Args: + batch: current mini batch of replay data + _: batch number, not used + Returns: + Training loss and log metrics + """ + states, actions, scaled_rewards = batch + + loss = self.loss(states, actions, scaled_rewards) + + log = { + "episodes": self.done_episodes, + "reward": self.total_rewards[-1], + "avg_reward": self.avg_rewards, + } + + return OrderedDict( + { + "loss": loss, + "avg_reward": self.avg_rewards, + "log": log, + "progress_bar": log, + } + ) + + def configure_optimizers(self) -> List[Optimizer]: + """ Initialize Adam optimizer""" + optimizer = optim.Adam(self.net.parameters(), lr=self.lr) + return [optimizer] + + def _dataloader(self) -> DataLoader: + """Initialize the Replay Buffer dataset used for retrieving experiences""" + dataset = ExperienceSourceDataset(self.train_batch) + dataloader = DataLoader(dataset=dataset, batch_size=self.batch_size) + return dataloader + + def train_dataloader(self) -> DataLoader: + """Get train loader""" + return self._dataloader() + + def get_device(self, batch) -> str: + """Retrieve device currently being used by minibatch""" + return batch[0][0][0].device.index if self.on_gpu else "cpu" + + @staticmethod + def add_model_specific_args(arg_parser) -> argparse.ArgumentParser: + """ + Adds arguments for DQN model + Note: these params are fine tuned for Pong env + Args: + arg_parser: the current argument parser to add to + Returns: + arg_parser with model specific cargs added + """ + arg_parser.add_argument("--batches_per_epoch", type=int, default=10000, help="number of batches in an epoch") + arg_parser.add_argument("--batch_size", type=int, default=32, help="size of the batches") + arg_parser.add_argument("--lr", type=float, default=1e-3, help="learning rate") + + arg_parser.add_argument("--env", type=str, required=True, help="gym environment tag") + arg_parser.add_argument("--gamma", type=float, default=0.99, help="discount factor") + + arg_parser.add_argument( + "--avg_reward_len", + type=int, + default=100, + help="how many episodes to include in avg reward", + ) + + arg_parser.add_argument( + "--entropy_beta", type=float, default=0.01, 
help="entropy value", + ) + + return arg_parser + + +def cli_main(): + parser = argparse.ArgumentParser(add_help=False) + + # trainer args + parser = pl.Trainer.add_argparse_args(parser) + + # model args + parser = Reinforce.add_model_specific_args(parser) + args = parser.parse_args() + + model = Reinforce(**args.__dict__) + + # save checkpoints based on avg_reward + checkpoint_callback = ModelCheckpoint( + save_top_k=1, monitor="avg_reward", mode="max", period=1, verbose=True + ) + + seed_everything(123) + trainer = pl.Trainer.from_argparse_args( + args, deterministic=True, checkpoint_callback=checkpoint_callback + ) + trainer.fit(model) + + +if __name__ == '__main__': + cli_main() diff --git a/pl_bolts/models/rl/vanilla_policy_gradient_model.py b/pl_bolts/models/rl/vanilla_policy_gradient_model.py new file mode 100644 index 0000000000..f7d9e6586f --- /dev/null +++ b/pl_bolts/models/rl/vanilla_policy_gradient_model.py @@ -0,0 +1,306 @@ +import argparse +from collections import OrderedDict +from typing import Tuple, List +from warnings import warn + +import numpy as np +import pytorch_lightning as pl +import torch +import torch.optim as optim +from pytorch_lightning import seed_everything +from pytorch_lightning.callbacks import ModelCheckpoint +from torch.nn.functional import log_softmax, softmax +from torch.optim.optimizer import Optimizer +from torch.utils.data import DataLoader + +from pl_bolts.datamodules import ExperienceSourceDataset +from pl_bolts.models.rl.common.agents import PolicyAgent +from pl_bolts.models.rl.common.networks import MLP +try: + import gym +except ModuleNotFoundError: + warn('You want to use `gym` which is not installed yet, install it with `pip install gym`.') # pragma: no-cover + _GYM_AVAILABLE = False +else: + _GYM_AVAILABLE = True + + +class VanillaPolicyGradient(pl.LightningModule): + def __init__( + self, + env: str, + gamma: float = 0.99, + lr: float = 0.01, + batch_size: int = 8, + n_steps: int = 10, + avg_reward_len: int = 100, + entropy_beta: float = 0.01, + epoch_len: int = 1000, + **kwargs + ) -> None: + """ + PyTorch Lightning implementation of `Vanilla Policy Gradient + `_ + Paper authors: Richard S. Sutton, David McAllester, Satinder Singh, Yishay Mansour + Model implemented by: + + - `Donal Byrne ` + + Example: + >>> from pl_bolts.models.rl.vanilla_policy_gradient_model import VanillaPolicyGradient + ... 
+ >>> model = VanillaPolicyGradient("CartPole-v0") + + Train:: + trainer = Trainer() + trainer.fit(model) + + Args: + env: gym environment tag + gamma: discount factor + lr: learning rate + batch_size: size of minibatch pulled from the DataLoader + batch_episodes: how many episodes to rollout for each batch of training + entropy_beta: dictates the level of entropy per batch + avg_reward_len: how many episodes to take into account when calculating the avg reward + + Note: + This example is based on: + https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Second-Edition/blob/master/Chapter11/04_cartpole_pg.py + + Note: + Currently only supports CPU and single GPU training with `distributed_backend=dp` + """ + super().__init__() + + if not _GYM_AVAILABLE: + raise ModuleNotFoundError('This Module requires gym environment which is not installed yet.') + + # Hyperparameters + self.lr = lr + self.batch_size = batch_size + self.batches_per_epoch = self.batch_size * epoch_len + self.entropy_beta = entropy_beta + self.gamma = gamma + self.n_steps = n_steps + + self.save_hyperparameters() + + # Model components + self.env = gym.make(env) + self.net = MLP(self.env.observation_space.shape, self.env.action_space.n) + self.agent = PolicyAgent(self.net) + + # Tracking metrics + self.total_rewards = [] + self.episode_rewards = [] + self.done_episodes = 0 + self.avg_rewards = 0 + self.avg_reward_len = avg_reward_len + self.eps = np.finfo(np.float32).eps.item() + self.batch_states = [] + self.batch_actions = [] + + self.state = self.env.reset() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Passes in a state x through the network and gets the q_values of each action as an output + Args: + x: environment state + Returns: + q values + """ + output = self.net(x) + return output + + def train_batch( + self, + ) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[torch.Tensor]]: + """ + Contains the logic for generating a new batch of data to be passed to the DataLoader + Returns: + yields a tuple of Lists containing tensors for states, actions and rewards of the batch. 
+ """ + + while True: + + action = self.agent(self.state, self.device) + + next_state, reward, done, _ = self.env.step(action[0]) + + self.episode_rewards.append(reward) + self.batch_actions.append(action) + self.batch_states.append(self.state) + self.state = next_state + + if done: + self.done_episodes += 1 + self.state = self.env.reset() + self.total_rewards.append(sum(self.episode_rewards)) + self.avg_rewards = float(np.mean(self.total_rewards[-self.avg_reward_len:])) + + returns = self.compute_returns(self.episode_rewards) + + for idx in range(len(self.batch_actions)): + yield self.batch_states[idx], self.batch_actions[idx], returns[idx] + + self.batch_states = [] + self.batch_actions = [] + self.episode_rewards = [] + + def compute_returns(self, rewards): + """ + Calculate the discounted rewards of the batched rewards + + Args: + rewards: list of batched rewards + + Returns: + list of discounted rewards + """ + reward = 0 + returns = [] + + for r in rewards[::-1]: + reward = r + self.gamma * reward + returns.insert(0, reward) + + returns = torch.tensor(returns) + returns = (returns - returns.mean()) / (returns.std() + self.eps) + + return returns + + def loss(self, states, actions, scaled_rewards) -> torch.Tensor: + """ + Calculates the loss for VPG + + Args: + states: batched states + actions: batch actions + scaled_rewards: batche Q values + + Returns: + loss for the current batch + """ + + logits = self.net(states) + + # policy loss + log_prob = log_softmax(logits, dim=1) + log_prob_actions = scaled_rewards * log_prob[range(self.batch_size), actions[0]] + policy_loss = -log_prob_actions.mean() + + # entropy loss + prob = softmax(logits, dim=1) + entropy = -(prob * log_prob).sum(dim=1).mean() + entropy_loss = -self.entropy_beta * entropy + + # total loss + loss = policy_loss + entropy_loss + + return loss + + def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], _) -> OrderedDict: + """ + Carries out a single step through the environment to update the replay buffer. 
+ Then calculates loss based on the minibatch recieved + Args: + batch: current mini batch of replay data + _: batch number, not used + Returns: + Training loss and log metrics + """ + states, actions, scaled_rewards = batch + + loss = self.loss(states, actions, scaled_rewards) + + log = { + "episodes": self.done_episodes, + "reward": self.total_rewards[-1], + "avg_reward": self.avg_rewards, + } + return OrderedDict( + { + "loss": loss, + "avg_reward": self.avg_rewards, + "log": log, + "progress_bar": log, + } + ) + + def configure_optimizers(self) -> List[Optimizer]: + """ Initialize Adam optimizer""" + optimizer = optim.Adam(self.net.parameters(), lr=self.lr) + return [optimizer] + + def _dataloader(self) -> DataLoader: + """Initialize the Replay Buffer dataset used for retrieving experiences""" + dataset = ExperienceSourceDataset(self.train_batch) + dataloader = DataLoader(dataset=dataset, batch_size=self.batch_size) + return dataloader + + def train_dataloader(self) -> DataLoader: + """Get train loader""" + return self._dataloader() + + def get_device(self, batch) -> str: + """Retrieve device currently being used by minibatch""" + return batch[0][0][0].device.index if self.on_gpu else "cpu" + + @staticmethod + def add_model_specific_args(arg_parser) -> argparse.ArgumentParser: + """ + Adds arguments for DQN model + Note: these params are fine tuned for Pong env + Args: + arg_parser: the current argument parser to add to + Returns: + arg_parser with model specific cargs added + """ + + arg_parser.add_argument("--entropy_beta", type=float, default=0.01, help="entropy value") + arg_parser.add_argument("--batches_per_epoch", type=int, default=10000, help="number of batches in an epoch") + arg_parser.add_argument("--batch_size", type=int, default=32, help="size of the batches") + arg_parser.add_argument("--lr", type=float, default=1e-3, help="learning rate") + arg_parser.add_argument("--env", type=str, required=True, help="gym environment tag") + arg_parser.add_argument("--gamma", type=float, default=0.99, help="discount factor") + arg_parser.add_argument("--seed", type=int, default=123, help="seed for training run") + + arg_parser.add_argument( + "--avg_reward_len", + type=int, + default=100, + help="how many episodes to include in avg reward", + ) + + return arg_parser + + +def cli_main(): + parser = argparse.ArgumentParser(add_help=False) + + # trainer args + parser = pl.Trainer.add_argparse_args(parser) + + # model args + parser = VanillaPolicyGradient.add_model_specific_args(parser) + args = parser.parse_args() + + model = VanillaPolicyGradient(**args.__dict__) + + # save checkpoints based on avg_reward + checkpoint_callback = ModelCheckpoint( + save_top_k=1, monitor="avg_reward", mode="max", period=1, verbose=True + ) + + seed_everything(123) + trainer = pl.Trainer.from_argparse_args( + args, deterministic=True, checkpoint_callback=checkpoint_callback + ) + trainer.fit(model) + + +if __name__ == '__main__': + cli_main() diff --git a/pl_bolts/models/self_supervised/byol/byol_module.py b/pl_bolts/models/self_supervised/byol/byol_module.py index b6fb83cf52..95c68bbee7 100644 --- a/pl_bolts/models/self_supervised/byol/byol_module.py +++ b/pl_bolts/models/self_supervised/byol/byol_module.py @@ -29,7 +29,7 @@ def __init__(self, PyTorch Lightning implementation of `Bootstrap Your Own Latent (BYOL) `_ - Paper authors: Jean-Bastien Grill ,Florian Strub, Florent Altché, Corentin Tallec, Pierre H. 
Richemond, \ + Paper authors: Jean-Bastien Grill, Florian Strub, Florent Altché, Corentin Tallec, Pierre H. Richemond, \ Elena Buchatskaya, Carl Doersch, Bernardo Avila Pires, Zhaohan Daniel Guo, Mohammad Gheshlaghi Azar, \ Bilal Piot, Koray Kavukcuoglu, Rémi Munos, Michal Valko. @@ -136,19 +136,17 @@ def training_step(self, batch, batch_idx): loss_a, loss_b, total_loss = self.shared_step(batch, batch_idx) # log results - result = pl.TrainResult(minimize=total_loss) - result.log_dict({'1_2_loss': loss_a, '2_1_loss': loss_b, 'train_loss': total_loss}) + self.log_dict({'1_2_loss': loss_a, '2_1_loss': loss_b, 'train_loss': total_loss}) - return result + return total_loss def validation_step(self, batch, batch_idx): loss_a, loss_b, total_loss = self.shared_step(batch, batch_idx) # log results - result = pl.EvalResult(early_stop_on=total_loss, checkpoint_on=total_loss) - result.log_dict({'1_2_loss': loss_a, '2_1_loss': loss_b, 'train_loss': total_loss}) + self.log_dict({'1_2_loss': loss_a, '2_1_loss': loss_b, 'train_loss': total_loss}) - return result + return total_loss def configure_optimizers(self): optimizer = Adam(self.parameters(), lr=self.hparams.learning_rate, weight_decay=self.hparams.weight_decay) diff --git a/pl_bolts/models/self_supervised/cpc/cpc_module.py b/pl_bolts/models/self_supervised/cpc/cpc_module.py index f4eff40b00..4cdaa74025 100644 --- a/pl_bolts/models/self_supervised/cpc/cpc_module.py +++ b/pl_bolts/models/self_supervised/cpc/cpc_module.py @@ -35,7 +35,7 @@ class CPCV2(pl.LightningModule): def __init__( self, datamodule: pl.LightningDataModule = None, - encoder: Union[str, torch.nn.Module, pl.LightningModule] = 'cpc_encoder', + encoder_name: str = 'cpc_encoder', patch_size: int = 8, patch_overlap: int = 4, online_ft: int = True, @@ -50,7 +50,7 @@ def __init__( """ Args: datamodule: A Datamodule (optional). Otherwise set the dataloaders directly - encoder: A string for any of the resnets in torchvision, or the original CPC encoder, + encoder_name: A string for any of the resnets in torchvision, or the original CPC encoder, or a custon nn.Module encoder patch_size: How big to make the image patches patch_overlap: How much overlap should each patch have. @@ -66,28 +66,20 @@ def __init__( super().__init__() self.save_hyperparameters() + # HACK - datamodule not pickleable so we remove it from hparams. + # TODO - remove datamodule from init. data should be decoupled from models. 
+ del self.hparams['datamodule'] + self.online_evaluator = self.hparams.online_ft if pretrained: self.hparams.dataset = pretrained self.online_evaluator = True - # link data - # if datamodule is None: - # datamodule = CIFAR10DataModule( - # self.hparams.data_dir, - # num_workers=self.hparams.num_workers, - # batch_size=batch_size - # ) - # datamodule.train_transforms = CPCTrainTransformsCIFAR10() - # datamodule.val_transforms = CPCEvalTransformsCIFAR10() assert datamodule self.datamodule = datamodule - # init encoder - self.encoder = encoder - if isinstance(encoder, str): - self.encoder = self.init_encoder() + self.encoder = self.init_encoder() # info nce loss c, h = self.__compute_final_nb_c(self.hparams.patch_size) @@ -97,20 +89,22 @@ def __init__( self.num_classes = self.datamodule.num_classes if pretrained: - self.load_pretrained(encoder) + self.load_pretrained(self.hparams.encoder_name) + + print(self.hparams) - def load_pretrained(self, encoder): + def load_pretrained(self, encoder_name): available_weights = {'resnet18'} - if encoder in available_weights: - load_pretrained(self, f'CPCV2-{encoder}') - elif available_weights not in available_weights: - rank_zero_warn(f'{encoder} not yet available') + if encoder_name in available_weights: + load_pretrained(self, f'CPCV2-{encoder_name}') + elif encoder_name not in available_weights: + rank_zero_warn(f'{encoder_name} not yet available') def init_encoder(self): dummy_batch = torch.zeros((2, 3, self.hparams.patch_size, self.hparams.patch_size)) - encoder_name = self.hparams.encoder + encoder_name = self.hparams.encoder_name if encoder_name == 'cpc_encoder': return cpc_resnet101(dummy_batch) else: @@ -160,18 +154,16 @@ def training_step(self, batch, batch_nb): nce_loss = self.shared_step(batch) # result - result = pl.TrainResult(nce_loss) - result.log('train_nce_loss', nce_loss) - return result + self.log('train_nce_loss', nce_loss) + return nce_loss def validation_step(self, batch, batch_nb): # calculate loss nce_loss = self.shared_step(batch) # result - result = pl.EvalResult(checkpoint_on=nce_loss) - result.log('val_nce', nce_loss, prog_bar=True) - return result + self.log('val_nce', nce_loss, prog_bar=True) + return nce_loss def shared_step(self, batch): try: diff --git a/pl_bolts/models/self_supervised/simclr/simclr_module.py b/pl_bolts/models/self_supervised/simclr/simclr_module.py index 7fbe562827..582883991a 100644 --- a/pl_bolts/models/self_supervised/simclr/simclr_module.py +++ b/pl_bolts/models/self_supervised/simclr/simclr_module.py @@ -157,16 +157,14 @@ def forward(self, x): def training_step(self, batch, batch_idx): loss = self.shared_step(batch, batch_idx) - result = pl.TrainResult(minimize=loss) - result.log('train_loss', loss, on_epoch=True) - return result + self.log('train_loss', loss, on_epoch=True) + return loss def validation_step(self, batch, batch_idx): loss = self.shared_step(batch, batch_idx) - result = pl.EvalResult(checkpoint_on=loss) - result.log('avg_val_loss', loss) - return result + self.log('avg_val_loss', loss) + return loss def shared_step(self, batch, batch_idx): (img1, img2), y = batch diff --git a/pl_bolts/models/self_supervised/ssl_finetuner.py b/pl_bolts/models/self_supervised/ssl_finetuner.py index d3a3e95377..f07e697a42 100644 --- a/pl_bolts/models/self_supervised/ssl_finetuner.py +++ b/pl_bolts/models/self_supervised/ssl_finetuner.py @@ -59,21 +59,18 @@ def on_train_epoch_start(self) -> None: def training_step(self, batch, batch_idx): loss, acc = self.shared_step(batch) - result = 
pl.TrainResult(loss) - result.log('train_acc', acc, prog_bar=True) - return result + self.log('train_acc', acc, prog_bar=True) + return loss def validation_step(self, batch, batch_idx): loss, acc = self.shared_step(batch) - result = pl.EvalResult(checkpoint_on=loss, early_stop_on=loss) - result.log_dict({'val_acc': acc, 'val_loss': loss}, prog_bar=True) - return result + self.log_dict({'val_acc': acc, 'val_loss': loss}, prog_bar=True) + return loss def test_step(self, batch, batch_idx): loss, acc = self.shared_step(batch) - result = pl.EvalResult() - result.log_dict({'test_acc': acc, 'test_loss': loss}) - return result + self.log_dict({'test_acc': acc, 'test_loss': loss}) + return loss def shared_step(self, batch): x, y = batch diff --git a/pl_bolts/models/vision/__init__.py b/pl_bolts/models/vision/__init__.py index 8d4ec5084e..e525036d34 100644 --- a/pl_bolts/models/vision/__init__.py +++ b/pl_bolts/models/vision/__init__.py @@ -1,2 +1,2 @@ from pl_bolts.models.vision.pixel_cnn import PixelCNN -from pl_bolts.models.vision.unet import UNet \ No newline at end of file +from pl_bolts.models.vision.unet import UNet diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000..c434a7c377 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +pytorch-lightning>=0.10.0 +torch>=1.6 \ No newline at end of file diff --git a/requirements/base.txt b/requirements/base.txt deleted file mode 100644 index 62766e3de2..0000000000 --- a/requirements/base.txt +++ /dev/null @@ -1,2 +0,0 @@ -pytorch-lightning>=0.9.1rc3 -torch>=1.6 \ No newline at end of file diff --git a/requirements/devel.txt b/requirements/devel.txt index 53b6b26d05..3574b167e4 100644 --- a/requirements/devel.txt +++ b/requirements/devel.txt @@ -1,5 +1,5 @@ # install all mandatory dependencies --r ./base.txt +-r ../requirements.txt # install all extra dependencies for full package experience -r ./models.txt diff --git a/requirements/models.txt b/requirements/models.txt index 174ab691fc..a92a7ef6bd 100644 --- a/requirements/models.txt +++ b/requirements/models.txt @@ -1,4 +1,5 @@ torchvision>=0.7 scikit-learn>=0.23 Pillow -opencv-python \ No newline at end of file +opencv-python +gym>=0.17.2 # needed for RL \ No newline at end of file diff --git a/requirements/test.txt b/requirements/test.txt index 70b0ce4600..c97d36fc50 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -8,5 +8,4 @@ flake8-black check-manifest twine==1.13.0 -# atari-py==0.2.6 # needed for RL -# gym>=0.17.2 # needed for RL \ No newline at end of file +atari-py==0.2.6 # needed for RL \ No newline at end of file diff --git a/setup.py b/setup.py index 456c6b9153..29931f94ff 100755 --- a/setup.py +++ b/setup.py @@ -19,8 +19,8 @@ import pl_bolts # noqa: E402 -def load_requirements(path_dir=PATH_ROOT, file_name='base.txt', comment_char='#'): - with open(os.path.join(path_dir, 'requirements', file_name), 'r') as file: +def load_requirements(path_dir=PATH_ROOT, file_name='requirements.txt', comment_char='#'): + with open(os.path.join(path_dir, file_name), 'r') as file: lines = [ln.strip() for ln in file.readlines()] reqs = [] for ln in lines: @@ -45,9 +45,9 @@ def load_long_describtion(): extras = { - 'loggers': load_requirements(file_name='loggers.txt'), - 'models': load_requirements(file_name='models.txt'), - 'test': load_requirements(file_name='test.txt'), + 'loggers': load_requirements(path_dir=os.path.join(PATH_ROOT, 'requirements'), file_name='loggers.txt'), + 'models': load_requirements(path_dir=os.path.join(PATH_ROOT, 'requirements'), 
file_name='models.txt'), + 'test': load_requirements(path_dir=os.path.join(PATH_ROOT, 'requirements'), file_name='test.txt'), } extras['extra'] = extras['models'] + extras['loggers'] extras['dev'] = extras['extra'] + extras['test'] diff --git a/tests/datamodules/test_experience_sources.py b/tests/datamodules/test_experience_sources.py new file mode 100644 index 0000000000..737a1c7150 --- /dev/null +++ b/tests/datamodules/test_experience_sources.py @@ -0,0 +1,321 @@ +from unittest import TestCase +from unittest.mock import Mock + +import gym +import numpy as np +import torch +from torch.utils.data import DataLoader + +from pl_bolts.datamodules.experience_source import ( + BaseExperienceSource, + ExperienceSource, + ExperienceSourceDataset, + Experience, + DiscountedExperienceSource, +) +from pl_bolts.models.rl.common.agents import Agent + + +class DummyAgent(Agent): + def __call__(self, states, device): + return [0] * len(states) + + +class DummyExperienceSource(BaseExperienceSource): + def __iter__(self): + yield torch.ones(3) + + +class TestExperienceSourceDataset(TestCase): + def train_batch(self): + """Returns an iterator used for testing""" + return iter([i for i in range(100)]) + + def test_iterator(self): + """Tests that the iterator returns batches correctly""" + source = ExperienceSourceDataset(self.train_batch) + batch_size = 10 + data_loader = DataLoader(source, batch_size=batch_size) + + for idx, batch in enumerate(data_loader): + self.assertEqual(len(batch), batch_size) + self.assertEqual(batch[0], 0) + self.assertEqual(batch[5], 5) + break + + +class TestBaseExperienceSource(TestCase): + def setUp(self) -> None: + self.net = Mock() + self.agent = DummyAgent(net=self.net) + self.env = gym.make("CartPole-v0") + self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + self.source = DummyExperienceSource(self.env, self.agent) + self.s1 = torch.ones(3) + self.s2 = torch.zeros(3) + + def test_dummy_base_class(self): + """Tests that base class is initialized correctly""" + self.assertTrue(isinstance(self.source.env, gym.Env)) + self.assertTrue(isinstance(self.source.agent, Agent)) + out = next(iter(self.source)) + self.assertTrue(torch.all(out.eq(torch.ones(3)))) + + +class TestExperienceSource(TestCase): + def setUp(self) -> None: + self.net = Mock() + self.agent = DummyAgent(net=self.net) + self.env = [gym.make("CartPole-v0") for _ in range(2)] + self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + self.source = ExperienceSource(self.env, self.agent, n_steps=1) + + self.s1 = torch.ones(3) + self.s2 = torch.zeros(3) + + self.mock_env = Mock() + self.mock_env.step = Mock(return_value=(self.s1, 1, False, Mock())) + + self.exp1 = Experience(state=self.s1, action=1, reward=1, done=False, new_state=self.s2) + self.exp2 = Experience(state=self.s1, action=1, reward=1, done=False, new_state=self.s2) + + def test_init_source(self): + """Test that experience source is setup correctly""" + self.assertEqual(self.source.n_steps, 1) + self.assertIsInstance(self.source.pool, list) + + self.assertEqual(len(self.source.states), len(self.source.pool)) + self.assertEqual(len(self.source.histories), len(self.source.pool)) + self.assertEqual(len(self.source.cur_rewards), len(self.source.pool)) + self.assertEqual(len(self.source.cur_steps), len(self.source.pool)) + + def test_init_single_env(self): + """Test that if a single env is passed that it is wrapped in a list""" + self.source = ExperienceSource(self.mock_env, self.agent) + 
self.assertIsInstance(self.source.pool, list) + + def test_env_actions(self): + """Assert that a list of actions of shape [num_envs, action_len] is returned""" + actions = self.source.env_actions(self.device) + self.assertEqual(len(actions), len(self.env)) + self.assertTrue(isinstance(actions[0], list)) + + def test_env_step(self): + """Assert that taking a step through a single environment yields a list of history steps""" + actions = [[1], [1]] + env = self.env[0] + exp = self.source.env_step(0, env, actions[0]) + + self.assertTrue(isinstance(exp, Experience)) + + def test_source_next_single_env_single_step(self): + """Test that steps are executed correctly with one environment and 1 step""" + + self.env = [gym.make("CartPole-v0") for _ in range(1)] + self.source = ExperienceSource(self.env, self.agent, n_steps=1) + + for idx, exp in enumerate(self.source.runner(self.device)): + self.assertTrue(isinstance(exp, tuple)) + break + + def test_source_next_single_env_multi_step(self): + """Test that steps are executed correctly with one environment and 2 step""" + + self.env = [gym.make("CartPole-v0") for _ in range(1)] + n_steps = 4 + self.source = ExperienceSource(self.env, self.agent, n_steps=n_steps) + + for idx, exp in enumerate(self.source.runner(self.device)): + self.assertTrue(isinstance(exp, tuple)) + self.assertTrue(len(exp) == n_steps) + break + + def test_source_next_multi_env_single_step(self): + """Test that steps are executed correctly with 2 environment and 1 step""" + + for idx, exp in enumerate(self.source.runner(self.device)): + self.assertTrue(isinstance(exp, tuple)) + self.assertTrue(len(exp) == self.source.n_steps) + break + + def test_source_next_multi_env_multi_step(self): + """Test that steps are executed correctly with 2 environment and 2 step""" + self.source = ExperienceSource(self.env, self.agent, n_steps=2) + + for idx, exp in enumerate(self.source.runner(self.device)): + self.assertTrue(isinstance(exp, tuple)) + self.assertTrue(len(exp) == self.source.n_steps) + break + + def test_source_update_state(self): + """Test that after a step the state is updated""" + + self.env = [gym.make("CartPole-v0") for _ in range(1)] + self.source = ExperienceSource(self.env, self.agent, n_steps=2) + + for idx, exp in enumerate(self.source.runner(self.device)): + self.assertTrue(isinstance(exp, tuple)) + new = np.asarray(exp[-1].new_state) + old = np.asarray(self.source.states[0]) + self.assertTrue(np.array_equal(new, old)) + break + + def test_source_is_done_short_episode(self): + """Test that when done and the history is not full, to return the partial history""" + + self.mock_env.step = Mock(return_value=(self.s1, 1, True, Mock)) + + env = [self.mock_env for _ in range(1)] + self.source = ExperienceSource(env, self.agent, n_steps=2) + + for idx, exp in enumerate(self.source.runner(self.device)): + self.assertTrue(isinstance(exp, tuple)) + self.assertTrue(len(exp) == 1) + break + + def test_source_is_done_2step_episode(self): + """ + Test that when done and the history is full, return the full history, then start to return the tail of + the history + """ + + self.env = [self.mock_env] + self.source = ExperienceSource(self.env, self.agent, n_steps=2) + + self.mock_env.step = Mock(return_value=(self.s1, 1, True, Mock)) + + self.source.histories[0].append(self.exp1) + + for idx, exp in enumerate(self.source.runner(self.device)): + + self.assertTrue(isinstance(exp, tuple)) + + if idx == 0: + self.assertTrue(len(exp) == self.source.n_steps) + elif idx == 1: + self.assertTrue(len(exp) 
== self.source.n_steps - 1) + self.assertTrue(torch.equal(exp[0].new_state, self.s1)) + + break + + def test_source_is_done_metrics(self): + """Test that when done and the history is full, return the full history""" + + n_steps = 3 + n_envs = 2 + + self.mock_env.step = Mock(return_value=(self.s1, 1, True, Mock)) + + self.env = [self.mock_env for _ in range(2)] + self.source = ExperienceSource(self.env, self.agent, n_steps=3) + + history = self.source.histories[0] + history += [self.exp1, self.exp2, self.exp2] + + for idx, exp in enumerate(self.source.runner(self.device)): + + if idx == n_steps - 1: + self.assertEqual(self.source._total_rewards[0], 1) + self.assertEqual(self.source.total_steps[0], 1) + self.assertEqual(self.source.cur_rewards[0], 0) + self.assertEqual(self.source.cur_steps[0], 0) + elif idx == (3 * n_envs) - 1: + self.assertEqual(self.source.iter_idx, 1) + break + + def test_pop_total_rewards(self): + """Test that pop rewards returns correct rewards""" + self.source._total_rewards = [10, 20, 30] + + rewards = self.source.pop_total_rewards() + + self.assertEqual(rewards, [10, 20, 30]) + + +class TestDiscountedExperienceSource(TestCase): + def setUp(self) -> None: + self.net = Mock() + self.agent = DummyAgent(net=self.net) + self.env = [gym.make("CartPole-v0") for _ in range(2)] + self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + self.n_steps = 3 + self.gamma = 0.9 + self.source = DiscountedExperienceSource( + self.env, self.agent, n_steps=self.n_steps, gamma=self.gamma + ) + + self.state = torch.ones(3) + self.next_state = torch.zeros(3) + self.reward = 1 + + self.exp1 = Experience( + state=self.state, + action=1, + reward=self.reward, + done=False, + new_state=self.next_state, + ) + self.exp2 = Experience( + state=self.next_state, + action=1, + reward=self.reward, + done=False, + new_state=self.state, + ) + + self.env1 = Mock() + self.env1.step = Mock( + return_value=(self.next_state, self.reward, True, self.state) + ) + + def test_init(self): + """Test that experience source is setup correctly""" + self.assertEqual(self.source.n_steps, self.n_steps + 1) + self.assertEqual(self.source.steps, self.n_steps) + self.assertEqual(self.source.gamma, self.gamma) + + def test_source_step(self): + """Tests that the source returns a single experience""" + + for idx, exp in enumerate(self.source.runner(self.device)): + self.assertTrue(isinstance(exp, Experience)) + break + + def test_source_step_done(self): + """Tests that the source returns a single experience""" + + self.source = DiscountedExperienceSource( + self.env1, self.agent, n_steps=self.n_steps + ) + + self.source.histories[0].append(self.exp1) + self.source.histories[0].append(self.exp2) + + for idx, exp in enumerate(self.source.runner(self.device)): + self.assertTrue(isinstance(exp, Experience)) + self.assertTrue(torch.all(torch.eq(exp.new_state, self.next_state))) + break + + def test_source_discounted_return(self): + """ + Tests that the source returns a single experience with discounted rewards + + discounted returns: G(t) = R(t+1) + γ*R(t+2) + γ^2*R(t+3) ... 
+ γ^N-1*R(t+N) + """ + + self.source = DiscountedExperienceSource( + self.env1, self.agent, n_steps=self.n_steps + ) + + self.source.histories[0] += [self.exp1, self.exp2] + + discounted_reward = ( + self.exp1.reward + + (self.source.gamma * self.exp2.reward) + + (self.source.gamma * self.reward) ** 2 + ) + + for idx, exp in enumerate(self.source.runner(self.device)): + self.assertTrue(isinstance(exp, Experience)) + self.assertEqual(exp.reward, discounted_reward) + break diff --git a/tests/datasets/__init__.py b/tests/datasets/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/datasets/test_datasets.py b/tests/datasets/test_datasets.py new file mode 100644 index 0000000000..c7adda3cda --- /dev/null +++ b/tests/datasets/test_datasets.py @@ -0,0 +1,34 @@ +from pl_bolts.datasets import DummyDataset, RandomDataset, RandomDictDataset, RandomDictStringDataset +from torch.utils.data import DataLoader + + +def test_dummy_ds(tmpdir): + ds = DummyDataset((1, 2), num_samples=100) + dl = DataLoader(ds) + + for b in dl: + pass + + +def test_rand_ds(tmpdir): + ds = RandomDataset(32, num_samples=100) + dl = DataLoader(ds) + + for b in dl: + pass + + +def test_rand_dict_ds(tmpdir): + ds = RandomDictDataset(32, num_samples=100) + dl = DataLoader(ds) + + for b in dl: + pass + + +def test_rand_str_dict_ds(tmpdir): + ds = RandomDictStringDataset(32, num_samples=100) + dl = DataLoader(ds) + + for b in dl: + pass diff --git a/tests/losses/test_rl_loss.py b/tests/losses/test_rl_loss.py new file mode 100644 index 0000000000..e02965f84c --- /dev/null +++ b/tests/losses/test_rl_loss.py @@ -0,0 +1,51 @@ +""" +Test RL Loss Functions +""" + +from unittest import TestCase + +import numpy as np +import torch + +from pl_bolts.losses.rl import dqn_loss, double_dqn_loss, per_dqn_loss +from pl_bolts.models.rl.common.networks import CNN +from pl_bolts.models.rl.common.gym_wrappers import make_environment + + +class TestRLLoss(TestCase): + + def setUp(self) -> None: + + self.state = torch.rand(32, 4, 84, 84) + self.next_state = torch.rand(32, 4, 84, 84) + self.action = torch.ones([32]) + self.reward = torch.ones([32]) + self.done = torch.zeros([32]).long() + + self.batch = (self.state, self.action, self.reward, self.done, self.next_state) + + self.env = make_environment("PongNoFrameskip-v4") + self.obs_shape = self.env.observation_space.shape + self.n_actions = self.env.action_space.n + self.net = CNN(self.obs_shape, self.n_actions) + self.target_net = CNN(self.obs_shape, self.n_actions) + + def test_dqn_loss(self): + """Test the dqn loss function""" + + loss = dqn_loss(self.batch, self.net, self.target_net) + self.assertIsInstance(loss, torch.Tensor) + + def test_double_dqn_loss(self): + """Test the double dqn loss function""" + + loss = double_dqn_loss(self.batch, self.net, self.target_net) + self.assertIsInstance(loss, torch.Tensor) + + def test_per_dqn_loss(self): + """Test the double dqn loss function""" + prios = torch.ones([32]) + + loss, batch_weights = per_dqn_loss(self.batch, prios, self.net, self.target_net) + self.assertIsInstance(loss, torch.Tensor) + self.assertIsInstance(batch_weights, np.ndarray) diff --git a/tests/models/rl/__init__.py b/tests/models/rl/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/models/rl/integration/__init__.py b/tests/models/rl/integration/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/models/rl/integration/test_policy_models.py b/tests/models/rl/integration/test_policy_models.py new file 
mode 100644 index 0000000000..3c65af9d2e --- /dev/null +++ b/tests/models/rl/integration/test_policy_models.py @@ -0,0 +1,41 @@ +import argparse +from unittest import TestCase + +import pytorch_lightning as pl + +from pl_bolts.models.rl.reinforce_model import Reinforce +from pl_bolts.models.rl.vanilla_policy_gradient_model import VanillaPolicyGradient + + +class TestPolicyModels(TestCase): + + def setUp(self) -> None: + parent_parser = argparse.ArgumentParser(add_help=False) + parent_parser = VanillaPolicyGradient.add_model_specific_args(parent_parser) + args_list = [ + "--env", "CartPole-v0" + ] + self.hparams = parent_parser.parse_args(args_list) + + self.trainer = pl.Trainer( + gpus=0, + max_steps=100, + max_epochs=100, # Set this as the same as max steps to ensure that it doesn't stop early + val_check_interval=1, # This just needs 'some' value, does not effect training right now + fast_dev_run=True + ) + + def test_reinforce(self): + """Smoke test that the reinforce model runs""" + + model = Reinforce(self.hparams.env) + result = self.trainer.fit(model) + + self.assertEqual(result, 1) + + def test_policy_gradient(self): + """Smoke test that the policy gradient model runs""" + model = VanillaPolicyGradient(self.hparams.env) + result = self.trainer.fit(model) + + self.assertEqual(result, 1) diff --git a/tests/models/rl/integration/test_value_models.py b/tests/models/rl/integration/test_value_models.py new file mode 100644 index 0000000000..f3cbad43ad --- /dev/null +++ b/tests/models/rl/integration/test_value_models.py @@ -0,0 +1,74 @@ +import argparse +from unittest import TestCase + +import pytorch_lightning as pl + +from pl_bolts.models.rl.double_dqn_model import DoubleDQN +from pl_bolts.models.rl.dqn_model import DQN +from pl_bolts.models.rl.dueling_dqn_model import DuelingDQN +from pl_bolts.models.rl.noisy_dqn_model import NoisyDQN +from pl_bolts.models.rl.per_dqn_model import PERDQN + + +class TestValueModels(TestCase): + + def setUp(self) -> None: + parent_parser = argparse.ArgumentParser(add_help=False) + parent_parser = pl.Trainer.add_argparse_args(parent_parser) + parent_parser = DQN.add_model_specific_args(parent_parser) + args_list = [ + "--warm_start_size", "100", + "--gpus", "0", + "--env", "PongNoFrameskip-v4", + ] + self.hparams = parent_parser.parse_args(args_list) + + self.trainer = pl.Trainer( + gpus=self.hparams.gpus, + max_steps=100, + max_epochs=100, # Set this as the same as max steps to ensure that it doesn't stop early + val_check_interval=1, # This just needs 'some' value, does not effect training right now + fast_dev_run=True + ) + + def test_dqn(self): + """Smoke test that the DQN model runs""" + model = DQN(self.hparams.env, num_envs=5) + result = self.trainer.fit(model) + + self.assertEqual(result, 1) + + def test_double_dqn(self): + """Smoke test that the Double DQN model runs""" + model = DoubleDQN(self.hparams.env) + result = self.trainer.fit(model) + + self.assertEqual(result, 1) + + def test_dueling_dqn(self): + """Smoke test that the Dueling DQN model runs""" + model = DuelingDQN(self.hparams.env) + result = self.trainer.fit(model) + + self.assertEqual(result, 1) + + def test_noisy_dqn(self): + """Smoke test that the Noisy DQN model runs""" + model = NoisyDQN(self.hparams.env) + result = self.trainer.fit(model) + + self.assertEqual(result, 1) + + def test_per_dqn(self): + """Smoke test that the PER DQN model runs""" + model = PERDQN(self.hparams.env) + result = self.trainer.fit(model) + + self.assertEqual(result, 1) + + # def test_n_step_dqn(self): + # 
"""Smoke test that the N Step DQN model runs""" + # model = DQN(self.hparams.env, n_steps=self.hparams.n_steps) + # result = self.trainer.fit(model) + # + # self.assertEqual(result, 1) diff --git a/tests/models/rl/test_scripts.py b/tests/models/rl/test_scripts.py new file mode 100644 index 0000000000..af1d703897 --- /dev/null +++ b/tests/models/rl/test_scripts.py @@ -0,0 +1,104 @@ +from unittest import mock + +import pytest + + +@pytest.mark.parametrize('cli_args', ['--env PongNoFrameskip-v4' + ' --max_steps 10' + ' --fast_dev_run' + ' --warm_start_size 10' + ' --n_steps 2' + ' --batch_size 10']) +def test_cli_run_rl_dqn(cli_args): + """Test running CLI for an example with default params.""" + from pl_bolts.models.rl.dqn_model import cli_main + + cli_args = cli_args.split(' ') if cli_args else [] + with mock.patch("argparse._sys.argv", ["any.py"] + cli_args): + cli_main() + + +@pytest.mark.parametrize('cli_args', ['--env PongNoFrameskip-v4' + ' --max_steps 10' + ' --fast_dev_run' + ' --warm_start_size 10' + ' --n_steps 2' + ' --batch_size 10']) +def test_cli_run_rl_double_dqn(cli_args): + """Test running CLI for an example with default params.""" + from pl_bolts.models.rl.double_dqn_model import cli_main + + cli_args = cli_args.split(' ') if cli_args else [] + with mock.patch("argparse._sys.argv", ["any.py"] + cli_args): + cli_main() + + +@pytest.mark.parametrize('cli_args', ['--env PongNoFrameskip-v4' + ' --max_steps 10' + ' --fast_dev_run' + ' --warm_start_size 10' + ' --n_steps 2' + ' --batch_size 10']) +def test_cli_run_rl_dueling_dqn(cli_args): + """Test running CLI for an example with default params.""" + from pl_bolts.models.rl.dueling_dqn_model import cli_main + + cli_args = cli_args.split(' ') if cli_args else [] + with mock.patch("argparse._sys.argv", ["any.py"] + cli_args): + cli_main() + + +@pytest.mark.parametrize('cli_args', ['--env PongNoFrameskip-v4' + ' --max_steps 10' + ' --fast_dev_run' + ' --warm_start_size 10' + ' --n_steps 2' + ' --batch_size 10']) +def test_cli_run_rl_noisy_dqn(cli_args): + """Test running CLI for an example with default params.""" + from pl_bolts.models.rl.noisy_dqn_model import cli_main + + cli_args = cli_args.split(' ') if cli_args else [] + with mock.patch("argparse._sys.argv", ["any.py"] + cli_args): + cli_main() + + +@pytest.mark.parametrize('cli_args', ['--env PongNoFrameskip-v4' + ' --max_steps 10' + ' --fast_dev_run' + ' --warm_start_size 10' + ' --n_steps 2' + ' --batch_size 10']) +def test_cli_run_rl_per_dqn(cli_args): + """Test running CLI for an example with default params.""" + from pl_bolts.models.rl.per_dqn_model import cli_main + + cli_args = cli_args.split(' ') if cli_args else [] + with mock.patch("argparse._sys.argv", ["any.py"] + cli_args): + cli_main() + + +@pytest.mark.parametrize('cli_args', ['--env CartPole-v0' + ' --max_steps 10' + ' --fast_dev_run' + ' --batch_size 10']) +def test_cli_run_rl_reinforce(cli_args): + """Test running CLI for an example with default params.""" + from pl_bolts.models.rl.reinforce_model import cli_main + + cli_args = cli_args.split(' ') if cli_args else [] + with mock.patch("argparse._sys.argv", ["any.py"] + cli_args): + cli_main() + + +@pytest.mark.parametrize('cli_args', ['--env CartPole-v0' + ' --max_steps 10' + ' --fast_dev_run' + ' --batch_size 10']) +def test_cli_run_rl_vanilla_policy_gradient(cli_args): + """Test running CLI for an example with default params.""" + from pl_bolts.models.rl.vanilla_policy_gradient_model import cli_main + + cli_args = cli_args.split(' ') if cli_args else [] + 
with mock.patch("argparse._sys.argv", ["any.py"] + cli_args): + cli_main() diff --git a/tests/models/rl/unit/__init__.py b/tests/models/rl/unit/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/models/rl/unit/test_agents.py b/tests/models/rl/unit/test_agents.py new file mode 100644 index 0000000000..5d4214e59b --- /dev/null +++ b/tests/models/rl/unit/test_agents.py @@ -0,0 +1,62 @@ +"""Tests that the agent module works correctly""" +from unittest import TestCase +from unittest.mock import Mock + +import gym +import numpy as np +import torch + +from pl_bolts.models.rl.common.agents import Agent, PolicyAgent, ValueAgent + + +class TestAgents(TestCase): + + def setUp(self) -> None: + self.env = gym.make("CartPole-v0") + self.state = self.env.reset() + self.net = Mock() + + def test_base_agent(self): + agent = Agent(self.net) + action = agent(self.state, 'cuda:0') + self.assertIsInstance(action, list) + + +class TestValueAgent(TestCase): + + def setUp(self) -> None: + self.env = gym.make("CartPole-v0") + self.net = Mock(return_value=torch.Tensor([[0.0, 100.0]])) + self.state = [self.env.reset()] + self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + self.value_agent = ValueAgent(self.net, self.env.action_space.n) + + def test_value_agent(self): + + action = self.value_agent(self.state, self.device) + self.assertIsInstance(action, list) + self.assertIsInstance(action[0], int) + + def test_value_agent_get_action(self): + action = self.value_agent.get_action(self.state, self.device) + self.assertIsInstance(action, np.ndarray) + self.assertEqual(action[0], 1) + + def test_value_agent_random(self): + action = self.value_agent.get_random_action(self.state) + self.assertIsInstance(action[0], int) + + +class TestPolicyAgent(TestCase): + + def setUp(self) -> None: + self.env = gym.make("CartPole-v0") + self.net = Mock(return_value=torch.Tensor([[0.0, 100.0]])) + self.states = [self.env.reset()] + self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + + def test_policy_agent(self): + policy_agent = PolicyAgent(self.net) + action = policy_agent(self.states, self.device) + self.assertIsInstance(action, list) + self.assertEqual(action[0], 1) diff --git a/tests/models/rl/unit/test_memory.py b/tests/models/rl/unit/test_memory.py new file mode 100644 index 0000000000..12b89b232e --- /dev/null +++ b/tests/models/rl/unit/test_memory.py @@ -0,0 +1,286 @@ +from unittest import TestCase +from unittest.mock import Mock + +import numpy as np +import torch + +from pl_bolts.models.rl.common.memory import ReplayBuffer, Experience, PERBuffer, MultiStepBuffer, Buffer + + +class TestBuffer(TestCase): + + def train_batch(self): + """Returns an iterator used for testing""" + return iter([i for i in range(100)]) + + def setUp(self) -> None: + self.state = np.random.rand(4, 84, 84) + self.next_state = np.random.rand(4, 84, 84) + self.action = np.ones([1]) + self.reward = np.ones([1]) + self.done = np.zeros([1]) + self.experience = Experience(self.state, self.action, self.reward, self.done, self.next_state) + self.source = Mock() + self.source.step = Mock(return_value=(self.experience, torch.tensor(0), False)) + self.batch_size = 8 + self.buffer = Buffer(8) + + for _ in range(self.batch_size): + self.buffer.append(self.experience) + + def test_sample_batch(self): + """check that a sinlge sample is returned""" + sample = self.buffer.sample() + self.assertEqual(len(sample), 5) + self.assertEqual(sample[0].shape, (self.batch_size, 4, 84, 84)) + 
self.assertEqual(sample[1].shape, (self.batch_size, 1)) + self.assertEqual(sample[2].shape, (self.batch_size, 1)) + self.assertEqual(sample[3].shape, (self.batch_size, 1)) + self.assertEqual(sample[4].shape, (self.batch_size, 4, 84, 84)) + + +class TestReplayBuffer(TestCase): + + def setUp(self) -> None: + self.state = np.random.rand(32, 32) + self.next_state = np.random.rand(32, 32) + self.action = np.ones([1]) + self.reward = np.ones([1]) + self.done = np.zeros([1]) + self.experience = Experience(self.state, self.action, self.reward, self.done, self.next_state) + + self.source = Mock() + self.source.step = Mock(return_value=(self.experience, torch.tensor(0), False)) + self.warm_start = 10 + self.buffer = ReplayBuffer(20) + for _ in range(self.warm_start): + self.buffer.append(self.experience) + + def test_replay_buffer_append(self): + """Test that you can append to the replay buffer""" + + self.assertEqual(len(self.buffer), self.warm_start) + + self.buffer.append(self.experience) + + self.assertEqual(len(self.buffer), self.warm_start + 1) + + def test_replay_buffer_populate(self): + """Tests that the buffer is populated correctly with warm_start""" + self.assertEqual(len(self.buffer.buffer), self.warm_start) + + def test_replay_buffer_update(self): + """Tests that buffer append works correctly""" + batch_size = 3 + self.assertEqual(len(self.buffer.buffer), self.warm_start) + for i in range(batch_size): + self.buffer.append(self.experience) + self.assertEqual(len(self.buffer.buffer), self.warm_start + batch_size) + + def test_replay_buffer_sample(self): + """Test that you can sample from the buffer and the outputs are the correct shape""" + batch_size = 3 + + for i in range(10): + self.buffer.append(self.experience) + + batch = self.buffer.sample(batch_size) + + self.assertEqual(len(batch), 5) + + # states + states = batch[0] + self.assertEqual(states.shape, (batch_size, 32, 32)) + # action + actions = batch[1] + self.assertEqual(actions.shape, (batch_size, 1)) + # reward + rewards = batch[2] + self.assertEqual(rewards.shape, (batch_size, 1)) + # dones + dones = batch[3] + self.assertEqual(dones.shape, (batch_size, 1)) + # next states + next_states = batch[4] + self.assertEqual(next_states.shape, (batch_size, 32, 32)) + + +class TestPrioReplayBuffer(TestCase): + + def setUp(self) -> None: + self.buffer = PERBuffer(10) + + self.state = np.random.rand(32, 32) + self.next_state = np.random.rand(32, 32) + self.action = np.ones([1]) + self.reward = np.ones([1]) + self.done = np.zeros([1]) + self.experience = Experience(self.state, self.action, self.reward, self.done, self.next_state) + + def test_replay_buffer_append(self): + """Test that you can append to the replay buffer and the latest experience has max priority""" + + self.assertEqual(len(self.buffer), 0) + + self.buffer.append(self.experience) + + self.assertEqual(len(self.buffer), 1) + self.assertEqual(self.buffer.priorities[0], 1.0) + + def test_replay_buffer_sample(self): + """Test that you can sample from the buffer and the outputs are the correct shape""" + batch_size = 3 + + for i in range(10): + self.buffer.append(self.experience) + + batch, indices, weights = self.buffer.sample(batch_size) + + self.assertEqual(len(batch), 5) + self.assertEqual(len(indices), batch_size) + self.assertEqual(len(weights), batch_size) + + # states + states = batch[0] + self.assertEqual(states.shape, (batch_size, 32, 32)) + # action + actions = batch[1] + self.assertEqual(actions.shape, (batch_size, 1)) + # reward + rewards = batch[2] + 
self.assertEqual(rewards.shape, (batch_size, 1)) + # dones + dones = batch[3] + self.assertEqual(dones.shape, (batch_size, 1)) + # next states + next_states = batch[4] + self.assertEqual(next_states.shape, (batch_size, 32, 32)) + + +class TestMultiStepReplayBuffer(TestCase): + + def setUp(self) -> None: + self.gamma = 0.9 + self.buffer = MultiStepBuffer(capacity=10, n_steps=2, gamma=self.gamma) + + self.state = np.zeros([32, 32]) + self.state_02 = np.ones([32, 32]) + self.next_state = np.zeros([32, 32]) + self.next_state_02 = np.ones([32, 32]) + self.action = np.zeros([1]) + self.action_02 = np.ones([1]) + self.reward = np.zeros([1]) + self.reward_02 = np.ones([1]) + self.done = np.zeros([1]) + self.done_02 = np.zeros([1]) + + self.experience01 = Experience(self.state, self.action, self.reward, self.done, self.next_state) + self.experience02 = Experience(self.state_02, self.action_02, self.reward_02, self.done_02, self.next_state_02) + self.experience03 = Experience(self.state_02, self.action_02, self.reward_02, self.done_02, self.next_state_02) + + def test_append_single_experience_less_than_n(self): + """ + If a single experience is added and n > 1, nothing should be added to the buffer, as it is waiting for the + number of collected experiences to equal n + """ + self.assertEqual(len(self.buffer), 0) + + self.buffer.append(self.experience01) + + self.assertEqual(len(self.buffer), 0) + + def test_append_single_experience(self): + """ + If a single experience is added and n > 1, the experience should be kept in the short-term history but not + yet added to the full buffer + """ + self.assertEqual(len(self.buffer), 0) + + self.buffer.append(self.experience01) + + self.assertEqual(len(self.buffer.exp_history_queue), 0) + self.assertEqual(len(self.buffer.history), 1) + + def test_append_single_experience2(self): + """ + If a single experience is added and the number of experiences collected >= n, the multi-step experience should + be added to the full buffer.
+ """ + self.assertEqual(len(self.buffer), 0) + + self.buffer.append(self.experience01) + self.buffer.append(self.experience02) + + self.assertEqual(len(self.buffer.buffer), 1) + self.assertEqual(len(self.buffer.history), 2) + + def test_sample_single_experience(self): + """if there is only a single experience added, sample should return nothing""" + self.buffer.append(self.experience01) + + with self.assertRaises(Exception) as context: + _ = self.buffer.sample(batch_size=1) + + self.assertIsInstance(context.exception, Exception) + + def test_sample_multi_experience(self): + """if there is only a single experience added, sample should return nothing""" + self.buffer.append(self.experience01) + self.buffer.append(self.experience02) + + batch = self.buffer.sample(batch_size=1) + + next_state = batch[4] + self.assertEqual(next_state.all(), self.next_state_02.all()) + + def test_get_transition_info_2_step(self): + """Test that the accumulated experience is correct and""" + self.buffer.append(self.experience01) + self.buffer.append(self.experience02) + + reward = self.buffer.buffer[0].reward + next_state = self.buffer.buffer[0].new_state + done = self.buffer.buffer[0].done + + reward_gt = self.experience01.reward + (self.gamma * self.experience02.reward) * (1 - done) + + self.assertEqual(reward, reward_gt) + self.assertEqual(next_state.all(), self.next_state_02.all()) + self.assertEqual(self.experience02.done, done) + + def test_get_transition_info_3_step(self): + """Test that the accumulated experience is correct with multi step""" + self.buffer = MultiStepBuffer(capacity=10, n_steps=3, gamma=self.gamma) + + self.buffer.append(self.experience01) + self.buffer.append(self.experience02) + self.buffer.append(self.experience02) + + reward = self.buffer.buffer[0].reward + next_state = self.buffer.buffer[0].new_state + done = self.buffer.buffer[0].done + + reward_01 = self.experience02.reward + self.gamma * self.experience03.reward * (1 - done) + reward_gt = self.experience01.reward + self.gamma * reward_01 * (1 - done) + + self.assertEqual(reward, reward_gt) + self.assertEqual(next_state.all(), self.next_state_02.all()) + self.assertEqual(self.experience03.done, done) + + def test_sample_3_step(self): + """Test that final output of the 3 step sample is correct""" + self.buffer = MultiStepBuffer(capacity=10, n_steps=3, gamma=self.gamma) + + self.buffer.append(self.experience01) + self.buffer.append(self.experience02) + self.buffer.append(self.experience02) + + reward_gt = 1.71 + + batch = self.buffer.sample(1) + + self.assertEqual(batch[0].all(), self.experience01.state.all()) + self.assertEqual(batch[1], self.experience01.action) + self.assertEqual(batch[2], reward_gt) + self.assertEqual(batch[3], self.experience02.done) + self.assertEqual(batch[4].all(), self.experience02.new_state.all()) diff --git a/tests/models/rl/unit/test_reinforce.py b/tests/models/rl/unit/test_reinforce.py new file mode 100644 index 0000000000..655dc2bd54 --- /dev/null +++ b/tests/models/rl/unit/test_reinforce.py @@ -0,0 +1,65 @@ +import argparse +from unittest import TestCase + +import gym +import numpy as np +import torch + +from pl_bolts.datamodules.experience_source import DiscountedExperienceSource +from pl_bolts.models.rl.common.agents import Agent +from pl_bolts.models.rl.common.networks import MLP +from pl_bolts.models.rl.common.gym_wrappers import ToTensor +from pl_bolts.models.rl.reinforce_model import Reinforce + + +class TestReinforce(TestCase): + + def setUp(self) -> None: + self.env = 
ToTensor(gym.make("CartPole-v0")) + self.obs_shape = self.env.observation_space.shape + self.n_actions = self.env.action_space.n + self.net = MLP(self.obs_shape, self.n_actions) + self.agent = Agent(self.net) + self.exp_source = DiscountedExperienceSource(self.env, self.agent) + + parent_parser = argparse.ArgumentParser(add_help=False) + parent_parser = Reinforce.add_model_specific_args(parent_parser) + args_list = [ + "--env", "CartPole-v0", + "--batch_size", "32", + "--gamma", "0.99" + ] + self.hparams = parent_parser.parse_args(args_list) + self.model = Reinforce(**vars(self.hparams)) + + self.rl_dataloader = self.model.train_dataloader() + + def test_loss(self): + """Test the reinforce loss function""" + + batch_states = torch.rand(32, 4) + batch_actions = torch.rand(32).long() + batch_qvals = torch.rand(32) + + loss = self.model.loss(batch_states, batch_actions, batch_qvals) + + self.assertIsInstance(loss, torch.Tensor) + + def test_get_qvals(self): + """Test that given an batch of episodes that it will return a list of qvals for each episode""" + + batch_qvals = [] + rewards = np.ones(32) + out = self.model.calc_qvals(rewards) + batch_qvals.append(out) + + self.assertIsInstance(batch_qvals[0][0], float) + self.assertEqual(batch_qvals[0][0], (batch_qvals[0][1] * self.hparams.gamma) + 1.0) + + def test_calc_q_vals(self): + rewards = np.ones(4) + gt_qvals = [3.9403989999999998, 2.9701, 1.99, 1.0] + + qvals = self.model.calc_qvals(rewards) + + self.assertEqual(gt_qvals, qvals) diff --git a/tests/models/rl/unit/test_vpg.py b/tests/models/rl/unit/test_vpg.py new file mode 100644 index 0000000000..0cbdb5a7c8 --- /dev/null +++ b/tests/models/rl/unit/test_vpg.py @@ -0,0 +1,56 @@ +import argparse +from unittest import TestCase + +import gym +import torch + +from pl_bolts.models.rl.common.agents import Agent +from pl_bolts.models.rl.common.networks import MLP +from pl_bolts.models.rl.common.gym_wrappers import ToTensor +from pl_bolts.models.rl.vanilla_policy_gradient_model import VanillaPolicyGradient + + +class TestPolicyGradient(TestCase): + + def setUp(self) -> None: + self.env = ToTensor(gym.make("CartPole-v0")) + self.obs_shape = self.env.observation_space.shape + self.n_actions = self.env.action_space.n + self.net = MLP(self.obs_shape, self.n_actions) + self.agent = Agent(self.net) + + parent_parser = argparse.ArgumentParser(add_help=False) + parent_parser = VanillaPolicyGradient.add_model_specific_args(parent_parser) + args_list = [ + "--env", "CartPole-v0", + "--batch_size", "32" + ] + self.hparams = parent_parser.parse_args(args_list) + self.model = VanillaPolicyGradient(**vars(self.hparams)) + + def test_loss(self): + """Test the reinforce loss function""" + + batch_states = torch.rand(32, 4) + batch_actions = torch.rand(32).long() + batch_qvals = torch.rand(32) + + loss = self.model.loss(batch_states, batch_actions, batch_qvals) + + self.assertIsInstance(loss, torch.Tensor) + + def test_train_batch(self): + """Tests that a single batch generates correctly""" + + self.model.n_steps = 4 + self.model.batch_size = 1 + xp_dataloader = self.model.train_dataloader() + + batch = next(iter(xp_dataloader)) + self.assertEqual(len(batch), 3) + self.assertEqual(len(batch[0]), self.model.batch_size) + self.assertTrue(isinstance(batch, list)) + self.assertIsInstance(batch[0], torch.Tensor) + self.assertIsInstance(batch[1], list) + self.assertIsInstance(batch[1][0], torch.Tensor) + self.assertIsInstance(batch[2], torch.Tensor) diff --git a/tests/models/rl/unit/test_wrappers.py 
b/tests/models/rl/unit/test_wrappers.py new file mode 100644 index 0000000000..31e84ada49 --- /dev/null +++ b/tests/models/rl/unit/test_wrappers.py @@ -0,0 +1,19 @@ +from unittest import TestCase + +import gym +import torch + +from pl_bolts.models.rl.common.gym_wrappers import ToTensor + + +class TestToTensor(TestCase): + + def setUp(self) -> None: + self.env = ToTensor(gym.make("CartPole-v0")) + + def test_wrapper(self): + state = self.env.reset() + self.assertIsInstance(state, torch.Tensor) + + new_state, _, _, _ = self.env.step(1) + self.assertIsInstance(new_state, torch.Tensor) diff --git a/tests/models/test_detection.py b/tests/models/test_detection.py index 61edf2e875..a312fbc9d7 100644 --- a/tests/models/test_detection.py +++ b/tests/models/test_detection.py @@ -3,7 +3,7 @@ import torch from torch.utils.data import DataLoader -from pl_bolts.datamodules import DummyDetectionDataset +from pl_bolts.datasets import DummyDetectionDataset from pl_bolts.models.detection import FasterRCNN diff --git a/tests/models/test_mnist_templates.py b/tests/models/test_mnist_templates.py index 7099212cb2..0c8867eb03 100644 --- a/tests/models/test_mnist_templates.py +++ b/tests/models/test_mnist_templates.py @@ -7,11 +7,11 @@ def test_mnist(tmpdir): seed_everything() - model = LitMNIST(data_dir=tmpdir) + model = LitMNIST(data_dir=tmpdir, num_workers=0) trainer = pl.Trainer(limit_train_batches=0.01, limit_val_batches=0.01, max_epochs=1, limit_test_batches=0.01, default_root_dir=tmpdir) trainer.fit(model) trainer.test(model) - loss = trainer.callback_metrics['loss'] + loss = trainer.callback_metrics['train_loss'] - assert loss <= 2.0, 'mnist failed' + assert loss <= 2.2, 'mnist failed' diff --git a/tests/models/test_vision_models.py b/tests/models/test_vision_models.py index 0455a76320..73af207f1a 100644 --- a/tests/models/test_vision_models.py +++ b/tests/models/test_vision_models.py @@ -4,6 +4,7 @@ from pl_bolts.datamodules import MNISTDataModule, FashionMNISTDataModule from pl_bolts.models import GPT2, ImageGPT, UNet + def test_igpt(tmpdir): pl.seed_everything(0) dm = MNISTDataModule(tmpdir, normalize=False) @@ -53,4 +54,3 @@ def test_unet(tmpdir): model = UNet(num_classes=2) y = model(x) assert y.shape == torch.Size([10, 2, 28, 28]) -
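For reference, the expected values asserted in the multi-step buffer and Reinforce q-value tests above follow from the discounted-return formula r_0 + gamma * r_1 + gamma^2 * r_2 + ... The standalone sketch below (helper names are illustrative and not part of the pl_bolts API) reproduces 1.71 from test_sample_3_step and the gt_qvals list from test_calc_q_vals, assuming the buffer and calc_qvals accumulate rewards exactly this way.

# Standalone sketch, not part of the diff: illustrative helpers for the
# discounted-return arithmetic asserted in the tests above.
from typing import List


def n_step_return(rewards: List[float], gamma: float) -> float:
    """Single n-step return over a window of rewards: r_0 + gamma*r_1 + gamma^2*r_2 + ..."""
    return sum((gamma ** step) * reward for step, reward in enumerate(rewards))


def rewards_to_go(rewards: List[float], gamma: float) -> List[float]:
    """Discounted reward-to-go for every step of an episode, computed from the end backwards."""
    returns, running = [], 0.0
    for reward in reversed(rewards):
        running = reward + gamma * running
        returns.append(running)
    return list(reversed(returns))


print(n_step_return([0.0, 1.0, 1.0], gamma=0.9))  # 1.71, as in test_sample_3_step
print(rewards_to_go([1.0] * 4, gamma=0.99))       # [3.940399..., 2.9701, 1.99, 1.0], as in test_calc_q_vals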