From c10835c61796ded88e00c5bd46097e95feb95656 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 4 Dec 2020 22:42:53 +0100 Subject: [PATCH 01/16] WIP. --- rllib/agents/ppo/__init__.py | 2 + rllib/agents/ppo/ppo.py | 3 + rllib/agents/ppo/ppo_jax_policy.py | 196 ++++++ rllib/agents/ppo/ppo_torch_policy.py | 12 +- rllib/agents/ppo/tests/test_ppo.py | 8 +- rllib/models/jax/fcnet.py | 6 +- rllib/models/modelv2.py | 14 +- rllib/policy/__init__.py | 4 + rllib/policy/jax/__init__.py | 0 rllib/policy/jax/jax_policy.py | 603 ++++++++++++++++++ rllib/policy/jax/jax_policy_template.py | 370 +++++++++++ rllib/policy/policy_template.py | 402 ++++++++++++ rllib/policy/torch_policy_template.py | 3 + rllib/utils/__init__.py | 13 + rllib/utils/exploration/curiosity.py | 3 +- .../utils/exploration/stochastic_sampling.py | 22 +- rllib/utils/framework.py | 8 +- rllib/utils/jax_ops.py | 32 + rllib/utils/test_utils.py | 2 +- 19 files changed, 1663 insertions(+), 40 deletions(-) create mode 100644 rllib/agents/ppo/ppo_jax_policy.py create mode 100644 rllib/policy/jax/__init__.py create mode 100644 rllib/policy/jax/jax_policy.py create mode 100644 rllib/policy/jax/jax_policy_template.py create mode 100644 rllib/policy/policy_template.py create mode 100644 rllib/utils/jax_ops.py diff --git a/rllib/agents/ppo/__init__.py b/rllib/agents/ppo/__init__.py index 1207737074e0..4d63ecdc6216 100644 --- a/rllib/agents/ppo/__init__.py +++ b/rllib/agents/ppo/__init__.py @@ -1,4 +1,5 @@ from ray.rllib.agents.ppo.ppo import PPOTrainer, DEFAULT_CONFIG +from ray.rllib.agents.ppo.ppo_jax_policy import PPOJAXPolicy from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy from ray.rllib.agents.ppo.ppo_torch_policy import PPOTorchPolicy from ray.rllib.agents.ppo.appo import APPOTrainer @@ -8,6 +9,7 @@ "APPOTrainer", "DDPPOTrainer", "DEFAULT_CONFIG", + "PPOJAXPolicy", "PPOTFPolicy", "PPOTorchPolicy", "PPOTrainer", diff --git a/rllib/agents/ppo/ppo.py b/rllib/agents/ppo/ppo.py index c8f6db43f9af..0b4a1ab93b33 100644 --- a/rllib/agents/ppo/ppo.py +++ b/rllib/agents/ppo/ppo.py @@ -158,6 +158,9 @@ def get_policy_class(config: TrainerConfigDict) -> Optional[Type[Policy]]: if config["framework"] == "torch": from ray.rllib.agents.ppo.ppo_torch_policy import PPOTorchPolicy return PPOTorchPolicy + elif config["framework"] == "jax": + from ray.rllib.agents.ppo.ppo_jax_policy import PPOJAXPolicy + return PPOJAXPolicy class UpdateKL: diff --git a/rllib/agents/ppo/ppo_jax_policy.py b/rllib/agents/ppo/ppo_jax_policy.py new file mode 100644 index 000000000000..99c928b8e53d --- /dev/null +++ b/rllib/agents/ppo/ppo_jax_policy.py @@ -0,0 +1,196 @@ +""" +JAX policy class used for PPO. 
+""" +import gym +import logging +import numpy as np +from typing import List, Type, Union + +import ray +from ray.rllib.agents.a3c.a3c_torch_policy import apply_grad_clipping +from ray.rllib.agents.ppo.ppo_tf_policy import postprocess_ppo_gae, \ + setup_config +from ray.rllib.agents.ppo.ppo_torch_policy import vf_preds_fetches, \ + KLCoeffMixin, kl_and_loss_stats +from ray.rllib.evaluation.postprocessing import Postprocessing +from ray.rllib.models.modelv2 import ModelV2 +from ray.rllib.models.torch.torch_action_dist import TorchDistributionWrapper +from ray.rllib.policy.policy import Policy +from ray.rllib.policy.policy_template import build_policy_class +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.policy.torch_policy import EntropyCoeffSchedule, \ + LearningRateSchedule +from ray.rllib.utils.framework import try_import_jax +from ray.rllib.utils.jax_ops import explained_variance, sequence_mask +from ray.rllib.utils.typing import TensorType, TrainerConfigDict + +jax, flax = try_import_jax() +jnp = None +if jax: + import jax.numpy as jnp + +logger = logging.getLogger(__name__) + + +def ppo_surrogate_loss( + policy: Policy, model: ModelV2, + dist_class: Type[TorchDistributionWrapper], + train_batch: SampleBatch) -> Union[TensorType, List[TensorType]]: + """Constructs the loss for Proximal Policy Objective. + + Args: + policy (Policy): The Policy to calculate the loss for. + model (ModelV2): The Model to calculate the loss for. + dist_class (Type[ActionDistribution]: The action distr. class. + train_batch (SampleBatch): The training data. + + Returns: + Union[TensorType, List[TensorType]]: A single loss tensor or a list + of loss tensors. + """ + logits, state = model.from_batch(train_batch, is_training=True) + curr_action_dist = dist_class(logits, model) + + # RNN case: Mask away 0-padded chunks at end of time axis. + if state: + max_seq_len = jnp.maximum(train_batch["seq_lens"]) + mask = sequence_mask( + train_batch["seq_lens"], + max_seq_len, + time_major=model.is_time_major()) + mask = jnp.reshape(mask, [-1]) + num_valid = jnp.sum(mask) + + def reduce_mean_valid(t): + return jnp.sum(t[mask]) / num_valid + + # non-RNN case: No masking. 
+ else: + mask = None + reduce_mean_valid = jnp.mean + + prev_action_dist = dist_class(train_batch[SampleBatch.ACTION_DIST_INPUTS], + model) + + logp_ratio = jnp.exp( + curr_action_dist.logp(train_batch[SampleBatch.ACTIONS]) - + train_batch[SampleBatch.ACTION_LOGP]) + action_kl = prev_action_dist.kl(curr_action_dist) + mean_kl = reduce_mean_valid(action_kl) + + curr_entropy = curr_action_dist.entropy() + mean_entropy = reduce_mean_valid(curr_entropy) + + surrogate_loss = jnp.minimum( + train_batch[Postprocessing.ADVANTAGES] * logp_ratio, + train_batch[Postprocessing.ADVANTAGES] * jnp.clip( + logp_ratio, 1 - policy.config["clip_param"], + 1 + policy.config["clip_param"])) + mean_policy_loss = reduce_mean_valid(-surrogate_loss) + + if policy.config["use_gae"]: + prev_value_fn_out = train_batch[SampleBatch.VF_PREDS] + value_fn_out = model.value_function() + vf_loss1 = jnp.square( + value_fn_out - train_batch[Postprocessing.VALUE_TARGETS]) + vf_clipped = prev_value_fn_out + jnp.clip( + value_fn_out - prev_value_fn_out, -policy.config["vf_clip_param"], + policy.config["vf_clip_param"]) + vf_loss2 = jnp.square( + vf_clipped - train_batch[Postprocessing.VALUE_TARGETS]) + vf_loss = jnp.maximum(vf_loss1, vf_loss2) + mean_vf_loss = reduce_mean_valid(vf_loss) + total_loss = reduce_mean_valid( + -surrogate_loss + policy.kl_coeff * action_kl + + policy.config["vf_loss_coeff"] * vf_loss - + policy.entropy_coeff * curr_entropy) + else: + mean_vf_loss = 0.0 + total_loss = reduce_mean_valid(-surrogate_loss + + policy.kl_coeff * action_kl - + policy.entropy_coeff * curr_entropy) + + # Store stats in policy for stats_fn. + policy._total_loss = total_loss + policy._mean_policy_loss = mean_policy_loss + policy._mean_vf_loss = mean_vf_loss + policy._vf_explained_var = explained_variance( + train_batch[Postprocessing.VALUE_TARGETS], + policy.model.value_function()) + policy._mean_entropy = mean_entropy + policy._mean_kl = mean_kl + + return total_loss + + +class ValueNetworkMixin: + """Assigns the `_value()` method to the PPOPolicy. + + This way, Policy can call `_value()` to get the current VF estimate on a + single(!) observation (as done in `postprocess_trajectory_fn`). + Note: When doing this, an actual forward pass is being performed. + This is different from only calling `model.value_function()`, where + the result of the most recent forward pass is being used to return an + already calculated tensor. + """ + + def __init__(self, obs_space, action_space, config): + # When doing GAE, we need the value function estimate on the + # observation. + if config["use_gae"]: + + def value(ob, prev_action, prev_reward, *state): + model_out, _ = self.model({ + SampleBatch.CUR_OBS: jnp.asarray([ob]), + SampleBatch.PREV_ACTIONS: jnp.asarray([prev_action]), + SampleBatch.PREV_REWARDS: jnp.asarray([prev_reward]), + "is_training": False, + }, [jnp.asarray([s]) for s in state], jnp.asarray([1])) + # [0] = remove the batch dim. + return self.model.value_function()[0] + + # When not doing GAE, we do not require the value function's output. + else: + + def value(ob, prev_action, prev_reward, *state): + return 0.0 + + self._value = value + + +def setup_mixins(policy: Policy, obs_space: gym.spaces.Space, + action_space: gym.spaces.Space, + config: TrainerConfigDict) -> None: + """Call all mixin classes' constructors before PPOPolicy initialization. + + Args: + policy (Policy): The Policy object. + obs_space (gym.spaces.Space): The Policy's observation space. + action_space (gym.spaces.Space): The Policy's action space. 
+ config (TrainerConfigDict): The Policy's config. + """ + ValueNetworkMixin.__init__(policy, obs_space, action_space, config) + KLCoeffMixin.__init__(policy, config) + EntropyCoeffSchedule.__init__(policy, config["entropy_coeff"], + config["entropy_coeff_schedule"]) + LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"]) + + +# Build a child class of `JAXPolicy`, given the custom functions defined +# above. +PPOJAXPolicy = build_policy_class( + name="PPOJAXPolicy", + framework="jax", + get_default_config=lambda: ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, + loss_fn=ppo_surrogate_loss, + stats_fn=kl_and_loss_stats, + extra_action_out_fn=vf_preds_fetches, + postprocess_fn=postprocess_ppo_gae, + extra_grad_process_fn=apply_grad_clipping, + before_init=setup_config, + before_loss_init=setup_mixins, + mixins=[ + LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin, + ValueNetworkMixin + ], +) diff --git a/rllib/agents/ppo/ppo_torch_policy.py b/rllib/agents/ppo/ppo_torch_policy.py index 58637fa0a64b..a21930d0ae8c 100644 --- a/rllib/agents/ppo/ppo_torch_policy.py +++ b/rllib/agents/ppo/ppo_torch_policy.py @@ -14,10 +14,10 @@ from ray.rllib.models.modelv2 import ModelV2 from ray.rllib.models.torch.torch_action_dist import TorchDistributionWrapper from ray.rllib.policy.policy import Policy +from ray.rllib.policy.policy_template import build_policy_class from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.policy.torch_policy import EntropyCoeffSchedule, \ LearningRateSchedule -from ray.rllib.policy.torch_policy_template import build_torch_policy from ray.rllib.utils.framework import try_import_torch from ray.rllib.utils.torch_ops import convert_to_torch_tensor, \ explained_variance, sequence_mask @@ -110,6 +110,9 @@ def reduce_mean_valid(t): policy._total_loss = total_loss policy._mean_policy_loss = mean_policy_loss policy._mean_vf_loss = mean_vf_loss + policy._vf_explained_var = explained_variance( + train_batch[Postprocessing.VALUE_TARGETS], + policy.model.value_function()) policy._mean_entropy = mean_entropy policy._mean_kl = mean_kl @@ -133,9 +136,7 @@ def kl_and_loss_stats(policy: Policy, "total_loss": policy._total_loss, "policy_loss": policy._mean_policy_loss, "vf_loss": policy._mean_vf_loss, - "vf_explained_var": explained_variance( - train_batch[Postprocessing.VALUE_TARGETS], - policy.model.value_function()), + "vf_explained_var": policy._vf_explained_var, "kl": policy._mean_kl, "entropy": policy._mean_entropy, "entropy_coeff": policy.entropy_coeff, @@ -256,8 +257,9 @@ def setup_mixins(policy: Policy, obs_space: gym.spaces.Space, # Build a child class of `TorchPolicy`, given the custom functions defined # above. 
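The clipped-surrogate objective that `ppo_surrogate_loss` assembles above can be exercised outside RLlib. Below is a minimal, standalone sketch in plain `jax.numpy` (KL and entropy terms omitted); the array arguments and the optional `valid_mask` are illustrative stand-ins for the corresponding `train_batch` columns and the RNN padding mask, not RLlib APIs.

import jax.numpy as jnp


def clipped_surrogate_loss(logp, old_logp, advantages, values, old_values,
                           value_targets, clip_param=0.3, vf_clip_param=10.0,
                           vf_loss_coeff=1.0, valid_mask=None):
    # Masked mean (RNN case) vs. plain mean, mirroring `reduce_mean_valid`.
    if valid_mask is not None:
        def mean(t):
            return jnp.sum(t * valid_mask) / jnp.sum(valid_mask)
    else:
        mean = jnp.mean

    # Probability ratio pi_new(a|s) / pi_old(a|s), computed in log space.
    ratio = jnp.exp(logp - old_logp)
    # Clipped surrogate objective; negated because we minimize.
    surrogate = jnp.minimum(
        advantages * ratio,
        advantages * jnp.clip(ratio, 1.0 - clip_param, 1.0 + clip_param))
    policy_loss = -mean(surrogate)

    # Clipped value-function loss, as in the `use_gae` branch above.
    vf_loss1 = jnp.square(values - value_targets)
    vf_clipped = old_values + jnp.clip(values - old_values, -vf_clip_param,
                                       vf_clip_param)
    vf_loss2 = jnp.square(vf_clipped - value_targets)
    vf_loss = mean(jnp.maximum(vf_loss1, vf_loss2))

    return policy_loss + vf_loss_coeff * vf_loss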
-PPOTorchPolicy = build_torch_policy( +PPOTorchPolicy = build_policy_class( name="PPOTorchPolicy", + framework="torch", get_default_config=lambda: ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, loss_fn=ppo_surrogate_loss, stats_fn=kl_and_loss_stats, diff --git a/rllib/agents/ppo/tests/test_ppo.py b/rllib/agents/ppo/tests/test_ppo.py index c00cd36ba475..f6f99732765d 100644 --- a/rllib/agents/ppo/tests/test_ppo.py +++ b/rllib/agents/ppo/tests/test_ppo.py @@ -64,7 +64,7 @@ def on_train_result(self, *, trainer, result: dict, **kwargs): class TestPPO(unittest.TestCase): @classmethod def setUpClass(cls): - ray.init() + ray.init(local_mode=True)#TODO @classmethod def tearDownClass(cls): @@ -84,10 +84,10 @@ def test_ppo_compilation_and_lr_schedule(self): config["train_batch_size"] = 128 num_iterations = 2 - for _ in framework_iterator(config): - for env in ["CartPole-v0", "MsPacmanNoFrameskip-v4"]: + for _ in framework_iterator(config, frameworks="jax"):#TODO + for env in ["CartPole-v0"]:#, "MsPacmanNoFrameskip-v4"]: print("Env={}".format(env)) - for lstm in [True, False]: + for lstm in [False]:#True, False]: print("LSTM={}".format(lstm)) config["model"]["use_lstm"] = lstm config["model"]["lstm_use_prev_action"] = lstm diff --git a/rllib/models/jax/fcnet.py b/rllib/models/jax/fcnet.py index 1cec5eb5e8a6..c2e0d50eda6c 100644 --- a/rllib/models/jax/fcnet.py +++ b/rllib/models/jax/fcnet.py @@ -119,7 +119,9 @@ def forward(self, input_dict, state, seq_lens): def value_function(self): assert self._features is not None, "must call forward() first" if self._value_branch_separate: - return self._value_branch( - self._value_branch_separate(self._last_flat_in)).squeeze(1) + x = self._last_flat_in + for layer in self._value_branch_separate: + x = layer(x) + return self._value_branch(x).squeeze(1) else: return self._value_branch(self._features).squeeze(1) diff --git a/rllib/models/modelv2.py b/rllib/models/modelv2.py index 21bc139d4d6c..79d3d23ae1e4 100644 --- a/rllib/models/modelv2.py +++ b/rllib/models/modelv2.py @@ -9,6 +9,7 @@ from ray.rllib.models.repeated_values import RepeatedValues from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.policy.view_requirement import ViewRequirement +from ray.rllib.utils import NullContextManager from ray.rllib.utils.annotations import DeveloperAPI, PublicAPI from ray.rllib.utils.framework import try_import_tf, try_import_torch, \ TensorType @@ -316,19 +317,6 @@ def is_time_major(self) -> bool: return self.time_major is True -class NullContextManager: - """No-op context manager""" - - def __init__(self): - pass - - def __enter__(self): - pass - - def __exit__(self, *args): - pass - - @DeveloperAPI def flatten(obs: TensorType, framework: str) -> TensorType: """Flatten the given tensor.""" diff --git a/rllib/policy/__init__.py b/rllib/policy/__init__.py index 348fe187da4c..ded33e1cac5f 100644 --- a/rllib/policy/__init__.py +++ b/rllib/policy/__init__.py @@ -1,13 +1,17 @@ from ray.rllib.policy.policy import Policy +from ray.rllib.policy.jax.jax_policy import JAXPolicy from ray.rllib.policy.torch_policy import TorchPolicy from ray.rllib.policy.tf_policy import TFPolicy +from ray.rllib.policy.policy_template import build_policy_class from ray.rllib.policy.torch_policy_template import build_torch_policy from ray.rllib.policy.tf_policy_template import build_tf_policy __all__ = [ "Policy", + "JAXPolicy", "TFPolicy", "TorchPolicy", + "build_policy_class", "build_tf_policy", "build_torch_policy", ] diff --git a/rllib/policy/jax/__init__.py b/rllib/policy/jax/__init__.py 
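Both the torch loss above and the new JAX loss cache `policy._vf_explained_var` through an `explained_variance` helper; the patch adds one in `rllib/utils/jax_ops.py`, whose body is not part of this hunk. A minimal `jax.numpy` version of that statistic, assuming the usual definition 1 - Var[y - y_pred] / Var[y] (not necessarily the exact implementation in `jax_ops.py`):

import jax.numpy as jnp


def explained_variance(y, pred):
    # 1.0 means the value function explains the targets perfectly,
    # ~0.0 means it is no better than predicting the mean, and the
    # result goes negative for predictions worse than that.
    y_var = jnp.var(y)
    diff_var = jnp.var(y - pred)
    # Guard against a constant target signal.
    return jnp.where(y_var > 1e-8, 1.0 - diff_var / y_var, 0.0)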
new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/rllib/policy/jax/jax_policy.py b/rllib/policy/jax/jax_policy.py new file mode 100644 index 000000000000..fa61cf54e82d --- /dev/null +++ b/rllib/policy/jax/jax_policy.py @@ -0,0 +1,603 @@ +import functools +import gym +import numpy as np +import logging +import time +from typing import Callable, Dict, List, Optional, Tuple, Type, Union + +from ray.rllib.models.modelv2 import ModelV2 +from ray.rllib.models.jax.jax_modelv2 import JAXModelV2 +from ray.rllib.models.jax.jax_action_dist import JAXDistribution +from ray.rllib.policy.policy import Policy, LEARNER_STATS_KEY +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.policy.rnn_sequencing import pad_batch_to_sequences_of_same_size +from ray.rllib.utils import force_list +from ray.rllib.utils.annotations import override, DeveloperAPI +from ray.rllib.utils.framework import try_import_jax +from ray.rllib.utils.schedules import ConstantSchedule, PiecewiseSchedule +from ray.rllib.utils.torch_ops import convert_to_non_torch_type, \ + convert_to_torch_tensor +from ray.rllib.utils.tracking_dict import UsageTrackingDict +from ray.rllib.utils.typing import ModelGradients, ModelWeights, \ + TensorType, TrainerConfigDict + +jax, flax = try_import_jax() +if jax: + import jax.numpy as jnp + +logger = logging.getLogger(__name__) + + +@DeveloperAPI +class JAXPolicy(Policy): + """Template for a JAX policy and loss to use with RLlib. + """ + + @DeveloperAPI + def __init__( + self, + observation_space: gym.spaces.Space, + action_space: gym.spaces.Space, + config: TrainerConfigDict, + *, + model: ModelV2, + loss: Callable[[ + Policy, ModelV2, Type[JAXDistribution], SampleBatch + ], Union[TensorType, List[TensorType]]], + action_distribution_class: Type[JAXDistribution], + action_sampler_fn: Optional[Callable[[ + TensorType, List[TensorType] + ], Tuple[TensorType, TensorType]]] = None, + action_distribution_fn: Optional[Callable[[ + Policy, ModelV2, TensorType, TensorType, TensorType + ], Tuple[TensorType, Type[JAXDistribution], List[ + TensorType]]]] = None, + max_seq_len: int = 20, + get_batch_divisibility_req: Optional[Callable[[Policy], + int]] = None, + ): + """Initializes a JAXPolicy instance. + + Args: + observation_space (gym.spaces.Space): Observation space of the + policy. + action_space (gym.spaces.Space): Action space of the policy. + config (TrainerConfigDict): The Policy config dict. + model (ModelV2): PyTorch policy module. Given observations as + input, this module must return a list of outputs where the + first item is action logits, and the rest can be any value. + loss (Callable[[Policy, ModelV2, Type[JAXDistribution], + SampleBatch], Union[TensorType, List[TensorType]]]): Callable + that returns a single scalar loss or a list of loss terms. + action_distribution_class (Type[JAXDistribution]): Class + for a torch action distribution. + action_sampler_fn (Callable[[TensorType, List[TensorType]], + Tuple[TensorType, TensorType]]): A callable returning a + sampled action and its log-likelihood given Policy, ModelV2, + input_dict, explore, timestep, and is_training. + action_distribution_fn (Optional[Callable[[Policy, ModelV2, + Dict[str, TensorType], TensorType, TensorType], + Tuple[TensorType, type, List[TensorType]]]]): A callable + returning distribution inputs (parameters), a dist-class to + generate an action distribution object from, and + internal-state outputs (or an empty list if not applicable). 
+ Note: No Exploration hooks have to be called from within + `action_distribution_fn`. It's should only perform a simple + forward pass through some model. + If None, pass inputs through `self.model()` to get distribution + inputs. + The callable takes as inputs: Policy, ModelV2, input_dict, + explore, timestep, is_training. + max_seq_len (int): Max sequence length for LSTM training. + get_batch_divisibility_req (Optional[Callable[[Policy], int]]]): + Optional callable that returns the divisibility requirement + for sample batches given the Policy. + """ + self.framework = "jax" + super().__init__(observation_space, action_space, config) + #if torch.cuda.is_available(): + # logger.info("TorchPolicy running on GPU.") + # self.device = torch.device("cuda") + #else: + # logger.info("TorchPolicy running on CPU.") + # self.device = torch.device("cpu") + self.model = model#.to(self.device) + # Auto-update model's inference view requirements, if recurrent. + self._update_model_inference_view_requirements_from_init_state() + # Combine view_requirements for Model and Policy. + self.view_requirements.update(self.model.inference_view_requirements) + + self.exploration = self._create_exploration() + self.unwrapped_model = model # used to support DistributedDataParallel + self._loss = loss + self._optimizers = force_list(self.optimizer()) + + self.dist_class = action_distribution_class + self.action_sampler_fn = action_sampler_fn + self.action_distribution_fn = action_distribution_fn + + # If set, means we are using distributed allreduce during learning. + self.distributed_world_size = None + + self.max_seq_len = max_seq_len + self.batch_divisibility_req = get_batch_divisibility_req(self) if \ + callable(get_batch_divisibility_req) else \ + (get_batch_divisibility_req or 1) + + def compute_actions( + self, + obs_batch: Union[List[TensorType], TensorType], + state_batches: Optional[List[TensorType]] = None, + prev_action_batch: Union[List[TensorType], TensorType] = None, + prev_reward_batch: Union[List[TensorType], TensorType] = None, + info_batch: Optional[Dict[str, list]] = None, + episodes: Optional[List["MultiAgentEpisode"]] = None, + explore: Optional[bool] = None, + timestep: Optional[int] = None, + **kwargs) -> \ + Tuple[TensorType, List[TensorType], Dict[str, TensorType]]: + raise NotImplementedError + + @override(Policy) + def compute_actions_from_input_dict( + self, + input_dict: Dict[str, TensorType], + explore: bool = None, + timestep: Optional[int] = None, + **kwargs) -> \ + Tuple[TensorType, List[TensorType], Dict[str, TensorType]]: + + explore = explore if explore is not None else self.config["explore"] + timestep = timestep if timestep is not None else self.global_timestep + + # Pack internal state inputs into (separate) list. + state_batches = [ + input_dict[k] for k in input_dict.keys() if "state_in" in k[:8] + ] + # Calculate RNN sequence lengths. + seq_lens = np.array([1] * len(input_dict["obs"])) \ + if state_batches else None + + if self.action_sampler_fn: + action_dist = dist_inputs = None + state_out = state_batches + actions, logp, state_out = self.action_sampler_fn( + self, + self.model, + input_dict, + state_out, + explore=explore, + timestep=timestep) + else: + # Call the exploration before_compute_actions hook. 
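`action_distribution_fn` above is documented as returning distribution inputs, a distribution class, and RNN state-outs from a plain forward pass; it is invoked in the branch that follows. A hypothetical example of such a callable (the name and the empty-state/`None` seq-lens arguments are illustrative, not part of the patch):

from ray.rllib.policy.sample_batch import SampleBatch


def my_action_distribution_fn(policy, model, obs_batch, *, explore,
                              timestep=None, is_training=False):
    # Plain forward pass through the policy's model; no exploration logic
    # here -- the Policy applies its Exploration object afterwards.
    dist_inputs, state_out = model({
        SampleBatch.CUR_OBS: obs_batch,
        "is_training": is_training,
    }, [], None)
    return dist_inputs, policy.dist_class, state_out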
+ self.exploration.before_compute_actions( + explore=explore, timestep=timestep) + if self.action_distribution_fn: + dist_inputs, dist_class, state_out = \ + self.action_distribution_fn( + self, + self.model, + input_dict[SampleBatch.CUR_OBS], + explore=explore, + timestep=timestep, + is_training=False) + else: + dist_class = self.dist_class + dist_inputs, state_out = self.model(input_dict, state_batches, + seq_lens) + + if not (isinstance(dist_class, functools.partial) + or issubclass(dist_class, JAXDistribution)): + raise ValueError( + "`dist_class` ({}) not a JAXDistribution " + "subclass! Make sure your `action_distribution_fn` or " + "`make_model_and_action_dist` return a correct " + "distribution class.".format(dist_class.__name__)) + action_dist = dist_class(dist_inputs, self.model) + + # Get the exploration action from the forward results. + actions, logp = \ + self.exploration.get_exploration_action( + action_distribution=action_dist, + timestep=timestep, + explore=explore) + + input_dict[SampleBatch.ACTIONS] = actions + + # Add default and custom fetches. + extra_fetches = self.extra_action_out(input_dict, state_batches, + self.model, action_dist) + + # Action-dist inputs. + if dist_inputs is not None: + extra_fetches[SampleBatch.ACTION_DIST_INPUTS] = dist_inputs + + # Action-logp and action-prob. + if logp is not None: + extra_fetches[SampleBatch.ACTION_PROB] = \ + jnp.exp(logp.astype(jnp.float32)) + extra_fetches[SampleBatch.ACTION_LOGP] = logp + + # Update our global timestep by the batch size. + self.global_timestep += len(input_dict[SampleBatch.CUR_OBS]) + + return actions, state_out, extra_fetches + + @override(Policy) + @DeveloperAPI + def compute_log_likelihoods( + self, + actions: Union[List[TensorType], TensorType], + obs_batch: Union[List[TensorType], TensorType], + state_batches: Optional[List[TensorType]] = None, + prev_action_batch: Optional[Union[List[TensorType], + TensorType]] = None, + prev_reward_batch: Optional[Union[List[ + TensorType], TensorType]] = None) -> TensorType: + + if self.action_sampler_fn and self.action_distribution_fn is None: + raise ValueError("Cannot compute log-prob/likelihood w/o an " + "`action_distribution_fn` and a provided " + "`action_sampler_fn`!") + + input_dict = { + SampleBatch.CUR_OBS: obs_batch, + SampleBatch.ACTIONS: actions + } + if prev_action_batch is not None: + input_dict[SampleBatch.PREV_ACTIONS] = prev_action_batch + if prev_reward_batch is not None: + input_dict[SampleBatch.PREV_REWARDS] = prev_reward_batch + seq_lens = jnp.ones(len(obs_batch), dtype=jnp.int32) + state_batches = [s for s in (state_batches or [])] + + # Exploration hook before each forward pass. + self.exploration.before_compute_actions(explore=False) + + # Action dist class and inputs are generated via custom function. + if self.action_distribution_fn: + dist_inputs, dist_class, _ = self.action_distribution_fn( + policy=self, + model=self.model, + obs_batch=input_dict[SampleBatch.CUR_OBS], + explore=False, + is_training=False) + # Default action-dist inputs calculation. + else: + dist_class = self.dist_class + dist_inputs, _ = self.model(input_dict, state_batches, + seq_lens) + + action_dist = dist_class(dist_inputs, self.model) + log_likelihoods = action_dist.logp(input_dict[SampleBatch.ACTIONS]) + return log_likelihoods + + @override(Policy) + @DeveloperAPI + def learn_on_batch( + self, postprocessed_batch: SampleBatch) -> Dict[str, TensorType]: + # Callback handling. 
+ self.callbacks.on_learn_on_batch( + policy=self, train_batch=postprocessed_batch) + + # Compute gradients (will calculate all losses and `backward()` + # them to get the grads). + grads, fetches = self.compute_gradients(postprocessed_batch) + + # Step the optimizer(s). + for i, opt in enumerate(self._optimizers): + opt.step() + + if self.model: + fetches["model"] = self.model.metrics() + return fetches + + @override(Policy) + @DeveloperAPI + def compute_gradients(self, + postprocessed_batch: SampleBatch) -> ModelGradients: + # Get batch ready for RNNs, if applicable. + pad_batch_to_sequences_of_same_size( + postprocessed_batch, + max_seq_len=self.max_seq_len, + shuffle=False, + batch_divisibility_req=self.batch_divisibility_req, + ) + + train_batch = self._lazy_tensor_dict(postprocessed_batch) + + # Calculate the actual policy loss. + loss_out = force_list( + self._loss(self, self.model, self.dist_class, train_batch)) + + # Call Model's custom-loss with Policy loss outputs and train_batch. + if self.model: + loss_out = self.model.custom_loss(loss_out, train_batch) + + # Give Exploration component that chance to modify the loss (or add + # its own terms). + if hasattr(self, "exploration"): + loss_out = self.exploration.get_exploration_loss( + loss_out, train_batch) + + assert len(loss_out) == len(self._optimizers) + + # assert not any(torch.isnan(l) for l in loss_out) + fetches = self.extra_compute_grad_fetches() + + # Loop through all optimizers. + grad_info = {"allreduce_latency": 0.0} + + all_grads = [] + for i, opt in enumerate(self._optimizers): + # Erase gradients in all vars of this optimizer. + opt.zero_grad() + # Recompute gradients of loss over all variables. + loss_out[i].backward(retain_graph=(i < len(self._optimizers) - 1)) + grad_info.update(self.extra_grad_process(opt, loss_out[i])) + + grads = [] + # Note that return values are just references; + # Calling zero_grad would modify the values. + for param_group in opt.param_groups: + for p in param_group["params"]: + if p.grad is not None: + grads.append(p.grad) + all_grads.append(p.grad.data.cpu().numpy()) + else: + all_grads.append(None) + + if self.distributed_world_size: + start = time.time() + if torch.cuda.is_available(): + # Sadly, allreduce_coalesced does not work with CUDA yet. + for g in grads: + torch.distributed.all_reduce( + g, op=torch.distributed.ReduceOp.SUM) + else: + torch.distributed.all_reduce_coalesced( + grads, op=torch.distributed.ReduceOp.SUM) + + for param_group in opt.param_groups: + for p in param_group["params"]: + if p.grad is not None: + p.grad /= self.distributed_world_size + + grad_info["allreduce_latency"] += time.time() - start + + grad_info["allreduce_latency"] /= len(self._optimizers) + grad_info.update(self.extra_grad_info(train_batch)) + + return all_grads, dict(fetches, **{LEARNER_STATS_KEY: grad_info}) + + @override(Policy) + @DeveloperAPI + def apply_gradients(self, gradients: ModelGradients) -> None: + # TODO(sven): Not supported for multiple optimizers yet. 
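`compute_gradients` above still carries the `zero_grad()`/`backward()`/`param_groups` flow inherited from `TorchPolicy` (the commit is marked WIP), which has no direct JAX equivalent. In JAX the same step is normally written functionally: differentiate a scalar loss of the parameters and return updated parameters. A minimal sketch with a hand-rolled SGD update; in the actual policy the update line would be replaced by the `flax.optim.Adam` optimizer returned from `optimizer()` below, and `loss_fn`/`params`/`batch` are placeholders:

import jax
import jax.numpy as jnp


def sgd_step(params, batch, loss_fn, lr=5e-5):
    # `loss_fn(params, batch)` must return a scalar, e.g. the PPO loss.
    loss, grads = jax.value_and_grad(loss_fn)(params, batch)
    # Apply the update to every leaf of the (possibly nested) param pytree.
    new_params = jax.tree_util.tree_map(lambda p, g: p - lr * g, params,
                                        grads)
    return new_params, loss


# Toy usage with a quadratic loss:
params = {"w": jnp.ones(3)}
params, loss = sgd_step(params, None, lambda p, _: jnp.sum(p["w"] ** 2))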
+ assert len(self._optimizers) == 1 + for g, p in zip(gradients, self.model.parameters()): + if g is not None: + p.grad = torch.from_numpy(g).to(self.device) + + self._optimizers[0].step() + + @override(Policy) + @DeveloperAPI + def get_weights(self) -> ModelWeights: + return { + k: v.cpu().detach().numpy() + for k, v in self.model.state_dict().items() + } + + @override(Policy) + @DeveloperAPI + def set_weights(self, weights: ModelWeights) -> None: + weights = convert_to_torch_tensor(weights, device=self.device) + self.model.load_state_dict(weights) + + @override(Policy) + @DeveloperAPI + def is_recurrent(self) -> bool: + return len(self.model.get_initial_state()) > 0 + + @override(Policy) + @DeveloperAPI + def num_state_tensors(self) -> int: + return len(self.model.get_initial_state()) + + @override(Policy) + @DeveloperAPI + def get_initial_state(self) -> List[TensorType]: + return [ + s.detach().cpu().numpy() for s in self.model.get_initial_state() + ] + + @override(Policy) + @DeveloperAPI + def get_state(self) -> Union[Dict[str, TensorType], List[TensorType]]: + state = super().get_state() + state["_optimizer_variables"] = [] + for i, o in enumerate(self._optimizers): + optim_state_dict = convert_to_non_torch_type(o.state_dict()) + state["_optimizer_variables"].append(optim_state_dict) + return state + + @override(Policy) + @DeveloperAPI + def set_state(self, state: object) -> None: + state = state.copy() # shallow copy + # Set optimizer vars first. + optimizer_vars = state.pop("_optimizer_variables", None) + if optimizer_vars: + assert len(optimizer_vars) == len(self._optimizers) + for o, s in zip(self._optimizers, optimizer_vars): + optim_state_dict = convert_to_torch_tensor( + s, device=self.device) + o.load_state_dict(optim_state_dict) + # Then the Policy's (NN) weights. + super().set_state(state) + + @DeveloperAPI + def extra_grad_process(self, optimizer: "torch.optim.Optimizer", + loss: TensorType): + """Called after each optimizer.zero_grad() + loss.backward() call. + + Called for each self._optimizers/loss-value pair. + Allows for gradient processing before optimizer.step() is called. + E.g. for gradient clipping. + + Args: + optimizer (torch.optim.Optimizer): A torch optimizer object. + loss (TensorType): The loss tensor associated with the optimizer. + + Returns: + Dict[str, TensorType]: An dict with information on the gradient + processing step. + """ + return {} + + @DeveloperAPI + def extra_compute_grad_fetches(self) -> Dict[str, any]: + """Extra values to fetch and return from compute_gradients(). + + Returns: + Dict[str, any]: Extra fetch dict to be added to the fetch dict + of the compute_gradients call. + """ + return {LEARNER_STATS_KEY: {}} # e.g, stats, td error, etc. + + @DeveloperAPI + def extra_action_out( + self, input_dict: Dict[str, TensorType], + state_batches: List[TensorType], model: JAXModelV2, + action_dist: JAXDistribution) -> Dict[str, TensorType]: + """Returns dict of extra info to include in experience batch. + + Args: + input_dict (Dict[str, TensorType]): Dict of model input tensors. + state_batches (List[TensorType]): List of state tensors. + model (JAXModelV2): Reference to the model object. + action_dist (JAXDistribution): Torch action dist object + to get log-probs (e.g. for already sampled actions). + + Returns: + Dict[str, TensorType]: Extra outputs to return in a + compute_actions() call (3rd return value). 
+ """ + return {} + + @DeveloperAPI + def extra_grad_info(self, + train_batch: SampleBatch) -> Dict[str, TensorType]: + """Return dict of extra grad info. + + Args: + train_batch (SampleBatch): The training batch for which to produce + extra grad info for. + + Returns: + Dict[str, TensorType]: The info dict carrying grad info per str + key. + """ + return {} + + @DeveloperAPI + def optimizer( + self + ) -> Union[List["flax.optim.Optimizer"], "flax.optim.Optimizer"]: + """Custom the local FLAX optimizer(s) to use. + + Returns: + Union[List[flax.optim.Optimizer], flax.optim.Optimizer]: + The local FLAX optimizer(s) to use for this Policy. + """ + if hasattr(self, "config"): + return flax.optim.Adam(learning_rate=self.config["lr"]) + else: + return flax.optim.Adam() + + @override(Policy) + @DeveloperAPI + def export_model(self, export_dir: str) -> None: + """TODO(sven): implement for JAX. + """ + raise NotImplementedError + + @override(Policy) + @DeveloperAPI + def export_checkpoint(self, export_dir: str) -> None: + """TODO(sven): implement for JAX. + """ + raise NotImplementedError + + @override(Policy) + @DeveloperAPI + def import_model_from_h5(self, import_file: str) -> None: + """Imports weights into JAX model.""" + return self.model.import_from_h5(import_file) + + def _lazy_tensor_dict(self, data): + tensor_dict = UsageTrackingDict(data) + return tensor_dict + + #def _lazy_numpy_dict(self, postprocessed_batch): + # train_batch = UsageTrackingDict(postprocessed_batch) + # train_batch.set_get_interceptor( + # functools.partial(convert_to_non_torch_type)) + # return train_batch + + +# TODO: (sven) Unify hyperparam annealing procedures across RLlib (tf/torch) +# and for all possible hyperparams, not just lr. +@DeveloperAPI +class LearningRateSchedule: + """Mixin for TFPolicy that adds a learning rate schedule.""" + + @DeveloperAPI + def __init__(self, lr, lr_schedule): + self.cur_lr = lr + if lr_schedule is None: + self.lr_schedule = ConstantSchedule(lr, framework=None) + else: + self.lr_schedule = PiecewiseSchedule( + lr_schedule, outside_value=lr_schedule[-1][-1], framework=None) + + @override(Policy) + def on_global_var_update(self, global_vars): + super().on_global_var_update(global_vars) + self.cur_lr = self.lr_schedule.value(global_vars["timestep"]) + for opt in self._optimizers: + for p in opt.param_groups: + p["lr"] = self.cur_lr + + +@DeveloperAPI +class EntropyCoeffSchedule: + """Mixin for TorchPolicy that adds entropy coeff decay.""" + + @DeveloperAPI + def __init__(self, entropy_coeff, entropy_coeff_schedule): + self.entropy_coeff = entropy_coeff + + if entropy_coeff_schedule is None: + self.entropy_coeff_schedule = ConstantSchedule( + entropy_coeff, framework=None) + else: + # Allows for custom schedule similar to lr_schedule format + if isinstance(entropy_coeff_schedule, list): + self.entropy_coeff_schedule = PiecewiseSchedule( + entropy_coeff_schedule, + outside_value=entropy_coeff_schedule[-1][-1], + framework=None) + else: + # Implements previous version but enforces outside_value + self.entropy_coeff_schedule = PiecewiseSchedule( + [[0, entropy_coeff], [entropy_coeff_schedule, 0.0]], + outside_value=0.0, + framework=None) + + @override(Policy) + def on_global_var_update(self, global_vars): + super(EntropyCoeffSchedule, self).on_global_var_update(global_vars) + self.entropy_coeff = self.entropy_coeff_schedule.value( + global_vars["timestep"]) diff --git a/rllib/policy/jax/jax_policy_template.py b/rllib/policy/jax/jax_policy_template.py new file mode 100644 index 
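The `LearningRateSchedule` and `EntropyCoeffSchedule` mixins above delegate to RLlib's `ConstantSchedule`/`PiecewiseSchedule`. The interpolation they rely on amounts to the following standalone sketch (linear interpolation between `(timestep, value)` endpoints, clamped to the final value); this is illustrative, not RLlib's implementation:

def piecewise_value(endpoints, t):
    # `endpoints` is a sorted list of (timestep, value) pairs; `t` is
    # assumed to be >= the first timestep.
    for (t0, v0), (t1, v1) in zip(endpoints[:-1], endpoints[1:]):
        if t0 <= t < t1:
            frac = (t - t0) / float(t1 - t0)
            return v0 + frac * (v1 - v0)
    # Outside the listed range: stick with the final value.
    return endpoints[-1][1]


# E.g. anneal the entropy coefficient from 0.01 to 0.0 over 1M timesteps:
schedule = [(0, 0.01), (1_000_000, 0.0)]
assert abs(piecewise_value(schedule, 500_000) - 0.005) < 1e-12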
000000000000..3d7b03703ee9 --- /dev/null +++ b/rllib/policy/jax/jax_policy_template.py @@ -0,0 +1,370 @@ +import gym +from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union + +from ray.rllib.models.catalog import ModelCatalog +from ray.rllib.models.modelv2 import ModelV2 +from ray.rllib.models.jax.jax_action_dist import JAXDistribution +from ray.rllib.models.jax.jax_modelv2 import JAXModelV2 +from ray.rllib.policy.jax.jax_policy import JAXPolicy +from ray.rllib.policy.policy import Policy, LEARNER_STATS_KEY +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.policy.view_requirement import ViewRequirement +from ray.rllib.utils import add_mixins, force_list +from ray.rllib.utils.annotations import override, DeveloperAPI +from ray.rllib.utils.framework import try_import_jax +from ray.rllib.utils.torch_ops import convert_to_non_torch_type +from ray.rllib.utils.typing import TensorType, TrainerConfigDict + +jax, _ = try_import_jax() + + +@DeveloperAPI +def build_jax_policy_class( + name: str, + *, + loss_fn: Optional[Callable[[ + Policy, ModelV2, Type[JAXDistribution], SampleBatch + ], Union[TensorType, List[TensorType]]]], + get_default_config: Optional[Callable[[], TrainerConfigDict]] = None, + stats_fn: Optional[Callable[[Policy, SampleBatch], Dict[ + str, TensorType]]] = None, + postprocess_fn: Optional[Callable[[ + Policy, SampleBatch, Optional[Dict[Any, SampleBatch]], Optional[ + "MultiAgentEpisode"] + ], SampleBatch]] = None, + extra_action_out_fn: Optional[Callable[[ + Policy, Dict[str, TensorType], List[TensorType], ModelV2, + JAXDistribution + ], Dict[str, TensorType]]] = None, + extra_grad_process_fn: Optional[Callable[[ + Policy, "torch.optim.Optimizer", TensorType + ], Dict[str, TensorType]]] = None, + # TODO: (sven) Replace "fetches" with "process". + extra_learn_fetches_fn: Optional[Callable[[Policy], Dict[ + str, TensorType]]] = None, + optimizer_fn: Optional[Callable[[Policy, TrainerConfigDict], + "torch.optim.Optimizer"]] = None, + validate_spaces: Optional[Callable[ + [Policy, gym.Space, gym.Space, TrainerConfigDict], None]] = None, + before_init: Optional[Callable[ + [Policy, gym.Space, gym.Space, TrainerConfigDict], None]] = None, + before_loss_init: Optional[Callable[[ + Policy, gym.spaces.Space, gym.spaces.Space, TrainerConfigDict + ], None]] = None, + after_init: Optional[Callable[ + [Policy, gym.Space, gym.Space, TrainerConfigDict], None]] = None, + _after_loss_init: Optional[Callable[[ + Policy, gym.spaces.Space, gym.spaces.Space, TrainerConfigDict + ], None]] = None, + action_sampler_fn: Optional[Callable[[TensorType, List[ + TensorType]], Tuple[TensorType, TensorType]]] = None, + action_distribution_fn: Optional[Callable[[ + Policy, ModelV2, TensorType, TensorType, TensorType + ], Tuple[TensorType, type, List[TensorType]]]] = None, + make_model: Optional[Callable[[ + Policy, gym.spaces.Space, gym.spaces.Space, TrainerConfigDict + ], ModelV2]] = None, + make_model_and_action_dist: Optional[Callable[[ + Policy, gym.spaces.Space, gym.spaces.Space, TrainerConfigDict + ], Tuple[ModelV2, Type[JAXDistribution]]]] = None, + apply_gradients_fn: Optional[Callable[ + [Policy, "torch.optim.Optimizer"], None]] = None, + mixins: Optional[List[type]] = None, + view_requirements_fn: Optional[Callable[[Policy], Dict[ + str, ViewRequirement]]] = None, + get_batch_divisibility_req: Optional[Callable[[Policy], int]] = None +) -> Type[JAXPolicy]: + """Helper function for creating a torch policy class at runtime. 
+ + Args: + name (str): name of the policy (e.g., "PPOTorchPolicy") + loss_fn (Optional[Callable[[Policy, ModelV2, + Type[TorchDistributionWrapper], SampleBatch], Union[TensorType, + List[TensorType]]]]): Callable that returns a loss tensor. + get_default_config (Optional[Callable[[None], TrainerConfigDict]]): + Optional callable that returns the default config to merge with any + overrides. If None, uses only(!) the user-provided + PartialTrainerConfigDict as dict for this Policy. + postprocess_fn (Optional[Callable[[Policy, SampleBatch, + Optional[Dict[Any, SampleBatch]], Optional["MultiAgentEpisode"]], + SampleBatch]]): Optional callable for post-processing experience + batches (called after the super's `postprocess_trajectory` method). + stats_fn (Optional[Callable[[Policy, SampleBatch], + Dict[str, TensorType]]]): Optional callable that returns a dict of + values given the policy and training batch. If None, + will use `TorchPolicy.extra_grad_info()` instead. The stats dict is + used for logging (e.g. in TensorBoard). + extra_action_out_fn (Optional[Callable[[Policy, Dict[str, TensorType], + List[TensorType], ModelV2, TorchDistributionWrapper]], Dict[str, + TensorType]]]): Optional callable that returns a dict of extra + values to include in experiences. If None, no extra computations + will be performed. + extra_grad_process_fn (Optional[Callable[[Policy, + "torch.optim.Optimizer", TensorType], Dict[str, TensorType]]]): + Optional callable that is called after gradients are computed and + returns a processing info dict. If None, will call the + `TorchPolicy.extra_grad_process()` method instead. + # TODO: (sven) dissolve naming mismatch between "learn" and "compute.." + extra_learn_fetches_fn (Optional[Callable[[Policy], + Dict[str, TensorType]]]): Optional callable that returns a dict of + extra tensors from the policy after loss evaluation. If None, + will call the `TorchPolicy.extra_compute_grad_fetches()` method + instead. + optimizer_fn (Optional[Callable[[Policy, TrainerConfigDict], + "torch.optim.Optimizer"]]): Optional callable that returns a + torch optimizer given the policy and config. If None, will call + the `TorchPolicy.optimizer()` method instead (which returns a + torch Adam optimizer). + validate_spaces (Optional[Callable[[Policy, gym.Space, gym.Space, + TrainerConfigDict], None]]): Optional callable that takes the + Policy, observation_space, action_space, and config to check for + correctness. If None, no spaces checking will be done. + before_init (Optional[Callable[[Policy, gym.Space, gym.Space, + TrainerConfigDict], None]]): Optional callable to run at the + beginning of `Policy.__init__` that takes the same arguments as + the Policy constructor. If None, this step will be skipped. + before_loss_init (Optional[Callable[[Policy, gym.spaces.Space, + gym.spaces.Space, TrainerConfigDict], None]]): Optional callable to + run prior to loss init. If None, this step will be skipped. + after_init (Optional[Callable[[Policy, gym.Space, gym.Space, + TrainerConfigDict], None]]): DEPRECATED: Use `before_loss_init` + instead. + _after_loss_init (Optional[Callable[[Policy, gym.spaces.Space, + gym.spaces.Space, TrainerConfigDict], None]]): Optional callable to + run after the loss init. If None, this step will be skipped. + This will be deprecated at some point and renamed into `after_init` + to match `build_tf_policy()` behavior. 
+ action_sampler_fn (Optional[Callable[[TensorType, List[TensorType]], + Tuple[TensorType, TensorType]]]): Optional callable returning a + sampled action and its log-likelihood given some (obs and state) + inputs. If None, will either use `action_distribution_fn` or + compute actions by calling self.model, then sampling from the + so parameterized action distribution. + action_distribution_fn (Optional[Callable[[Policy, ModelV2, TensorType, + TensorType, TensorType], Tuple[TensorType, + Type[TorchDistributionWrapper], List[TensorType]]]]): A callable + that takes the Policy, Model, the observation batch, an + explore-flag, a timestep, and an is_training flag and returns a + tuple of a) distribution inputs (parameters), b) a dist-class to + generate an action distribution object from, and c) internal-state + outputs (empty list if not applicable). If None, will either use + `action_sampler_fn` or compute actions by calling self.model, + then sampling from the parameterized action distribution. + make_model (Optional[Callable[[Policy, gym.spaces.Space, + gym.spaces.Space, TrainerConfigDict], ModelV2]]): Optional callable + that takes the same arguments as Policy.__init__ and returns a + model instance. The distribution class will be determined + automatically. Note: Only one of `make_model` or + `make_model_and_action_dist` should be provided. If both are None, + a default Model will be created. + make_model_and_action_dist (Optional[Callable[[Policy, + gym.spaces.Space, gym.spaces.Space, TrainerConfigDict], + Tuple[ModelV2, Type[TorchDistributionWrapper]]]]): Optional + callable that takes the same arguments as Policy.__init__ and + returns a tuple of model instance and torch action distribution + class. + Note: Only one of `make_model` or `make_model_and_action_dist` + should be provided. If both are None, a default Model will be + created. + apply_gradients_fn (Optional[Callable[[Policy, + "torch.optim.Optimizer"], None]]): Optional callable that + takes a grads list and applies these to the Model's parameters. + If None, will call the `TorchPolicy.apply_gradients()` method + instead. + mixins (Optional[List[type]]): Optional list of any class mixins for + the returned policy class. These mixins will be applied in order + and will have higher precedence than the TorchPolicy class. + view_requirements_fn (Optional[Callable[[Policy], + Dict[str, ViewRequirement]]]): An optional callable to retrieve + additional train view requirements for this policy. + get_batch_divisibility_req (Optional[Callable[[Policy], int]]): + Optional callable that returns the divisibility requirement for + sample batches. If None, will assume a value of 1. + + Returns: + Type[TorchPolicy]: TorchPolicy child class constructed from the + specified args. + """ + + original_kwargs = locals().copy() + base = add_mixins(JAXPolicy, mixins) + + class policy_cls(base): + def __init__(self, obs_space, action_space, config): + if get_default_config: + config = dict(get_default_config(), **config) + self.config = config + + if validate_spaces: + validate_spaces(self, obs_space, action_space, self.config) + + if before_init: + before_init(self, obs_space, action_space, self.config) + + # Model is customized (use default action dist class). + if make_model: + assert make_model_and_action_dist is None, \ + "Either `make_model` or `make_model_and_action_dist`" \ + " must be None!" 
+ self.model = make_model(self, obs_space, action_space, config) + dist_class, _ = ModelCatalog.get_action_dist( + action_space, self.config["model"], framework="torch") + # Model and action dist class are customized. + elif make_model_and_action_dist: + self.model, dist_class = make_model_and_action_dist( + self, obs_space, action_space, config) + # Use default model and default action dist. + else: + dist_class, logit_dim = ModelCatalog.get_action_dist( + action_space, self.config["model"], framework="torch") + self.model = ModelCatalog.get_model_v2( + obs_space=obs_space, + action_space=action_space, + num_outputs=logit_dim, + model_config=self.config["model"], + framework="torch") + + # Make sure, we passed in a correct Model factory. + assert isinstance(self.model, TorchModelV2), \ + "ERROR: Generated Model must be a TorchModelV2 object!" + + JAXPolicy.__init__( + self, + observation_space=obs_space, + action_space=action_space, + config=config, + model=self.model, + loss=loss_fn, + action_distribution_class=dist_class, + action_sampler_fn=action_sampler_fn, + action_distribution_fn=action_distribution_fn, + max_seq_len=config["model"]["max_seq_len"], + get_batch_divisibility_req=get_batch_divisibility_req, + ) + + # Update this Policy's ViewRequirements (if function given). + if callable(view_requirements_fn): + self.view_requirements.update(view_requirements_fn(self)) + # Merge Model's view requirements into Policy's. + self.view_requirements.update( + self.model.inference_view_requirements) + + _before_loss_init = before_loss_init or after_init + if _before_loss_init: + _before_loss_init(self, self.observation_space, + self.action_space, config) + + # Perform test runs through postprocessing- and loss functions. + self._initialize_loss_from_dummy_batch( + auto_remove_unneeded_view_reqs=True, + stats_fn=stats_fn, + ) + + if _after_loss_init: + _after_loss_init(self, obs_space, action_space, config) + + # Got to reset global_timestep again after this fake run-through. + self.global_timestep = 0 + + @override(Policy) + def postprocess_trajectory(self, + sample_batch, + other_agent_batches=None, + episode=None): + # Do all post-processing always with no_grad(). + # Not using this here will introduce a memory leak (issue #6962). + with torch.no_grad(): + # Call super's postprocess_trajectory first. + sample_batch = super().postprocess_trajectory( + sample_batch, other_agent_batches, episode) + if postprocess_fn: + return postprocess_fn(self, sample_batch, + other_agent_batches, episode) + + return sample_batch + + @override(JAXPolicy) + def extra_grad_process(self, optimizer, loss): + """Called after optimizer.zero_grad() and loss.backward() calls. + + Allows for gradient processing before optimizer.step() is called. + E.g. for gradient clipping. + """ + if extra_grad_process_fn: + return extra_grad_process_fn(self, optimizer, loss) + else: + return JAXPolicy.extra_grad_process(self, optimizer, loss) + + @override(JAXPolicy) + def extra_compute_grad_fetches(self): + if extra_learn_fetches_fn: + fetches = convert_to_non_torch_type( + extra_learn_fetches_fn(self)) + # Auto-add empty learner stats dict if needed. 
+ return dict({LEARNER_STATS_KEY: {}}, **fetches) + else: + return JAXPolicy.extra_compute_grad_fetches(self) + + @override(JAXPolicy) + def apply_gradients(self, gradients): + if apply_gradients_fn: + apply_gradients_fn(self, gradients) + else: + JAXPolicy.apply_gradients(self, gradients) + + @override(JAXPolicy) + def extra_action_out(self, input_dict, state_batches, model, + action_dist): + with torch.no_grad(): + if extra_action_out_fn: + stats_dict = extra_action_out_fn( + self, input_dict, state_batches, model, action_dist) + else: + stats_dict = JAXPolicy.extra_action_out( + self, input_dict, state_batches, model, action_dist) + return convert_to_non_torch_type(stats_dict) + + @override(JAXPolicy) + def optimizer(self): + if optimizer_fn: + optimizers = optimizer_fn(self, self.config) + else: + optimizers = JAXPolicy.optimizer(self) + optimizers = force_list(optimizers) + if getattr(self, "exploration", None): + optimizers = self.exploration.get_exploration_optimizer( + optimizers) + return optimizers + + @override(JAXPolicy) + def extra_grad_info(self, train_batch): + with torch.no_grad(): + if stats_fn: + stats_dict = stats_fn(self, train_batch) + else: + stats_dict = JAXPolicy.extra_grad_info(self, train_batch) + return convert_to_non_torch_type(stats_dict) + + def with_updates(**overrides): + """Allows creating a TorchPolicy cls based on settings of another one. + + Keyword Args: + **overrides: The settings (passed into `build_torch_policy`) that + should be different from the class that this method is called + on. + + Returns: + type: A new TorchPolicy sub-class. + + Examples: + >> MySpecialDQNPolicyClass = DQNTorchPolicy.with_updates( + .. name="MySpecialDQNPolicyClass", + .. loss_function=[some_new_loss_function], + .. ) + """ + return build_jax_policy_class(**dict(original_kwargs, **overrides)) + + policy_cls.with_updates = staticmethod(with_updates) + policy_cls.__name__ = name + policy_cls.__qualname__ = name + return policy_cls diff --git a/rllib/policy/policy_template.py b/rllib/policy/policy_template.py new file mode 100644 index 000000000000..d9ded6c75e57 --- /dev/null +++ b/rllib/policy/policy_template.py @@ -0,0 +1,402 @@ +import gym +from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union + +from ray.rllib.models.catalog import ModelCatalog +from ray.rllib.models.jax.jax_modelv2 import JAXModelV2 +from ray.rllib.models.modelv2 import ModelV2 +from ray.rllib.models.torch.torch_action_dist import TorchDistributionWrapper +from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 +from ray.rllib.policy.jax.jax_policy import JAXPolicy +from ray.rllib.policy.policy import Policy, LEARNER_STATS_KEY +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.policy.torch_policy import TorchPolicy +from ray.rllib.policy.view_requirement import ViewRequirement +from ray.rllib.utils import add_mixins, force_list, NullContextManager +from ray.rllib.utils.annotations import override, DeveloperAPI +from ray.rllib.utils.framework import try_import_torch, try_import_jax +from ray.rllib.utils.torch_ops import convert_to_non_torch_type +from ray.rllib.utils.typing import TensorType, TrainerConfigDict + +jax, _ = try_import_jax() +torch, _ = try_import_torch() + + +@DeveloperAPI +def build_policy_class( + name: str, + framework: str, + *, + loss_fn: Optional[Callable[[ + Policy, ModelV2, Type[TorchDistributionWrapper], SampleBatch + ], Union[TensorType, List[TensorType]]]], + get_default_config: Optional[Callable[[], TrainerConfigDict]] = None, + 
stats_fn: Optional[Callable[[Policy, SampleBatch], Dict[ + str, TensorType]]] = None, + postprocess_fn: Optional[Callable[[ + Policy, SampleBatch, Optional[Dict[Any, SampleBatch]], Optional[ + "MultiAgentEpisode"] + ], SampleBatch]] = None, + extra_action_out_fn: Optional[Callable[[ + Policy, Dict[str, TensorType], List[TensorType], ModelV2, + TorchDistributionWrapper + ], Dict[str, TensorType]]] = None, + extra_grad_process_fn: Optional[Callable[[ + Policy, "torch.optim.Optimizer", TensorType + ], Dict[str, TensorType]]] = None, + # TODO: (sven) Replace "fetches" with "process". + extra_learn_fetches_fn: Optional[Callable[[Policy], Dict[ + str, TensorType]]] = None, + optimizer_fn: Optional[Callable[[Policy, TrainerConfigDict], + "torch.optim.Optimizer"]] = None, + validate_spaces: Optional[Callable[ + [Policy, gym.Space, gym.Space, TrainerConfigDict], None]] = None, + before_init: Optional[Callable[ + [Policy, gym.Space, gym.Space, TrainerConfigDict], None]] = None, + before_loss_init: Optional[Callable[[ + Policy, gym.spaces.Space, gym.spaces.Space, TrainerConfigDict + ], None]] = None, + after_init: Optional[Callable[ + [Policy, gym.Space, gym.Space, TrainerConfigDict], None]] = None, + _after_loss_init: Optional[Callable[[ + Policy, gym.spaces.Space, gym.spaces.Space, TrainerConfigDict + ], None]] = None, + action_sampler_fn: Optional[Callable[[TensorType, List[ + TensorType]], Tuple[TensorType, TensorType]]] = None, + action_distribution_fn: Optional[Callable[[ + Policy, ModelV2, TensorType, TensorType, TensorType + ], Tuple[TensorType, type, List[TensorType]]]] = None, + make_model: Optional[Callable[[ + Policy, gym.spaces.Space, gym.spaces.Space, TrainerConfigDict + ], ModelV2]] = None, + make_model_and_action_dist: Optional[Callable[[ + Policy, gym.spaces.Space, gym.spaces.Space, TrainerConfigDict + ], Tuple[ModelV2, Type[TorchDistributionWrapper]]]] = None, + apply_gradients_fn: Optional[Callable[ + [Policy, "torch.optim.Optimizer"], None]] = None, + mixins: Optional[List[type]] = None, + view_requirements_fn: Optional[Callable[[Policy], Dict[ + str, ViewRequirement]]] = None, + get_batch_divisibility_req: Optional[Callable[[Policy], int]] = None +) -> Type[Union[JAXPolicy, TorchPolicy]]: + """Helper function for creating a new Policy class at runtime. + + Supports frameworks JAX and PyTorch. + + Args: + name (str): name of the policy (e.g., "PPOTorchPolicy") + framework (str): Either "jax" or "torch". + loss_fn (Optional[Callable[[Policy, ModelV2, + Type[TorchDistributionWrapper], SampleBatch], Union[TensorType, + List[TensorType]]]]): Callable that returns a loss tensor. + get_default_config (Optional[Callable[[None], TrainerConfigDict]]): + Optional callable that returns the default config to merge with any + overrides. If None, uses only(!) the user-provided + PartialTrainerConfigDict as dict for this Policy. + postprocess_fn (Optional[Callable[[Policy, SampleBatch, + Optional[Dict[Any, SampleBatch]], Optional["MultiAgentEpisode"]], + SampleBatch]]): Optional callable for post-processing experience + batches (called after the super's `postprocess_trajectory` method). + stats_fn (Optional[Callable[[Policy, SampleBatch], + Dict[str, TensorType]]]): Optional callable that returns a dict of + values given the policy and training batch. If None, + will use `TorchPolicy.extra_grad_info()` instead. The stats dict is + used for logging (e.g. in TensorBoard). 
+ extra_action_out_fn (Optional[Callable[[Policy, Dict[str, TensorType], + List[TensorType], ModelV2, TorchDistributionWrapper]], Dict[str, + TensorType]]]): Optional callable that returns a dict of extra + values to include in experiences. If None, no extra computations + will be performed. + extra_grad_process_fn (Optional[Callable[[Policy, + "torch.optim.Optimizer", TensorType], Dict[str, TensorType]]]): + Optional callable that is called after gradients are computed and + returns a processing info dict. If None, will call the + `TorchPolicy.extra_grad_process()` method instead. + # TODO: (sven) dissolve naming mismatch between "learn" and "compute.." + extra_learn_fetches_fn (Optional[Callable[[Policy], + Dict[str, TensorType]]]): Optional callable that returns a dict of + extra tensors from the policy after loss evaluation. If None, + will call the `TorchPolicy.extra_compute_grad_fetches()` method + instead. + optimizer_fn (Optional[Callable[[Policy, TrainerConfigDict], + "torch.optim.Optimizer"]]): Optional callable that returns a + torch optimizer given the policy and config. If None, will call + the `TorchPolicy.optimizer()` method instead (which returns a + torch Adam optimizer). + validate_spaces (Optional[Callable[[Policy, gym.Space, gym.Space, + TrainerConfigDict], None]]): Optional callable that takes the + Policy, observation_space, action_space, and config to check for + correctness. If None, no spaces checking will be done. + before_init (Optional[Callable[[Policy, gym.Space, gym.Space, + TrainerConfigDict], None]]): Optional callable to run at the + beginning of `Policy.__init__` that takes the same arguments as + the Policy constructor. If None, this step will be skipped. + before_loss_init (Optional[Callable[[Policy, gym.spaces.Space, + gym.spaces.Space, TrainerConfigDict], None]]): Optional callable to + run prior to loss init. If None, this step will be skipped. + after_init (Optional[Callable[[Policy, gym.Space, gym.Space, + TrainerConfigDict], None]]): DEPRECATED: Use `before_loss_init` + instead. + _after_loss_init (Optional[Callable[[Policy, gym.spaces.Space, + gym.spaces.Space, TrainerConfigDict], None]]): Optional callable to + run after the loss init. If None, this step will be skipped. + This will be deprecated at some point and renamed into `after_init` + to match `build_tf_policy()` behavior. + action_sampler_fn (Optional[Callable[[TensorType, List[TensorType]], + Tuple[TensorType, TensorType]]]): Optional callable returning a + sampled action and its log-likelihood given some (obs and state) + inputs. If None, will either use `action_distribution_fn` or + compute actions by calling self.model, then sampling from the + so parameterized action distribution. + action_distribution_fn (Optional[Callable[[Policy, ModelV2, TensorType, + TensorType, TensorType], Tuple[TensorType, + Type[TorchDistributionWrapper], List[TensorType]]]]): A callable + that takes the Policy, Model, the observation batch, an + explore-flag, a timestep, and an is_training flag and returns a + tuple of a) distribution inputs (parameters), b) a dist-class to + generate an action distribution object from, and c) internal-state + outputs (empty list if not applicable). If None, will either use + `action_sampler_fn` or compute actions by calling self.model, + then sampling from the parameterized action distribution. 
+ make_model (Optional[Callable[[Policy, gym.spaces.Space, + gym.spaces.Space, TrainerConfigDict], ModelV2]]): Optional callable + that takes the same arguments as Policy.__init__ and returns a + model instance. The distribution class will be determined + automatically. Note: Only one of `make_model` or + `make_model_and_action_dist` should be provided. If both are None, + a default Model will be created. + make_model_and_action_dist (Optional[Callable[[Policy, + gym.spaces.Space, gym.spaces.Space, TrainerConfigDict], + Tuple[ModelV2, Type[TorchDistributionWrapper]]]]): Optional + callable that takes the same arguments as Policy.__init__ and + returns a tuple of model instance and torch action distribution + class. + Note: Only one of `make_model` or `make_model_and_action_dist` + should be provided. If both are None, a default Model will be + created. + apply_gradients_fn (Optional[Callable[[Policy, + "torch.optim.Optimizer"], None]]): Optional callable that + takes a grads list and applies these to the Model's parameters. + If None, will call the `TorchPolicy.apply_gradients()` method + instead. + mixins (Optional[List[type]]): Optional list of any class mixins for + the returned policy class. These mixins will be applied in order + and will have higher precedence than the TorchPolicy class. + view_requirements_fn (Optional[Callable[[Policy], + Dict[str, ViewRequirement]]]): An optional callable to retrieve + additional train view requirements for this policy. + get_batch_divisibility_req (Optional[Callable[[Policy], int]]): + Optional callable that returns the divisibility requirement for + sample batches. If None, will assume a value of 1. + + Returns: + Type[TorchPolicy]: TorchPolicy child class constructed from the + specified args. + """ + + original_kwargs = locals().copy() + parent_cls = TorchPolicy if framework == "torch" else JAXPolicy + base = add_mixins(parent_cls, mixins) + + class policy_cls(base): + def __init__(self, obs_space, action_space, config): + # Set up the config from possible default-config fn and given + # config arg. + if get_default_config: + config = dict(get_default_config(), **config) + self.config = config + + # Set the DL framework for this Policy. + self.framework = framework + if "framework" not in self.config: + self.config["framework"] = framework + assert framework == self.config["framework"] + + # Validate observation- and action-spaces. + if validate_spaces: + validate_spaces(self, obs_space, action_space, self.config) + + # Do some pre-initialization steps. + if before_init: + before_init(self, obs_space, action_space, self.config) + + # Model is customized (use default action dist class). + if make_model: + assert make_model_and_action_dist is None, \ + "Either `make_model` or `make_model_and_action_dist`" \ + " must be None!" + self.model = make_model(self, obs_space, action_space, config) + dist_class, _ = ModelCatalog.get_action_dist( + action_space, self.config["model"], framework=framework) + # Model and action dist class are customized. + elif make_model_and_action_dist: + self.model, dist_class = make_model_and_action_dist( + self, obs_space, action_space, config) + # Use default model and default action dist. 
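A quick aside on the two customization hooks checked above (the default-model else-branch continues right after this note): a hypothetical sketch of a custom `make_model_and_action_dist` for a Discrete action space. Only the `JAXCategorical` import is taken from this patch; `MyJAXModel` is a placeholder for any ModelV2 subclass of the chosen framework.

    from ray.rllib.models.jax.jax_action_dist import JAXCategorical

    def my_make_model_and_action_dist(policy, obs_space, action_space,
                                      config):
        # Return the model together with the action-dist class to
        # sample from; the factory then skips the ModelCatalog lookup.
        model = MyJAXModel(obs_space, action_space,
                           num_outputs=action_space.n,
                           model_config=config["model"], name="my_model")
        return model, JAXCategorical

Patch 04 further down converts e.g. MBMPO to exactly this hook.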
+ else: + dist_class, logit_dim = ModelCatalog.get_action_dist( + action_space, self.config["model"], framework=framework) + self.model = ModelCatalog.get_model_v2( + obs_space=obs_space, + action_space=action_space, + num_outputs=logit_dim, + model_config=self.config["model"], + framework=framework) + + # Make sure, we passed in a correct Model factory. + model_cls = TorchModelV2 if framework == "torch" else JAXModelV2 + assert isinstance(self.model, model_cls), \ + "ERROR: Generated Model must be a TorchModelV2 object!" + + # Call the framework-specific Policy constructor. + self.parent_cls = parent_cls + self.parent_cls.__init__( + self, + observation_space=obs_space, + action_space=action_space, + config=config, + model=self.model, + loss=loss_fn, + action_distribution_class=dist_class, + action_sampler_fn=action_sampler_fn, + action_distribution_fn=action_distribution_fn, + max_seq_len=config["model"]["max_seq_len"], + get_batch_divisibility_req=get_batch_divisibility_req, + ) + + # Update this Policy's ViewRequirements (if function given). + if callable(view_requirements_fn): + self.view_requirements.update(view_requirements_fn(self)) + # Merge Model's view requirements into Policy's. + self.view_requirements.update( + self.model.inference_view_requirements) + + _before_loss_init = before_loss_init or after_init + if _before_loss_init: + _before_loss_init(self, self.observation_space, + self.action_space, config) + + # Perform test runs through postprocessing- and loss functions. + self._initialize_loss_from_dummy_batch( + auto_remove_unneeded_view_reqs=True, + stats_fn=stats_fn, + ) + + if _after_loss_init: + _after_loss_init(self, obs_space, action_space, config) + + # Got to reset global_timestep again after this fake run-through. + self.global_timestep = 0 + + @override(Policy) + def postprocess_trajectory(self, + sample_batch, + other_agent_batches=None, + episode=None): + # Do all post-processing always with no_grad(). + # Not using this here will introduce a memory leak (issue #6962). + with torch.no_grad(): + # Call super's postprocess_trajectory first. + sample_batch = super().postprocess_trajectory( + sample_batch, other_agent_batches, episode) + if postprocess_fn: + return postprocess_fn(self, sample_batch, + other_agent_batches, episode) + + return sample_batch + + @override(parent_cls) + def extra_grad_process(self, optimizer, loss): + """Called after optimizer.zero_grad() and loss.backward() calls. + + Allows for gradient processing before optimizer.step() is called. + E.g. for gradient clipping. + """ + if extra_grad_process_fn: + return extra_grad_process_fn(self, optimizer, loss) + else: + return parent_cls.extra_grad_process(self, optimizer, loss) + + @override(parent_cls) + def extra_compute_grad_fetches(self): + if extra_learn_fetches_fn: + fetches = convert_to_non_torch_type( + extra_learn_fetches_fn(self)) + # Auto-add empty learner stats dict if needed. 
+ return dict({LEARNER_STATS_KEY: {}}, **fetches) + else: + return parent_cls.extra_compute_grad_fetches(self) + + @override(parent_cls) + def apply_gradients(self, gradients): + if apply_gradients_fn: + apply_gradients_fn(self, gradients) + else: + parent_cls.apply_gradients(self, gradients) + + @override(parent_cls) + def extra_action_out(self, input_dict, state_batches, model, + action_dist): + with self._no_grad_context(): + if extra_action_out_fn: + stats_dict = extra_action_out_fn( + self, input_dict, state_batches, model, action_dist) + else: + stats_dict = parent_cls.extra_action_out( + self, input_dict, state_batches, model, action_dist) + return self._convert_to_non_torch_type(stats_dict) + + @override(parent_cls) + def optimizer(self): + if optimizer_fn: + optimizers = optimizer_fn(self, self.config) + else: + optimizers = parent_cls.optimizer(self) + optimizers = force_list(optimizers) + if getattr(self, "exploration", None): + optimizers = self.exploration.get_exploration_optimizer( + optimizers) + return optimizers + + @override(parent_cls) + def extra_grad_info(self, train_batch): + with self._no_grad_context(): + if stats_fn: + stats_dict = stats_fn(self, train_batch) + else: + stats_dict = self.parent_cls.extra_grad_info( + self, train_batch) + return self._convert_to_non_torch_type(stats_dict) + + def _no_grad_context(self): + if self.framework == "torch": + return torch.no_grad() + return NullContextManager() + + def _convert_to_non_torch_type(self, data): + if self.framework == "torch": + return convert_to_non_torch_type(data) + return data + + def with_updates(**overrides): + """Creates a Torch|JAXPolicy cls based on settings of another one. + + Keyword Args: + **overrides: The settings (passed into `build_torch_policy`) that + should be different from the class that this method is called + on. + + Returns: + type: A new Torch|JAXPolicy sub-class. + + Examples: + >> MySpecialDQNPolicyClass = DQNTorchPolicy.with_updates( + .. name="MySpecialDQNPolicyClass", + .. loss_function=[some_new_loss_function], + .. ) + """ + return build_policy_class(**dict(original_kwargs, **overrides)) + + policy_cls.with_updates = staticmethod(with_updates) + policy_cls.__name__ = name + policy_cls.__qualname__ = name + return policy_cls diff --git a/rllib/policy/torch_policy_template.py b/rllib/policy/torch_policy_template.py index dd2aadfa6fad..abb8a055507a 100644 --- a/rllib/policy/torch_policy_template.py +++ b/rllib/policy/torch_policy_template.py @@ -11,6 +11,7 @@ from ray.rllib.policy.view_requirement import ViewRequirement from ray.rllib.utils import add_mixins, force_list from ray.rllib.utils.annotations import override, DeveloperAPI +from ray.rllib.utils.deprecation import deprecation_warning from ray.rllib.utils.framework import try_import_torch from ray.rllib.utils.torch_ops import convert_to_non_torch_type from ray.rllib.utils.typing import TensorType, TrainerConfigDict @@ -186,6 +187,8 @@ def build_torch_policy( specified args. 
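To make the factory above concrete, a rough usage sketch: `my_loss_fn` and `my_stats_fn` stand in for user-provided callables, everything else is taken from this patch.

    import ray
    from ray.rllib.policy.policy_template import build_policy_class

    MyJAXPolicy = build_policy_class(
        name="MyJAXPolicy",
        framework="jax",
        loss_fn=my_loss_fn,    # user-provided loss callable
        stats_fn=my_stats_fn,  # user-provided stats callable
        get_default_config=lambda: ray.rllib.agents.pg.pg.DEFAULT_CONFIG,
    )

    # Deriving a torch flavor only overrides the settings that differ:
    MyTorchPolicy = MyJAXPolicy.with_updates(
        name="MyTorchPolicy", framework="torch")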
""" + deprecation_warning("build_torch_policy", "build_policy_class(framework='torch')", error=False) + original_kwargs = locals().copy() base = add_mixins(TorchPolicy, mixins) diff --git a/rllib/utils/__init__.py b/rllib/utils/__init__.py index e0c2dda3b979..276cdeaf7061 100644 --- a/rllib/utils/__init__.py +++ b/rllib/utils/__init__.py @@ -53,6 +53,19 @@ def force_list(elements=None, to_tuple=False): if type(elements) in [list, tuple] else ctor([elements]) +class NullContextManager: + """No-op context manager""" + + def __init__(self): + pass + + def __enter__(self): + pass + + def __exit__(self, *args): + pass + + force_tuple = partial(force_list, to_tuple=True) __all__ = [ diff --git a/rllib/utils/exploration/curiosity.py b/rllib/utils/exploration/curiosity.py index 175ce80193a0..a9434e1a1174 100644 --- a/rllib/utils/exploration/curiosity.py +++ b/rllib/utils/exploration/curiosity.py @@ -4,12 +4,13 @@ from ray.rllib.models.action_dist import ActionDistribution from ray.rllib.models.catalog import ModelCatalog -from ray.rllib.models.modelv2 import ModelV2, NullContextManager +from ray.rllib.models.modelv2 import ModelV2 from ray.rllib.models.tf.tf_action_dist import Categorical, MultiCategorical from ray.rllib.models.torch.misc import SlimFC from ray.rllib.models.torch.torch_action_dist import TorchCategorical, \ TorchMultiCategorical from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils import NullContextManager from ray.rllib.utils.annotations import override from ray.rllib.utils.exploration.exploration import Exploration from ray.rllib.utils.framework import get_activation_fn, try_import_tf, \ diff --git a/rllib/utils/exploration/stochastic_sampling.py b/rllib/utils/exploration/stochastic_sampling.py index c40fae254b5c..c3f735231ed8 100644 --- a/rllib/utils/exploration/stochastic_sampling.py +++ b/rllib/utils/exploration/stochastic_sampling.py @@ -8,9 +8,10 @@ from ray.rllib.utils.annotations import override from ray.rllib.utils.exploration.exploration import Exploration from ray.rllib.utils.exploration.random import Random -from ray.rllib.utils.framework import get_variable, try_import_tf, \ - try_import_torch, TensorType +from ray.rllib.utils.framework import get_variable, try_import_jax, \ + try_import_tf, try_import_torch, TensorType +jax, _ = try_import_jax() tf1, tf, tfv = try_import_tf() torch, _ = try_import_torch() @@ -36,7 +37,7 @@ def __init__(self, Args: action_space (gym.spaces.Space): The gym action space used by the environment. - framework (str): One of None, "tf", "torch". + framework (str): One of None, "jax", "tf", "torch". model (ModelV2): The ModelV2 used by the owning Policy. random_timesteps (int): The number of timesteps for which to act completely randomly. 
Only after this number of timesteps, @@ -65,9 +66,9 @@ def get_exploration_action(self, action_distribution: ActionDistribution, timestep: Union[int, TensorType], explore: bool = True): - if self.framework == "torch": - return self._get_torch_exploration_action(action_distribution, - timestep, explore) + if self.framework in ["torch", "jax"]: + return self._get_exploration_action(action_distribution, + timestep, explore) else: return self._get_tf_exploration_action_op(action_distribution, timestep, explore) @@ -114,9 +115,9 @@ def logp_false_fn(): with tf1.control_dependencies([assign_op]): return action, logp - def _get_torch_exploration_action(self, action_dist: ActionDistribution, - timestep: Union[TensorType, int], - explore: Union[TensorType, bool]): + def _get_exploration_action(self, action_dist: ActionDistribution, + timestep: Union[TensorType, int], + explore: Union[TensorType, bool]): # Set last timestep or (if not given) increase by one. self.last_timestep = timestep if timestep is not None else \ self.last_timestep + 1 @@ -136,6 +137,7 @@ def _get_torch_exploration_action(self, action_dist: ActionDistribution, # No exploration -> Return deterministic actions. else: action = action_dist.deterministic_sample() - logp = torch.zeros_like(action_dist.sampled_action_logp()) + fw = torch if self.framework == "torch" else jax.numpy + logp = fw.zeros_like(action_dist.sampled_action_logp()) return action, logp diff --git a/rllib/utils/framework.py b/rllib/utils/framework.py index 9025b87364cd..b5cda9964622 100644 --- a/rllib/utils/framework.py +++ b/rllib/utils/framework.py @@ -16,13 +16,13 @@ def try_import_jax(error=False): - """Tries importing JAX and returns the module (or None). + """Tries importing JAX and FLAX and returns both modules (or Nones). Args: - error (bool): Whether to raise an error if JAX cannot be imported. + error (bool): Whether to raise an error if JAX/FLAX cannot be imported. Returns: - The jax module. + Tuple: The jax- and the flax modules. Raises: ImportError: If error=True and JAX is not installed. @@ -281,7 +281,7 @@ def get_activation_fn(name: Optional[str] = None, framework: str = "tf"): elif framework == "jax": if name in ["linear", None]: return None - jax = try_import_jax() + jax, _ = try_import_jax() if name == "swish": return jax.nn.swish if name == "relu": diff --git a/rllib/utils/jax_ops.py b/rllib/utils/jax_ops.py new file mode 100644 index 000000000000..04a0a1581a7d --- /dev/null +++ b/rllib/utils/jax_ops.py @@ -0,0 +1,32 @@ +from ray.rllib.utils.framework import try_import_jax + +jax, _ = try_import_jax() +jnp = None +if jax: + import jax.numpy as jnp + + +def explained_variance(y, pred): + y_var = jnp.var(y, axis=[0]) + diff_var = jnp.var(y - pred, axis=[0]) + min_ = 1.0 + return jnp.maximum(min_, 1 - (diff_var / y_var)) + + +def sequence_mask(lengths, maxlen=None, dtype=None, time_major=False): + """Offers same behavior as tf.sequence_mask for JAX numpy. + + Thanks to Dimitris Papatheodorou + (https://discuss.pytorch.org/t/pytorch-equivalent-for-tf-sequence-mask/ + 39036). 
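An aside on the two helpers added in jax_ops.py above: `explained_variance` computes 1 - Var(y - pred) / Var(y), where 1.0 means the value function predicts the targets perfectly and 0.0 means it does no better than predicting their mean (the existing tf/torch counterparts clip this from below at -1.0). `sequence_mask` is intended to reproduce `tf.sequence_mask`; assuming the torch-style `.t()`/`.type()` calls in its body are read as their jnp equivalents `.T`/`.astype`, the expected behavior is:

    sequence_mask(jnp.array([1, 3]), maxlen=4)
    # -> [[ True, False, False, False],
    #     [ True,  True,  True, False]]
    # With time_major=True the final transpose is skipped and the mask
    # comes back with shape (maxlen, batch) instead of (batch, maxlen).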
+ """ + if maxlen is None: + maxlen = int(lengths.max()) + + mask = ~(jnp.ones( + (len(lengths), maxlen)).cumsum(axis=1).t() > lengths) + if not time_major: + mask = mask.t() + mask.type(dtype or jnp.bool_) + + return mask diff --git a/rllib/utils/test_utils.py b/rllib/utils/test_utils.py index 05713c7cb0cb..d0263eff5668 100644 --- a/rllib/utils/test_utils.py +++ b/rllib/utils/test_utils.py @@ -5,7 +5,7 @@ from ray.rllib.utils.framework import try_import_jax, try_import_tf, \ try_import_torch -jax = try_import_jax() +jax, _ = try_import_jax() tf1, tf, tfv = try_import_tf() if tf1: eager_mode = None From 8057572327cab74c0de630cad29f9a475bde654e Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 10 Dec 2020 22:48:28 +0100 Subject: [PATCH 02/16] WIP. --- rllib/models/jax/jax_modelv2.py | 25 +++++++++++++++++++++---- rllib/models/jax/misc.py | 29 ++++++++++++++++++----------- rllib/models/modelv2.py | 2 +- 3 files changed, 40 insertions(+), 16 deletions(-) diff --git a/rllib/models/jax/jax_modelv2.py b/rllib/models/jax/jax_modelv2.py index 447ab5a8de00..629bfbfd53e2 100644 --- a/rllib/models/jax/jax_modelv2.py +++ b/rllib/models/jax/jax_modelv2.py @@ -1,15 +1,16 @@ import gym +from typing import Dict, List, Union from ray.rllib.models.modelv2 import ModelV2 -from ray.rllib.utils.annotations import PublicAPI +from ray.rllib.utils.annotations import override, PublicAPI from ray.rllib.utils.framework import try_import_jax -from ray.rllib.utils.typing import ModelConfigDict +from ray.rllib.utils.typing import ModelConfigDict, TensorType jax, flax = try_import_jax() nn = None if flax: - nn = flax.linen + import flax.linen as nn @PublicAPI @@ -23,7 +24,10 @@ def __init__(self, obs_space: gym.spaces.Space, action_space: gym.spaces.Space, num_outputs: int, model_config: ModelConfigDict, name: str): """Initializes a JAXModelV2 instance.""" - flax.linen.Module() + + nn.Module.__init__(self) + self._flax_module_variables = self.variables + ModelV2.__init__( self, obs_space, @@ -32,3 +36,16 @@ def __init__(self, obs_space: gym.spaces.Space, model_config, name, framework="jax") + + @PublicAPI + @override(ModelV2) + def variables(self, as_dict: bool = False + ) -> Union[List[TensorType], Dict[str, TensorType]]: + return self.variables + + @PublicAPI + @override(ModelV2) + def trainable_variables( + self, as_dict: bool = False + ) -> Union[List[TensorType], Dict[str, TensorType]]: + return self.variables diff --git a/rllib/models/jax/misc.py b/rllib/models/jax/misc.py index 8a9397a1fdb4..106bf542e87f 100644 --- a/rllib/models/jax/misc.py +++ b/rllib/models/jax/misc.py @@ -4,13 +4,13 @@ from ray.rllib.utils.framework import get_activation_fn, try_import_jax jax, flax = try_import_jax() -nn = np = None +nn = jnp = None if flax: import flax.linen as nn - import jax.numpy as np + import jax.numpy as jnp -class SlimFC: +class SlimFC(nn.Module if nn else object): """Simple JAX version of a fully connected layer.""" def __init__(self, @@ -35,29 +35,36 @@ def __init__(self, use for initialization. If None, create a new random one. name (Optional[str]): An optional name for this layer. """ + self.in_size = in_size + self.out_size = out_size + self.use_bias = use_bias + self.name = name # By default, use Glorot unform initializer. if initializer is None: initializer = flax.nn.initializers.xavier_uniform() + self.initializer = initializer self.prng_key = prng_key or jax.random.PRNGKey(int(time.time())) _, self.prng_key = jax.random.split(self.prng_key) + + # Activation function (if any; default=None (linear)). 
+ self.activation_fn = get_activation_fn(activation_fn, "jax") + + def setup(self): # Create the flax dense layer. self._dense = nn.Dense( - out_size, - use_bias=use_bias, - kernel_init=initializer, - name=name, + self.out_size, + use_bias=self.use_bias, + kernel_init=self.initializer, + name=self.name, ) # Initialize it. dummy_in = jax.random.normal( - self.prng_key, (in_size, ), dtype=np.float32) + self.prng_key, (self.in_size, ), dtype=jnp.float32) _, self.prng_key = jax.random.split(self.prng_key) self._params = self._dense.init(self.prng_key, dummy_in) - # Activation function (if any; default=None (linear)). - self.activation_fn = get_activation_fn(activation_fn, "jax") - def __call__(self, x): out = self._dense.apply(self._params, x) if self.activation_fn: diff --git a/rllib/models/modelv2.py b/rllib/models/modelv2.py index cfc106a3a14e..4c83f039e261 100644 --- a/rllib/models/modelv2.py +++ b/rllib/models/modelv2.py @@ -280,7 +280,7 @@ def variables(self, as_dict: bool = False Args: as_dict(bool): Whether variables should be returned as dict-values - (using descriptive keys). + (using descriptive str keys). Returns: Union[List[any],Dict[str,any]]: The list (or dict if `as_dict` is From cf6f408abc9e20df7ae6adfa791cedae93748e2c Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 17 Dec 2020 19:27:43 +0100 Subject: [PATCH 03/16] WIP. --- rllib/models/jax/fcnet.py | 87 +++----- rllib/models/jax/jax_modelv2.py | 7 +- rllib/models/jax/modules/__init__.py | 0 rllib/models/jax/modules/fc_stack.py | 53 +++++ rllib/models/tests/test_jax_models.py | 196 ++++++++++++++++ rllib/policy/policy_template.py | 7 +- rllib/policy/torch_policy_template.py | 307 +------------------------- 7 files changed, 296 insertions(+), 361 deletions(-) create mode 100644 rllib/models/jax/modules/__init__.py create mode 100644 rllib/models/jax/modules/fc_stack.py create mode 100644 rllib/models/tests/test_jax_models.py diff --git a/rllib/models/jax/fcnet.py b/rllib/models/jax/fcnet.py index c2e0d50eda6c..b332bf7f57a5 100644 --- a/rllib/models/jax/fcnet.py +++ b/rllib/models/jax/fcnet.py @@ -3,7 +3,7 @@ import time from ray.rllib.models.jax.jax_modelv2 import JAXModelV2 -from ray.rllib.models.jax.misc import SlimFC +from ray.rllib.models.jax.modules import FCStack from ray.rllib.utils.annotations import override from ray.rllib.utils.framework import try_import_jax @@ -25,78 +25,54 @@ def __init__(self, obs_space, action_space, num_outputs, model_config, activation = model_config.get("fcnet_activation") hiddens = model_config.get("fcnet_hiddens", []) no_final_linear = model_config.get("no_final_linear") + in_features = int(np.product(obs_space.shape)) self.vf_share_layers = model_config.get("vf_share_layers") self.free_log_std = model_config.get("free_log_std") - # Generate free-floating bias variables for the second half of - # the outputs. if self.free_log_std: - assert num_outputs % 2 == 0, ( - "num_outputs must be divisible by two", num_outputs) - num_outputs = num_outputs // 2 + raise ValueError("`free_log_std` not supported for JAX yet!") - self._hidden_layers = [] - prev_layer_size = int(np.product(obs_space.shape)) self._logits = None - # Create layers 0 to second-last. - for size in hiddens[:-1]: - self._hidden_layers.append( - SlimFC( - in_size=prev_layer_size, - out_size=size, - activation_fn=activation)) - prev_layer_size = size - # The last layer is adjusted to be of size num_outputs, but it's a # layer with activation. 
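The hidden stack in the branches below is now built in one shot with the new FCStack module (added in rllib/models/jax/modules/fc_stack.py later in this commit) instead of a hand-rolled list of SlimFC layers. Roughly, and with hypothetical sizes and inputs:

    from ray.rllib.models.jax.modules.fc_stack import FCStack

    # A 2-hidden-layer MLP mapping 4 input features to 2 outputs;
    # __call__ simply applies the layers in sequence.
    stack = FCStack(in_features=4, layers=[256, 256, 2], activation="tanh")
    logits = stack(obs_batch)  # obs_batch: placeholder (batch, 4) array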
if no_final_linear and num_outputs: - self._hidden_layers.append( - SlimFC( - in_size=prev_layer_size, - out_size=num_outputs, - activation_fn=activation)) - prev_layer_size = num_outputs + self._hidden_layers = FCStack( + in_features=in_features, + layers=hiddens + [num_outputs], + activation=activation, + ) + # Finish the layers with the provided sizes (`hiddens`), plus - # iff num_outputs > 0 - a last linear layer of size num_outputs. else: if len(hiddens) > 0: - self._hidden_layers.append( - SlimFC( - in_size=prev_layer_size, - out_size=hiddens[-1], - activation_fn=activation)) + self._hidden_layers = FCStack( + in_features=in_features, + layers=hiddens, + activation=activation + ) prev_layer_size = hiddens[-1] if num_outputs: - self._logits = SlimFC( - in_size=prev_layer_size, - out_size=num_outputs, - activation_fn=None) + self._logits = FCStack( + in_features=prev_layer_size, + layers=[num_outputs], + activation=None, + ) else: self.num_outputs = ( [int(np.product(obs_space.shape))] + hiddens[-1:])[-1] - # Layer to add the log std vars to the state-dependent means. - if self.free_log_std and self._logits: - raise ValueError("`free_log_std` not supported for JAX yet!") - self._value_branch_separate = None if not self.vf_share_layers: # Build a parallel set of hidden layers for the value net. - prev_vf_layer_size = int(np.product(obs_space.shape)) - vf_layers = [] - for size in hiddens: - vf_layers.append( - SlimFC( - in_size=prev_vf_layer_size, - out_size=size, - activation_fn=activation, - )) - prev_vf_layer_size = size - self._value_branch_separate = vf_layers - - self._value_branch = SlimFC( - in_size=prev_layer_size, out_size=1, activation_fn=None) + self._value_branch_separate = FCStack( + in_features=int(np.product(obs_space.shape)), + layers=hiddens, + activation=activation, + ) + self._value_branch = FCStack( + in_features=prev_layer_size, layers=[1]) # Holds the current "base" output (before logits layer). self._features = None # Holds the last input, in case value branch is separate. @@ -105,23 +81,16 @@ def __init__(self, obs_space, action_space, num_outputs, model_config, @override(JAXModelV2) def forward(self, input_dict, state, seq_lens): self._last_flat_in = input_dict["obs_flat"] - x = self._last_flat_in - for layer in self._hidden_layers: - x = layer(x) - self._features = x + self._features = self._hidden_layers(self._last_flat_in) logits = self._logits(self._features) if self._logits else \ self._features - if self.free_log_std: - logits = self._append_free_log_std(logits) return logits, state @override(JAXModelV2) def value_function(self): assert self._features is not None, "must call forward() first" if self._value_branch_separate: - x = self._last_flat_in - for layer in self._value_branch_separate: - x = layer(x) + x = self._value_branch_separate(self._last_flat_in) return self._value_branch(x).squeeze(1) else: return self._value_branch(self._features).squeeze(1) diff --git a/rllib/models/jax/jax_modelv2.py b/rllib/models/jax/jax_modelv2.py index 629bfbfd53e2..c6cd9643c40d 100644 --- a/rllib/models/jax/jax_modelv2.py +++ b/rllib/models/jax/jax_modelv2.py @@ -14,9 +14,11 @@ @PublicAPI -class JAXModelV2(ModelV2, nn.Module if nn else object): +class JAXModelV2(ModelV2): """JAX version of ModelV2. 
+ + Note that this class by itself is not a valid model unless you implement forward() in a subclass.""" @@ -25,9 +27,6 @@ def __init__(self, obs_space: gym.spaces.Space, model_config: ModelConfigDict, name: str): """Initializes a JAXModelV2 instance.""" - nn.Module.__init__(self) - self._flax_module_variables = self.variables - ModelV2.__init__( self, obs_space, diff --git a/rllib/models/jax/modules/__init__.py b/rllib/models/jax/modules/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/rllib/models/jax/modules/fc_stack.py b/rllib/models/jax/modules/fc_stack.py new file mode 100644 index 000000000000..54ae9deab730 --- /dev/null +++ b/rllib/models/jax/modules/fc_stack.py @@ -0,0 +1,53 @@ +import logging +import numpy as np +import time + +from ray.rllib.models.jax.misc import get_activation_fn, SlimFC +from ray.rllib.utils.framework import try_import_jax + +jax, flax = try_import_jax() +nn = None +if flax: + nn = flax.linen + +logger = logging.getLogger(__name__) + + +class FCStack(nn.Module if nn else object): + """Generic fully connected FLAX module.""" + + def __init__(self, in_features, layers, activation=None, prng_key=None): + """Initializes a FCStack instance. + + Args: + in_features (int): Number of input features (input dim). + layers (List[int]): List of Dense layer sizes. + activation (Optional[Union[Callable, str]]): An optional activation + function or activation function specifier (str), such as + "relu". Use None or "linear" for no activation. + """ + super().__init__() + + self.prng_key = prng_key or jax.random.PRNGKey(int(time.time())) + activation_fn = get_activation_fn(activation, framework="jax") + + # Create all layers. + self._layers = [] + prev_layer_size = in_features + for size in layers: + self._hidden_layers.append( + SlimFC( + in_size=prev_layer_size, + out_size=size, + use_bias=self.use_bias, + initializer=self.initializer, + activation_fn=activation_fn, + prng_key=self.prng_key, + )) + prev_layer_size = size + + def __call__(self, inputs): + x = inputs + for layer in self._hidden_layers: + x = layer(x) + return x diff --git a/rllib/models/tests/test_jax_models.py b/rllib/models/tests/test_jax_models.py new file mode 100644 index 000000000000..c5179714533c --- /dev/null +++ b/rllib/models/tests/test_jax_models.py @@ -0,0 +1,196 @@ +from functools import partial +import numpy as np +from gym.spaces import Box, Dict, Tuple +from scipy.stats import beta, norm +import tree +import unittest + +from ray.rllib.models.jax.jax_action_dist import JAXCategorical +from ray.rllib.models.tf.tf_action_dist import Beta, Categorical, \ + DiagGaussian, GumbelSoftmax, MultiActionDistribution, MultiCategorical, \ + SquashedGaussian +from ray.rllib.models.torch.torch_action_dist import TorchBeta, \ + TorchCategorical, TorchDiagGaussian, TorchMultiActionDistribution, \ + TorchMultiCategorical, TorchSquashedGaussian +from ray.rllib.utils.framework import try_import_tf, try_import_torch +from ray.rllib.utils.numpy import MIN_LOG_NN_OUTPUT, MAX_LOG_NN_OUTPUT, \ + softmax, SMALL_NUMBER, LARGE_INTEGER +from ray.rllib.utils.test_utils import check, framework_iterator + +tf1, tf, tfv = try_import_tf() +torch, _ = try_import_torch() + + +class TestJAXModels(unittest.TestCase): + + def test_jax_fcnet(self): + """Tests the MultiActionDistribution (across all frameworks).""" + batch_size = 1000 + input_space = Tuple([ + Box(-10.0, 10.0, shape=(batch_size, 4)), + Box(-2.0, 2.0, shape=( + batch_size, + 6, + )), + Dict({ + "a": Box(-1.0, 1.0, shape=(batch_size, 4)) + 
}), + ]) + std_space = Box( + -0.05, 0.05, shape=( + batch_size, + 3, + )) + + low, high = -1.0, 1.0 + value_space = Tuple([ + Box(0, 3, shape=(batch_size, ), dtype=np.int32), + Box(-2.0, 2.0, shape=(batch_size, 3), dtype=np.float32), + Dict({ + "a": Box(0.0, 1.0, shape=(batch_size, 2), dtype=np.float32) + }) + ]) + + for fw, sess in framework_iterator(session=True): + if fw == "torch": + cls = TorchMultiActionDistribution + child_distr_cls = [ + TorchCategorical, TorchDiagGaussian, + partial(TorchBeta, low=low, high=high) + ] + else: + cls = MultiActionDistribution + child_distr_cls = [ + Categorical, + DiagGaussian, + partial(Beta, low=low, high=high), + ] + + inputs = list(input_space.sample()) + distr = cls( + np.concatenate([inputs[0], inputs[1], inputs[2]["a"]], axis=1), + model={}, + action_space=value_space, + child_distributions=child_distr_cls, + input_lens=[4, 6, 4]) + + # Adjust inputs for the Beta distr just as Beta itself does. + inputs[2]["a"] = np.clip(inputs[2]["a"], np.log(SMALL_NUMBER), + -np.log(SMALL_NUMBER)) + inputs[2]["a"] = np.log(np.exp(inputs[2]["a"]) + 1.0) + 1.0 + # Sample deterministically. + expected_det = [ + np.argmax(inputs[0], axis=-1), + inputs[1][:, :3], # [:3]=Mean values. + # Mean for a Beta distribution: + # 1 / [1 + (beta/alpha)] * range + low + (1.0 / (1.0 + inputs[2]["a"][:, 2:] / inputs[2]["a"][:, 0:2])) + * (high - low) + low, + ] + out = distr.deterministic_sample() + if sess: + out = sess.run(out) + check(out[0], expected_det[0]) + check(out[1], expected_det[1]) + check(out[2]["a"], expected_det[2]) + + # Stochastic sampling -> expect roughly the mean. + inputs = list(input_space.sample()) + # Fix categorical inputs (not needed for distribution itself, but + # for our expectation calculations). + inputs[0] = softmax(inputs[0], -1) + # Fix std inputs (shouldn't be too large for this test). + inputs[1][:, 3:] = std_space.sample() + # Adjust inputs for the Beta distr just as Beta itself does. + inputs[2]["a"] = np.clip(inputs[2]["a"], np.log(SMALL_NUMBER), + -np.log(SMALL_NUMBER)) + inputs[2]["a"] = np.log(np.exp(inputs[2]["a"]) + 1.0) + 1.0 + distr = cls( + np.concatenate([inputs[0], inputs[1], inputs[2]["a"]], axis=1), + model={}, + action_space=value_space, + child_distributions=child_distr_cls, + input_lens=[4, 6, 4]) + expected_mean = [ + np.mean(np.sum(inputs[0] * np.array([0, 1, 2, 3]), -1)), + inputs[1][:, :3], # [:3]=Mean values. + # Mean for a Beta distribution: + # 1 / [1 + (beta/alpha)] * range + low + (1.0 / (1.0 + inputs[2]["a"][:, 2:] / inputs[2]["a"][:, :2])) * + (high - low) + low, + ] + out = distr.sample() + if sess: + out = sess.run(out) + out = list(out) + if fw == "torch": + out[0] = out[0].numpy() + out[1] = out[1].numpy() + out[2]["a"] = out[2]["a"].numpy() + check(np.mean(out[0]), expected_mean[0], decimals=1) + check(np.mean(out[1], 0), np.mean(expected_mean[1], 0), decimals=1) + check( + np.mean(out[2]["a"], 0), + np.mean(expected_mean[2], 0), + decimals=1) + + # Test log-likelihood outputs. + # Make sure beta-values are within 0.0 and 1.0 for the numpy + # calculation (which doesn't have scaling). + inputs = list(input_space.sample()) + # Adjust inputs for the Beta distr just as Beta itself does. 
+ inputs[2]["a"] = np.clip(inputs[2]["a"], np.log(SMALL_NUMBER), + -np.log(SMALL_NUMBER)) + inputs[2]["a"] = np.log(np.exp(inputs[2]["a"]) + 1.0) + 1.0 + distr = cls( + np.concatenate([inputs[0], inputs[1], inputs[2]["a"]], axis=1), + model={}, + action_space=value_space, + child_distributions=child_distr_cls, + input_lens=[4, 6, 4]) + inputs[0] = softmax(inputs[0], -1) + values = list(value_space.sample()) + log_prob_beta = np.log( + beta.pdf(values[2]["a"], inputs[2]["a"][:, :2], + inputs[2]["a"][:, 2:])) + # Now do the up-scaling for [2] (beta values) to be between + # low/high. + values[2]["a"] = values[2]["a"] * (high - low) + low + inputs[1][:, 3:] = np.exp(inputs[1][:, 3:]) + expected_log_llh = np.sum( + np.concatenate([ + np.expand_dims( + np.log( + [i[values[0][j]] + for j, i in enumerate(inputs[0])]), -1), + np.log( + norm.pdf(values[1], inputs[1][:, :3], + inputs[1][:, 3:])), log_prob_beta + ], -1), -1) + + values[0] = np.expand_dims(values[0], -1) + if fw == "torch": + values = tree.map_structure(lambda s: torch.Tensor(s), values) + # Test all flattened input. + concat = np.concatenate(tree.flatten(values), + -1).astype(np.float32) + out = distr.logp(concat) + if sess: + out = sess.run(out) + check(out, expected_log_llh, atol=15) + # Test structured input. + out = distr.logp(values) + if sess: + out = sess.run(out) + check(out, expected_log_llh, atol=15) + # Test flattened input. + out = distr.logp(tree.flatten(values)) + if sess: + out = sess.run(out) + check(out, expected_log_llh, atol=15) + + +if __name__ == "__main__": + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib/policy/policy_template.py b/rllib/policy/policy_template.py index d9ded6c75e57..4d75ff989451 100644 --- a/rllib/policy/policy_template.py +++ b/rllib/policy/policy_template.py @@ -21,6 +21,7 @@ torch, _ = try_import_torch() +# TODO: (sven) Unify this with `build_tf_policy` as well. @DeveloperAPI def build_policy_class( name: str, @@ -293,8 +294,9 @@ def postprocess_trajectory(self, other_agent_batches=None, episode=None): # Do all post-processing always with no_grad(). - # Not using this here will introduce a memory leak (issue #6962). - with torch.no_grad(): + # Not using this here will introduce a memory leak + # in torch (issue #6962). + with self._no_grad_context(): # Call super's postprocess_trajectory first. 
sample_batch = super().postprocess_trajectory( sample_batch, other_agent_batches, episode) @@ -396,6 +398,7 @@ def with_updates(**overrides): """ return build_policy_class(**dict(original_kwargs, **overrides)) + policy_cls.with_updates = staticmethod(with_updates) policy_cls.__name__ = name policy_cls.__qualname__ = name diff --git a/rllib/policy/torch_policy_template.py b/rllib/policy/torch_policy_template.py index 998411aa6270..2da390c56e6e 100644 --- a/rllib/policy/torch_policy_template.py +++ b/rllib/policy/torch_policy_template.py @@ -1,18 +1,15 @@ import gym from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union -from ray.rllib.models.catalog import ModelCatalog from ray.rllib.models.modelv2 import ModelV2 from ray.rllib.models.torch.torch_action_dist import TorchDistributionWrapper -from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 -from ray.rllib.policy.policy import Policy, LEARNER_STATS_KEY +from ray.rllib.policy.policy import Policy +from ray.rllib.policy.policy_template import build_policy_class from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.policy.torch_policy import TorchPolicy -from ray.rllib.utils import add_mixins, force_list -from ray.rllib.utils.annotations import override, DeveloperAPI +from ray.rllib.utils.annotations import DeveloperAPI from ray.rllib.utils.deprecation import deprecation_warning from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.torch_ops import convert_to_non_torch_type from ray.rllib.utils.typing import TensorType, TrainerConfigDict torch, _ = try_import_torch() @@ -39,7 +36,6 @@ def build_torch_policy( extra_grad_process_fn: Optional[Callable[[ Policy, "torch.optim.Optimizer", TensorType ], Dict[str, TensorType]]] = None, - # TODO: (sven) Replace "fetches" with "process". extra_learn_fetches_fn: Optional[Callable[[Policy], Dict[ str, TensorType]]] = None, optimizer_fn: Optional[Callable[[Policy, TrainerConfigDict], @@ -72,293 +68,12 @@ def build_torch_policy( mixins: Optional[List[type]] = None, get_batch_divisibility_req: Optional[Callable[[Policy], int]] = None ) -> Type[TorchPolicy]: - """Helper function for creating a torch policy class at runtime. - Args: - name (str): name of the policy (e.g., "PPOTorchPolicy") - loss_fn (Optional[Callable[[Policy, ModelV2, - Type[TorchDistributionWrapper], SampleBatch], Union[TensorType, - List[TensorType]]]]): Callable that returns a loss tensor. - get_default_config (Optional[Callable[[None], TrainerConfigDict]]): - Optional callable that returns the default config to merge with any - overrides. If None, uses only(!) the user-provided - PartialTrainerConfigDict as dict for this Policy. - postprocess_fn (Optional[Callable[[Policy, SampleBatch, - Optional[Dict[Any, SampleBatch]], Optional["MultiAgentEpisode"]], - SampleBatch]]): Optional callable for post-processing experience - batches (called after the super's `postprocess_trajectory` method). - stats_fn (Optional[Callable[[Policy, SampleBatch], - Dict[str, TensorType]]]): Optional callable that returns a dict of - values given the policy and training batch. If None, - will use `TorchPolicy.extra_grad_info()` instead. The stats dict is - used for logging (e.g. in TensorBoard). - extra_action_out_fn (Optional[Callable[[Policy, Dict[str, TensorType], - List[TensorType], ModelV2, TorchDistributionWrapper]], Dict[str, - TensorType]]]): Optional callable that returns a dict of extra - values to include in experiences. If None, no extra computations - will be performed. 
- extra_grad_process_fn (Optional[Callable[[Policy, - "torch.optim.Optimizer", TensorType], Dict[str, TensorType]]]): - Optional callable that is called after gradients are computed and - returns a processing info dict. If None, will call the - `TorchPolicy.extra_grad_process()` method instead. - # TODO: (sven) dissolve naming mismatch between "learn" and "compute.." - extra_learn_fetches_fn (Optional[Callable[[Policy], - Dict[str, TensorType]]]): Optional callable that returns a dict of - extra tensors from the policy after loss evaluation. If None, - will call the `TorchPolicy.extra_compute_grad_fetches()` method - instead. - optimizer_fn (Optional[Callable[[Policy, TrainerConfigDict], - "torch.optim.Optimizer"]]): Optional callable that returns a - torch optimizer given the policy and config. If None, will call - the `TorchPolicy.optimizer()` method instead (which returns a - torch Adam optimizer). - validate_spaces (Optional[Callable[[Policy, gym.Space, gym.Space, - TrainerConfigDict], None]]): Optional callable that takes the - Policy, observation_space, action_space, and config to check for - correctness. If None, no spaces checking will be done. - before_init (Optional[Callable[[Policy, gym.Space, gym.Space, - TrainerConfigDict], None]]): Optional callable to run at the - beginning of `Policy.__init__` that takes the same arguments as - the Policy constructor. If None, this step will be skipped. - before_loss_init (Optional[Callable[[Policy, gym.spaces.Space, - gym.spaces.Space, TrainerConfigDict], None]]): Optional callable to - run prior to loss init. If None, this step will be skipped. - after_init (Optional[Callable[[Policy, gym.Space, gym.Space, - TrainerConfigDict], None]]): DEPRECATED: Use `before_loss_init` - instead. - _after_loss_init (Optional[Callable[[Policy, gym.spaces.Space, - gym.spaces.Space, TrainerConfigDict], None]]): Optional callable to - run after the loss init. If None, this step will be skipped. - This will be deprecated at some point and renamed into `after_init` - to match `build_tf_policy()` behavior. - action_sampler_fn (Optional[Callable[[TensorType, List[TensorType]], - Tuple[TensorType, TensorType]]]): Optional callable returning a - sampled action and its log-likelihood given some (obs and state) - inputs. If None, will either use `action_distribution_fn` or - compute actions by calling self.model, then sampling from the - so parameterized action distribution. - action_distribution_fn (Optional[Callable[[Policy, ModelV2, TensorType, - TensorType, TensorType], Tuple[TensorType, - Type[TorchDistributionWrapper], List[TensorType]]]]): A callable - that takes the Policy, Model, the observation batch, an - explore-flag, a timestep, and an is_training flag and returns a - tuple of a) distribution inputs (parameters), b) a dist-class to - generate an action distribution object from, and c) internal-state - outputs (empty list if not applicable). If None, will either use - `action_sampler_fn` or compute actions by calling self.model, - then sampling from the parameterized action distribution. - make_model (Optional[Callable[[Policy, gym.spaces.Space, - gym.spaces.Space, TrainerConfigDict], ModelV2]]): Optional callable - that takes the same arguments as Policy.__init__ and returns a - model instance. The distribution class will be determined - automatically. Note: Only one of `make_model` or - `make_model_and_action_dist` should be provided. If both are None, - a default Model will be created. 
- make_model_and_action_dist (Optional[Callable[[Policy, - gym.spaces.Space, gym.spaces.Space, TrainerConfigDict], - Tuple[ModelV2, Type[TorchDistributionWrapper]]]]): Optional - callable that takes the same arguments as Policy.__init__ and - returns a tuple of model instance and torch action distribution - class. - Note: Only one of `make_model` or `make_model_and_action_dist` - should be provided. If both are None, a default Model will be - created. - apply_gradients_fn (Optional[Callable[[Policy, - "torch.optim.Optimizer"], None]]): Optional callable that - takes a grads list and applies these to the Model's parameters. - If None, will call the `TorchPolicy.apply_gradients()` method - instead. - mixins (Optional[List[type]]): Optional list of any class mixins for - the returned policy class. These mixins will be applied in order - and will have higher precedence than the TorchPolicy class. - get_batch_divisibility_req (Optional[Callable[[Policy], int]]): - Optional callable that returns the divisibility requirement for - sample batches. If None, will assume a value of 1. - - Returns: - Type[TorchPolicy]: TorchPolicy child class constructed from the - specified args. - """ - - deprecation_warning("build_torch_policy", "build_policy_class(framework='torch')", error=False) - - original_kwargs = locals().copy() - base = add_mixins(TorchPolicy, mixins) - - class policy_cls(base): - def __init__(self, obs_space, action_space, config): - if get_default_config: - config = dict(get_default_config(), **config) - self.config = config - - if validate_spaces: - validate_spaces(self, obs_space, action_space, self.config) - - if before_init: - before_init(self, obs_space, action_space, self.config) - - # Model is customized (use default action dist class). - if make_model: - assert make_model_and_action_dist is None, \ - "Either `make_model` or `make_model_and_action_dist`" \ - " must be None!" - self.model = make_model(self, obs_space, action_space, config) - dist_class, _ = ModelCatalog.get_action_dist( - action_space, self.config["model"], framework="torch") - # Model and action dist class are customized. - elif make_model_and_action_dist: - self.model, dist_class = make_model_and_action_dist( - self, obs_space, action_space, config) - # Use default model and default action dist. - else: - dist_class, logit_dim = ModelCatalog.get_action_dist( - action_space, self.config["model"], framework="torch") - self.model = ModelCatalog.get_model_v2( - obs_space=obs_space, - action_space=action_space, - num_outputs=logit_dim, - model_config=self.config["model"], - framework="torch") - - # Make sure, we passed in a correct Model factory. - assert isinstance(self.model, TorchModelV2), \ - "ERROR: Generated Model must be a TorchModelV2 object!" - - TorchPolicy.__init__( - self, - observation_space=obs_space, - action_space=action_space, - config=config, - model=self.model, - loss=loss_fn, - action_distribution_class=dist_class, - action_sampler_fn=action_sampler_fn, - action_distribution_fn=action_distribution_fn, - max_seq_len=config["model"]["max_seq_len"], - get_batch_divisibility_req=get_batch_divisibility_req, - ) - - # Merge Model's view requirements into Policy's. - self.view_requirements.update( - self.model.inference_view_requirements) - - _before_loss_init = before_loss_init or after_init - if _before_loss_init: - _before_loss_init(self, self.observation_space, - self.action_space, config) - - # Perform test runs through postprocessing- and loss functions. 
- self._initialize_loss_from_dummy_batch( - auto_remove_unneeded_view_reqs=True, - stats_fn=stats_fn, - ) - - if _after_loss_init: - _after_loss_init(self, obs_space, action_space, config) - - # Got to reset global_timestep again after this fake run-through. - self.global_timestep = 0 - - @override(Policy) - def postprocess_trajectory(self, - sample_batch, - other_agent_batches=None, - episode=None): - # Do all post-processing always with no_grad(). - # Not using this here will introduce a memory leak (issue #6962). - with torch.no_grad(): - # Call super's postprocess_trajectory first. - sample_batch = super().postprocess_trajectory( - sample_batch, other_agent_batches, episode) - if postprocess_fn: - return postprocess_fn(self, sample_batch, - other_agent_batches, episode) - - return sample_batch - - @override(TorchPolicy) - def extra_grad_process(self, optimizer, loss): - """Called after optimizer.zero_grad() and loss.backward() calls. - - Allows for gradient processing before optimizer.step() is called. - E.g. for gradient clipping. - """ - if extra_grad_process_fn: - return extra_grad_process_fn(self, optimizer, loss) - else: - return TorchPolicy.extra_grad_process(self, optimizer, loss) - - @override(TorchPolicy) - def extra_compute_grad_fetches(self): - if extra_learn_fetches_fn: - fetches = convert_to_non_torch_type( - extra_learn_fetches_fn(self)) - # Auto-add empty learner stats dict if needed. - return dict({LEARNER_STATS_KEY: {}}, **fetches) - else: - return TorchPolicy.extra_compute_grad_fetches(self) - - @override(TorchPolicy) - def apply_gradients(self, gradients): - if apply_gradients_fn: - apply_gradients_fn(self, gradients) - else: - TorchPolicy.apply_gradients(self, gradients) - - @override(TorchPolicy) - def extra_action_out(self, input_dict, state_batches, model, - action_dist): - with torch.no_grad(): - if extra_action_out_fn: - stats_dict = extra_action_out_fn( - self, input_dict, state_batches, model, action_dist) - else: - stats_dict = TorchPolicy.extra_action_out( - self, input_dict, state_batches, model, action_dist) - return convert_to_non_torch_type(stats_dict) - - @override(TorchPolicy) - def optimizer(self): - if optimizer_fn: - optimizers = optimizer_fn(self, self.config) - else: - optimizers = TorchPolicy.optimizer(self) - optimizers = force_list(optimizers) - if getattr(self, "exploration", None): - optimizers = self.exploration.get_exploration_optimizer( - optimizers) - return optimizers - - @override(TorchPolicy) - def extra_grad_info(self, train_batch): - with torch.no_grad(): - if stats_fn: - stats_dict = stats_fn(self, train_batch) - else: - stats_dict = TorchPolicy.extra_grad_info(self, train_batch) - return convert_to_non_torch_type(stats_dict) - - def with_updates(**overrides): - """Allows creating a TorchPolicy cls based on settings of another one. - - Keyword Args: - **overrides: The settings (passed into `build_torch_policy`) that - should be different from the class that this method is called - on. - - Returns: - type: A new TorchPolicy sub-class. - - Examples: - >> MySpecialDQNPolicyClass = DQNTorchPolicy.with_updates( - .. name="MySpecialDQNPolicyClass", - .. loss_function=[some_new_loss_function], - .. 
) - """ - return build_torch_policy(**dict(original_kwargs, **overrides)) - - policy_cls.with_updates = staticmethod(with_updates) - policy_cls.__name__ = name - policy_cls.__qualname__ = name - return policy_cls + deprecation_warning( + old="build_torch_policy", + new="build_policy_class(framework='torch')", + error=False) + kwargs = locals().copy() + # Set to torch and call new function. + kwargs["framework"] = "torch" + return build_policy_class(**kwargs) From 062da38a001c731881a70897f554d9e55fa6c9b0 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 18 Dec 2020 20:19:20 +0100 Subject: [PATCH 04/16] WIP --- rllib/agents/a3c/a3c_torch_policy.py | 5 +- rllib/agents/ars/ars_torch_policy.py | 5 +- rllib/agents/ddpg/ddpg_torch_model.py | 3 +- rllib/agents/ddpg/ddpg_torch_policy.py | 5 +- rllib/agents/dqn/dqn_torch_policy.py | 5 +- rllib/agents/dqn/simple_q_torch_policy.py | 5 +- rllib/agents/dreamer/dreamer_torch_policy.py | 9 +- rllib/agents/es/es_torch_policy.py | 5 +- rllib/agents/impala/vtrace_torch_policy.py | 5 +- rllib/agents/maml/maml_tf_policy.py | 2 +- rllib/agents/maml/maml_torch_policy.py | 5 +- rllib/agents/marwil/marwil_torch_policy.py | 5 +- rllib/agents/mbmpo/mbmpo_torch_policy.py | 5 +- rllib/agents/pg/pg_torch_policy.py | 5 +- rllib/agents/ppo/appo_torch_policy.py | 5 +- rllib/agents/qmix/qmix_policy.py | 2 +- rllib/agents/sac/sac_torch_model.py | 3 +- rllib/agents/sac/sac_torch_policy.py | 5 +- rllib/agents/slateq/slateq_torch_policy.py | 6 +- rllib/contrib/bandits/agents/policy.py | 5 +- rllib/examples/custom_torch_policy.py | 8 +- rllib/models/jax/fcnet.py | 22 +- rllib/models/jax/jax_modelv2.py | 3 + rllib/models/jax/misc.py | 74 ++-- rllib/models/jax/modules/fc_stack.py | 65 +-- rllib/models/tests/test_jax_models.py | 204 ++-------- rllib/models/tf/fcnet.py | 3 +- rllib/models/tf/layers/noisy_layer.py | 6 +- rllib/models/tf/visionnet.py | 4 +- rllib/models/torch/misc.py | 3 +- .../torch/modules/convtranspose2d_stack.py | 4 +- rllib/models/torch/modules/noisy_layer.py | 4 +- rllib/models/utils.py | 81 +++- rllib/policy/__init__.py | 2 +- rllib/policy/jax/__init__.py | 0 rllib/policy/jax/jax_policy_template.py | 370 ------------------ rllib/policy/{jax => }/jax_policy.py | 0 rllib/policy/policy_template.py | 2 +- rllib/policy/torch_policy_template.py | 10 +- rllib/utils/exploration/curiosity.py | 3 +- rllib/utils/framework.py | 15 +- 41 files changed, 279 insertions(+), 699 deletions(-) delete mode 100644 rllib/policy/jax/__init__.py delete mode 100644 rllib/policy/jax/jax_policy_template.py rename rllib/policy/{jax => }/jax_policy.py (100%) diff --git a/rllib/agents/a3c/a3c_torch_policy.py b/rllib/agents/a3c/a3c_torch_policy.py index a4d980bac730..5eb83eb5f46d 100644 --- a/rllib/agents/a3c/a3c_torch_policy.py +++ b/rllib/agents/a3c/a3c_torch_policy.py @@ -1,8 +1,8 @@ import ray from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing +from ray.rllib.policy.policy_template import build_policy_class from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.torch_policy_template import build_torch_policy from ray.rllib.utils.framework import try_import_torch torch, nn = try_import_torch() @@ -84,8 +84,9 @@ def _value(self, obs): return self.model.value_function()[0] -A3CTorchPolicy = build_torch_policy( +A3CTorchPolicy = build_policy_class( name="A3CTorchPolicy", + framework="torch", get_default_config=lambda: ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, loss_fn=actor_critic_loss, stats_fn=loss_and_entropy_stats, diff --git 
a/rllib/agents/ars/ars_torch_policy.py b/rllib/agents/ars/ars_torch_policy.py index 809b435b6765..7b7140a48932 100644 --- a/rllib/agents/ars/ars_torch_policy.py +++ b/rllib/agents/ars/ars_torch_policy.py @@ -4,10 +4,11 @@ import ray from ray.rllib.agents.es.es_torch_policy import after_init, before_init, \ make_model_and_action_dist -from ray.rllib.policy.torch_policy_template import build_torch_policy +from ray.rllib.policy.policy_template import build_policy_class -ARSTorchPolicy = build_torch_policy( +ARSTorchPolicy = build_policy_class( name="ARSTorchPolicy", + framework="torch", loss_fn=None, get_default_config=lambda: ray.rllib.agents.ars.ars.DEFAULT_CONFIG, before_init=before_init, diff --git a/rllib/agents/ddpg/ddpg_torch_model.py b/rllib/agents/ddpg/ddpg_torch_model.py index a24b949207a9..f3108c855771 100644 --- a/rllib/agents/ddpg/ddpg_torch_model.py +++ b/rllib/agents/ddpg/ddpg_torch_model.py @@ -2,7 +2,8 @@ from ray.rllib.models.torch.misc import SlimFC from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 -from ray.rllib.utils.framework import try_import_torch, get_activation_fn +from ray.rllib.models.utils import get_activation_fn +from ray.rllib.utils.framework import try_import_torch torch, nn = try_import_torch() diff --git a/rllib/agents/ddpg/ddpg_torch_policy.py b/rllib/agents/ddpg/ddpg_torch_policy.py index 39123783a421..79be4cce823c 100644 --- a/rllib/agents/ddpg/ddpg_torch_policy.py +++ b/rllib/agents/ddpg/ddpg_torch_policy.py @@ -7,8 +7,8 @@ from ray.rllib.agents.dqn.dqn_tf_policy import postprocess_nstep_and_prio, \ PRIO_WEIGHTS from ray.rllib.models.torch.torch_action_dist import TorchDeterministic +from ray.rllib.policy.policy_template import build_policy_class from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.torch_policy_template import build_torch_policy from ray.rllib.utils.framework import try_import_torch from ray.rllib.utils.torch_ops import huber_loss, l2_loss @@ -264,8 +264,9 @@ def setup_late_mixins(policy, obs_space, action_space, config): TargetNetworkMixin.__init__(policy) -DDPGTorchPolicy = build_torch_policy( +DDPGTorchPolicy = build_policy_class( name="DDPGTorchPolicy", + framework="torch", loss_fn=ddpg_actor_critic_loss, get_default_config=lambda: ray.rllib.agents.ddpg.ddpg.DEFAULT_CONFIG, stats_fn=build_ddpg_stats, diff --git a/rllib/agents/dqn/dqn_torch_policy.py b/rllib/agents/dqn/dqn_torch_policy.py index 9b9d535d95b4..1ed468e1d883 100644 --- a/rllib/agents/dqn/dqn_torch_policy.py +++ b/rllib/agents/dqn/dqn_torch_policy.py @@ -14,9 +14,9 @@ from ray.rllib.models.torch.torch_action_dist import (TorchCategorical, TorchDistributionWrapper) from ray.rllib.policy.policy import Policy +from ray.rllib.policy.policy_template import build_policy_class from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.policy.torch_policy import LearningRateSchedule -from ray.rllib.policy.torch_policy_template import build_torch_policy from ray.rllib.utils.error import UnsupportedSpaceException from ray.rllib.utils.exploration.parameter_noise import ParameterNoise from ray.rllib.utils.framework import try_import_torch @@ -384,8 +384,9 @@ def extra_action_out_fn(policy: Policy, input_dict, state_batches, model, return {"q_values": policy.q_values} -DQNTorchPolicy = build_torch_policy( +DQNTorchPolicy = build_policy_class( name="DQNTorchPolicy", + framework="torch", loss_fn=build_q_losses, get_default_config=lambda: ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG, make_model_and_action_dist=build_q_model_and_distribution, diff 
--git a/rllib/agents/dqn/simple_q_torch_policy.py b/rllib/agents/dqn/simple_q_torch_policy.py index b9ec0f0c41f2..9862f82b7974 100644 --- a/rllib/agents/dqn/simple_q_torch_policy.py +++ b/rllib/agents/dqn/simple_q_torch_policy.py @@ -11,8 +11,8 @@ from ray.rllib.models.torch.torch_action_dist import TorchCategorical, \ TorchDistributionWrapper from ray.rllib.policy import Policy +from ray.rllib.policy.policy_template import build_policy_class from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.torch_policy_template import build_torch_policy from ray.rllib.utils.framework import try_import_torch from ray.rllib.utils.torch_ops import huber_loss from ray.rllib.utils.typing import TensorType, TrainerConfigDict @@ -127,8 +127,9 @@ def setup_late_mixins(policy: Policy, obs_space: gym.spaces.Space, TargetNetworkMixin.__init__(policy, obs_space, action_space, config) -SimpleQTorchPolicy = build_torch_policy( +SimpleQTorchPolicy = build_policy_class( name="SimpleQPolicy", + framework="torch", loss_fn=build_q_losses, get_default_config=lambda: ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG, extra_action_out_fn=extra_action_out_fn, diff --git a/rllib/agents/dreamer/dreamer_torch_policy.py b/rllib/agents/dreamer/dreamer_torch_policy.py index f9abd10c871a..d23ad9c3088d 100644 --- a/rllib/agents/dreamer/dreamer_torch_policy.py +++ b/rllib/agents/dreamer/dreamer_torch_policy.py @@ -1,11 +1,11 @@ import logging import ray -from ray.rllib.policy.torch_policy_template import build_torch_policy from ray.rllib.agents.a3c.a3c_torch_policy import apply_grad_clipping -from ray.rllib.utils.framework import try_import_torch -from ray.rllib.models.catalog import ModelCatalog from ray.rllib.agents.dreamer.utils import FreezeParameters +from ray.rllib.models.catalog import ModelCatalog +from ray.rllib.policy.policy_template import build_policy_class +from ray.rllib.utils.framework import try_import_torch torch, nn = try_import_torch() if torch: @@ -236,8 +236,9 @@ def dreamer_optimizer_fn(policy, config): return (model_opt, actor_opt, critic_opt) -DreamerTorchPolicy = build_torch_policy( +DreamerTorchPolicy = build_policy_class( name="DreamerTorchPolicy", + framework="torch", get_default_config=lambda: ray.rllib.agents.dreamer.dreamer.DEFAULT_CONFIG, action_sampler_fn=action_sampler_fn, loss_fn=dreamer_loss, diff --git a/rllib/agents/es/es_torch_policy.py b/rllib/agents/es/es_torch_policy.py index 6f7e374c9873..444735e0b090 100644 --- a/rllib/agents/es/es_torch_policy.py +++ b/rllib/agents/es/es_torch_policy.py @@ -7,8 +7,8 @@ import ray from ray.rllib.models import ModelCatalog +from ray.rllib.policy.policy_template import build_policy_class from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.torch_policy_template import build_torch_policy from ray.rllib.utils.filter import get_filter from ray.rllib.utils.framework import try_import_torch from ray.rllib.utils.spaces.space_utils import get_base_struct_from_space, \ @@ -126,8 +126,9 @@ def make_model_and_action_dist(policy, observation_space, action_space, return model, dist_class -ESTorchPolicy = build_torch_policy( +ESTorchPolicy = build_policy_class( name="ESTorchPolicy", + framework="torch", loss_fn=None, get_default_config=lambda: ray.rllib.agents.es.es.DEFAULT_CONFIG, before_init=before_init, diff --git a/rllib/agents/impala/vtrace_torch_policy.py b/rllib/agents/impala/vtrace_torch_policy.py index a42707e4d40f..5e0990008f77 100644 --- a/rllib/agents/impala/vtrace_torch_policy.py +++ 
b/rllib/agents/impala/vtrace_torch_policy.py @@ -6,10 +6,10 @@ from ray.rllib.agents.a3c.a3c_torch_policy import apply_grad_clipping import ray.rllib.agents.impala.vtrace_torch as vtrace from ray.rllib.models.torch.torch_action_dist import TorchCategorical +from ray.rllib.policy.policy_template import build_policy_class from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.policy.torch_policy import LearningRateSchedule, \ EntropyCoeffSchedule -from ray.rllib.policy.torch_policy_template import build_torch_policy from ray.rllib.utils.framework import try_import_torch from ray.rllib.utils.torch_ops import explained_variance, global_norm, \ sequence_mask @@ -260,8 +260,9 @@ def setup_mixins(policy, obs_space, action_space, config): LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"]) -VTraceTorchPolicy = build_torch_policy( +VTraceTorchPolicy = build_policy_class( name="VTraceTorchPolicy", + framework="torch", loss_fn=build_vtrace_loss, get_default_config=lambda: ray.rllib.agents.impala.impala.DEFAULT_CONFIG, stats_fn=stats, diff --git a/rllib/agents/maml/maml_tf_policy.py b/rllib/agents/maml/maml_tf_policy.py index 7aff4c426575..9d33d444e422 100644 --- a/rllib/agents/maml/maml_tf_policy.py +++ b/rllib/agents/maml/maml_tf_policy.py @@ -2,13 +2,13 @@ import ray from ray.rllib.evaluation.postprocessing import Postprocessing +from ray.rllib.models.utils import get_activation_fn from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.policy.tf_policy_template import build_tf_policy from ray.rllib.utils import try_import_tf from ray.rllib.agents.ppo.ppo_tf_policy import postprocess_ppo_gae, \ vf_preds_fetches, compute_and_clip_gradients, setup_config, \ ValueNetworkMixin -from ray.rllib.utils.framework import get_activation_fn tf1, tf, tfv = try_import_tf() diff --git a/rllib/agents/maml/maml_torch_policy.py b/rllib/agents/maml/maml_torch_policy.py index 182ac8c25af3..478d95ba65ab 100644 --- a/rllib/agents/maml/maml_torch_policy.py +++ b/rllib/agents/maml/maml_torch_policy.py @@ -2,8 +2,8 @@ import ray from ray.rllib.evaluation.postprocessing import Postprocessing +from ray.rllib.policy.policy_template import build_policy_class from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.torch_policy_template import build_torch_policy from ray.rllib.agents.ppo.ppo_tf_policy import postprocess_ppo_gae, \ setup_config from ray.rllib.agents.ppo.ppo_torch_policy import vf_preds_fetches, \ @@ -347,8 +347,9 @@ def setup_mixins(policy, obs_space, action_space, config): KLCoeffMixin.__init__(policy, config) -MAMLTorchPolicy = build_torch_policy( +MAMLTorchPolicy = build_policy_class( name="MAMLTorchPolicy", + framework="torch", get_default_config=lambda: ray.rllib.agents.maml.maml.DEFAULT_CONFIG, loss_fn=maml_loss, stats_fn=maml_stats, diff --git a/rllib/agents/marwil/marwil_torch_policy.py b/rllib/agents/marwil/marwil_torch_policy.py index e88e5e312f40..a64194abf953 100644 --- a/rllib/agents/marwil/marwil_torch_policy.py +++ b/rllib/agents/marwil/marwil_torch_policy.py @@ -1,8 +1,8 @@ import ray from ray.rllib.agents.marwil.marwil_tf_policy import postprocess_advantages from ray.rllib.evaluation.postprocessing import Postprocessing +from ray.rllib.policy.policy_template import build_policy_class from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.torch_policy_template import build_torch_policy from ray.rllib.utils.framework import try_import_torch from ray.rllib.utils.torch_ops import explained_variance @@ -75,8 
+75,9 @@ def setup_mixins(policy, obs_space, action_space, config): ValueNetworkMixin.__init__(policy) -MARWILTorchPolicy = build_torch_policy( +MARWILTorchPolicy = build_policy_class( name="MARWILTorchPolicy", + framework="torch", loss_fn=marwil_loss, get_default_config=lambda: ray.rllib.agents.marwil.marwil.DEFAULT_CONFIG, stats_fn=stats, diff --git a/rllib/agents/mbmpo/mbmpo_torch_policy.py b/rllib/agents/mbmpo/mbmpo_torch_policy.py index a4682ba81fe7..f43d06ebec5a 100644 --- a/rllib/agents/mbmpo/mbmpo_torch_policy.py +++ b/rllib/agents/mbmpo/mbmpo_torch_policy.py @@ -13,7 +13,7 @@ from ray.rllib.models.modelv2 import ModelV2 from ray.rllib.models.torch.torch_action_dist import TorchDistributionWrapper from ray.rllib.policy.policy import Policy -from ray.rllib.policy.torch_policy_template import build_torch_policy +from ray.rllib.policy.policy_template import build_policy_class from ray.rllib.utils.framework import try_import_torch from ray.rllib.utils.typing import TrainerConfigDict @@ -76,8 +76,9 @@ def make_model_and_action_dist( # Build a child class of `TorchPolicy`, given the custom functions defined # above. -MBMPOTorchPolicy = build_torch_policy( +MBMPOTorchPolicy = build_policy_class( name="MBMPOTorchPolicy", + framework="torch", get_default_config=lambda: ray.rllib.agents.mbmpo.mbmpo.DEFAULT_CONFIG, make_model_and_action_dist=make_model_and_action_dist, loss_fn=maml_loss, diff --git a/rllib/agents/pg/pg_torch_policy.py b/rllib/agents/pg/pg_torch_policy.py index be65f9e91c84..d707f01f2364 100644 --- a/rllib/agents/pg/pg_torch_policy.py +++ b/rllib/agents/pg/pg_torch_policy.py @@ -10,8 +10,8 @@ from ray.rllib.models.torch.torch_action_dist import TorchDistributionWrapper from ray.rllib.models.modelv2 import ModelV2 from ray.rllib.policy import Policy +from ray.rllib.policy.policy_template import build_policy_class from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.torch_policy_template import build_torch_policy from ray.rllib.utils.framework import try_import_torch from ray.rllib.utils.typing import TensorType @@ -72,8 +72,9 @@ def pg_loss_stats(policy: Policy, # Build a child class of `TFPolicy`, given the extra options: # - trajectory post-processing function (to calculate advantages) # - PG loss function -PGTorchPolicy = build_torch_policy( +PGTorchPolicy = build_policy_class( name="PGTorchPolicy", + framework="torch", get_default_config=lambda: ray.rllib.agents.pg.pg.DEFAULT_CONFIG, loss_fn=pg_torch_loss, stats_fn=pg_loss_stats, diff --git a/rllib/agents/ppo/appo_torch_policy.py b/rllib/agents/ppo/appo_torch_policy.py index f24dfc6d1b54..461886dbec2b 100644 --- a/rllib/agents/ppo/appo_torch_policy.py +++ b/rllib/agents/ppo/appo_torch_policy.py @@ -23,9 +23,9 @@ from ray.rllib.models.torch.torch_action_dist import \ TorchDistributionWrapper, TorchCategorical from ray.rllib.policy.policy import Policy +from ray.rllib.policy.policy_template import build_policy_class from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.policy.torch_policy import LearningRateSchedule -from ray.rllib.policy.torch_policy_template import build_torch_policy from ray.rllib.utils.framework import try_import_torch from ray.rllib.utils.torch_ops import explained_variance, global_norm, \ sequence_mask @@ -322,8 +322,9 @@ def setup_late_mixins(policy: Policy, obs_space: gym.spaces.Space, # Build a child class of `TorchPolicy`, given the custom functions defined # above. 
-AsyncPPOTorchPolicy = build_torch_policy( +AsyncPPOTorchPolicy = build_policy_class( name="AsyncPPOTorchPolicy", + framework="torch", loss_fn=appo_surrogate_loss, stats_fn=stats, postprocess_fn=postprocess_trajectory, diff --git a/rllib/agents/qmix/qmix_policy.py b/rllib/agents/qmix/qmix_policy.py index 6b533378ac8a..539efa2c5e35 100644 --- a/rllib/agents/qmix/qmix_policy.py +++ b/rllib/agents/qmix/qmix_policy.py @@ -143,7 +143,7 @@ def forward(self, return loss, mask, masked_td_error, chosen_action_qvals, targets -# TODO(sven): Make this a TorchPolicy child via `build_torch_policy`. +# TODO(sven): Make this a TorchPolicy child via `build_policy_class`. class QMixTorchPolicy(Policy): """QMix impl. Assumes homogeneous agents for now. diff --git a/rllib/agents/sac/sac_torch_model.py b/rllib/agents/sac/sac_torch_model.py index 9ebb8c75f99d..5f8b05980fed 100644 --- a/rllib/agents/sac/sac_torch_model.py +++ b/rllib/agents/sac/sac_torch_model.py @@ -5,7 +5,8 @@ from ray.rllib.models.torch.misc import SlimFC from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 -from ray.rllib.utils.framework import get_activation_fn, try_import_torch +from ray.rllib.models.utils import get_activation_fn +from ray.rllib.utils.framework import try_import_torch from ray.rllib.utils.spaces.simplex import Simplex from ray.rllib.utils.typing import ModelConfigDict, TensorType diff --git a/rllib/agents/sac/sac_torch_policy.py b/rllib/agents/sac/sac_torch_policy.py index a1a8f996bc23..d1d53697ba2f 100644 --- a/rllib/agents/sac/sac_torch_policy.py +++ b/rllib/agents/sac/sac_torch_policy.py @@ -17,8 +17,8 @@ from ray.rllib.models.torch.torch_action_dist import \ TorchDistributionWrapper, TorchDirichlet from ray.rllib.policy.policy import Policy +from ray.rllib.policy.policy_template import build_policy_class from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.torch_policy_template import build_torch_policy from ray.rllib.models.torch.torch_action_dist import ( TorchCategorical, TorchSquashedGaussian, TorchDiagGaussian, TorchBeta) from ray.rllib.utils.framework import try_import_torch @@ -480,8 +480,9 @@ def setup_late_mixins(policy: Policy, obs_space: gym.spaces.Space, # Build a child class of `TorchPolicy`, given the custom functions defined # above. 
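# Illustrative sketch, not part of the diff: the pattern applied throughout
# these hunks is that `build_torch_policy(...)` becomes
# `build_policy_class(..., framework="torch")`. The same template is intended
# to cover other frameworks as well; e.g. (policy and loss names below are
# hypothetical):
#
#   MyTorchPolicy = build_policy_class(
#       name="MyTorchPolicy", framework="torch", loss_fn=my_loss_fn)
#   MyJAXPolicy = build_policy_class(
#       name="MyJAXPolicy", framework="jax", loss_fn=my_loss_fn)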
-SACTorchPolicy = build_torch_policy( +SACTorchPolicy = build_policy_class( name="SACTorchPolicy", + framework="torch", loss_fn=actor_critic_loss, get_default_config=lambda: ray.rllib.agents.sac.sac.DEFAULT_CONFIG, stats_fn=stats, diff --git a/rllib/agents/slateq/slateq_torch_policy.py b/rllib/agents/slateq/slateq_torch_policy.py index 0afb7cb12031..d6bb0af67983 100644 --- a/rllib/agents/slateq/slateq_torch_policy.py +++ b/rllib/agents/slateq/slateq_torch_policy.py @@ -11,8 +11,8 @@ TorchDistributionWrapper) from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 from ray.rllib.policy.policy import Policy +from ray.rllib.policy.policy_template import build_policy_class from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.torch_policy_template import build_torch_policy from ray.rllib.utils.framework import try_import_torch from ray.rllib.utils.typing import (ModelConfigDict, TensorType, TrainerConfigDict) @@ -403,8 +403,10 @@ def postprocess_fn_add_next_actions_for_sarsa(policy: Policy, return batch -SlateQTorchPolicy = build_torch_policy( +SlateQTorchPolicy = build_policy_class( name="SlateQTorchPolicy", + framework="torch", + get_default_config=lambda: ray.rllib.agents.slateq.slateq.DEFAULT_CONFIG, # build model, loss functions, and optimizers diff --git a/rllib/contrib/bandits/agents/policy.py b/rllib/contrib/bandits/agents/policy.py index 2a9b50137381..e47c91005232 100644 --- a/rllib/contrib/bandits/agents/policy.py +++ b/rllib/contrib/bandits/agents/policy.py @@ -10,9 +10,9 @@ from ray.rllib.models.catalog import ModelCatalog from ray.rllib.models.modelv2 import restore_original_dimensions from ray.rllib.policy.policy import LEARNER_STATS_KEY +from ray.rllib.policy.policy_template import build_policy_class from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.policy.torch_policy import TorchPolicy -from ray.rllib.policy.torch_policy_template import build_torch_policy from ray.rllib.utils.annotations import override from ray.util.debug import log_once @@ -109,8 +109,9 @@ def init_cum_regret(policy, *args): policy.regrets = [] -BanditPolicy = build_torch_policy( +BanditPolicy = build_policy_class( name="BanditPolicy", + framework="torch", get_default_config=lambda: DEFAULT_CONFIG, loss_fn=None, after_init=init_cum_regret, diff --git a/rllib/examples/custom_torch_policy.py b/rllib/examples/custom_torch_policy.py index 1cea6aa1cf51..7e2937c11060 100644 --- a/rllib/examples/custom_torch_policy.py +++ b/rllib/examples/custom_torch_policy.py @@ -4,8 +4,8 @@ import ray from ray import tune from ray.rllib.agents.trainer_template import build_trainer +from ray.rllib.policy.policy_template import build_policy_class from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.torch_policy_template import build_torch_policy parser = argparse.ArgumentParser() parser.add_argument("--stop-iters", type=int, default=200) @@ -20,8 +20,10 @@ def policy_gradient_loss(policy, model, dist_class, train_batch): # -MyTorchPolicy = build_torch_policy( - name="MyTorchPolicy", loss_fn=policy_gradient_loss) +MyTorchPolicy = build_policy_class( + name="MyTorchPolicy", + framework="torch", + loss_fn=policy_gradient_loss) # MyTrainer = build_trainer( diff --git a/rllib/models/jax/fcnet.py b/rllib/models/jax/fcnet.py index b332bf7f57a5..a9789d67472c 100644 --- a/rllib/models/jax/fcnet.py +++ b/rllib/models/jax/fcnet.py @@ -3,7 +3,7 @@ import time from ray.rllib.models.jax.jax_modelv2 import JAXModelV2 -from ray.rllib.models.jax.modules import FCStack +from 
ray.rllib.models.jax.modules.fc_stack import FCStack from ray.rllib.utils.annotations import override from ray.rllib.utils.framework import try_import_jax @@ -20,8 +20,6 @@ def __init__(self, obs_space, action_space, num_outputs, model_config, super().__init__(obs_space, action_space, num_outputs, model_config, name) - self.key = jax.random.PRNGKey(int(time.time())) - activation = model_config.get("fcnet_activation") hiddens = model_config.get("fcnet_hiddens", []) no_final_linear = model_config.get("no_final_linear") @@ -41,23 +39,31 @@ def __init__(self, obs_space, action_space, num_outputs, model_config, in_features=in_features, layers=hiddens + [num_outputs], activation=activation, + prng_key=self.prng_key, ) # Finish the layers with the provided sizes (`hiddens`), plus - # iff num_outputs > 0 - a last linear layer of size num_outputs. else: + prev_layer_size = in_features if len(hiddens) > 0: self._hidden_layers = FCStack( - in_features=in_features, + in_features=prev_layer_size, layers=hiddens, - activation=activation + activation=activation, + prng_key=self.prng_key, ) + #TODO + import jax.numpy as jnp + in_ = jnp.zeros((10, in_features)) + vars = self._hidden_layers.init(self.prng_key, in_) prev_layer_size = hiddens[-1] if num_outputs: self._logits = FCStack( in_features=prev_layer_size, layers=[num_outputs], activation=None, + prng_key=self.prng_key, ) else: self.num_outputs = ( @@ -70,9 +76,13 @@ def __init__(self, obs_space, action_space, num_outputs, model_config, in_features=int(np.product(obs_space.shape)), layers=hiddens, activation=activation, + prng_key=self.prng_key, ) self._value_branch = FCStack( - in_features=prev_layer_size, layers=[1]) + in_features=prev_layer_size, + layers=[1], + prng_key=self.prng_key, + ) # Holds the current "base" output (before logits layer). self._features = None # Holds the last input, in case value branch is separate. diff --git a/rllib/models/jax/jax_modelv2.py b/rllib/models/jax/jax_modelv2.py index c6cd9643c40d..96ef98b818ec 100644 --- a/rllib/models/jax/jax_modelv2.py +++ b/rllib/models/jax/jax_modelv2.py @@ -1,4 +1,5 @@ import gym +import time from typing import Dict, List, Union from ray.rllib.models.modelv2 import ModelV2 @@ -36,6 +37,8 @@ def __init__(self, obs_space: gym.spaces.Space, name, framework="jax") + self.prng_key = jax.random.PRNGKey(int(time.time())) + @PublicAPI @override(ModelV2) def variables(self, as_dict: bool = False diff --git a/rllib/models/jax/misc.py b/rllib/models/jax/misc.py index 106bf542e87f..09415f7a0747 100644 --- a/rllib/models/jax/misc.py +++ b/rllib/models/jax/misc.py @@ -1,7 +1,8 @@ import time -from typing import Callable, Optional +from typing import Callable, Optional, Union -from ray.rllib.utils.framework import get_activation_fn, try_import_jax +from ray.rllib.models.utils import get_activation_fn, get_initializer +from ray.rllib.utils.framework import try_import_jax jax, flax = try_import_jax() nn = jnp = None @@ -11,59 +12,50 @@ class SlimFC(nn.Module if nn else object): - """Simple JAX version of a fully connected layer.""" + """Simple JAX version of a fully connected layer. - def __init__(self, - in_size, - out_size, - initializer: Optional[Callable] = None, - activation_fn: Optional[str] = None, - use_bias: bool = True, - prng_key: Optional[jax.random.PRNGKey] = None, - name: Optional[str] = None): - """Initializes a SlimFC instance. + Properties: + in_size (int): The input size of the input data that will be passed + into this layer. + out_size (int): The number of nodes in this FC layer. 
+ initializer (flax.: + activation (str): An activation string specifier, e.g. "relu". + use_bias (bool): Whether to add biases to the dot product or not. + #bias_init (float): + prng_key (Optional[jax.random.PRNGKey]): An optional PRNG key to + use for initialization. If None, create a new random one. + """ - Args: - in_size (int): The input size of the input data that will be passed - into this layer. - out_size (int): The number of nodes in this FC layer. - initializer (flax.: - activation_fn (str): An activation string specifier, e.g. "relu". - use_bias (bool): Whether to add biases to the dot product or not. - #bias_init (float): - prng_key (Optional[jax.random.PRNGKey]): An optional PRNG key to - use for initialization. If None, create a new random one. - name (Optional[str]): An optional name for this layer. - """ - self.in_size = in_size - self.out_size = out_size - self.use_bias = use_bias - self.name = name + in_size: int + out_size: int + initializer: Optional[Union[Callable, str]] = None + activation: Optional[Union[Callable, str]] = None + use_bias: bool = True + prng_key: Optional[jax.random.PRNGKey] = None + def setup(self): # By default, use Glorot unform initializer. - if initializer is None: - initializer = flax.nn.initializers.xavier_uniform() - self.initializer = initializer + if self.initializer is None: + self.initializer = "xavier_uniform" - self.prng_key = prng_key or jax.random.PRNGKey(int(time.time())) - _, self.prng_key = jax.random.split(self.prng_key) + if self.prng_key is None: + self.prng_key = jax.random.PRNGKey(int(time.time())) + #_, self.prng_key = jax.random.split(self.prng_key) # Activation function (if any; default=None (linear)). - self.activation_fn = get_activation_fn(activation_fn, "jax") + self.initializer_fn = get_initializer(self.initializer, framework="jax") + self.activation_fn = get_activation_fn(self.activation, framework="jax") - def setup(self): # Create the flax dense layer. self._dense = nn.Dense( self.out_size, use_bias=self.use_bias, - kernel_init=self.initializer, - name=self.name, + kernel_init=self.initializer_fn, ) # Initialize it. - dummy_in = jax.random.normal( - self.prng_key, (self.in_size, ), dtype=jnp.float32) - _, self.prng_key = jax.random.split(self.prng_key) - self._params = self._dense.init(self.prng_key, dummy_in) + in_ = jnp.zeros((self.in_size, )) + #_, self.prng_key = jax.random.split(self.prng_key) + self._params = self._dense.init(self.prng_key, in_) def __call__(self, x): out = self._dense.apply(self._params, x) diff --git a/rllib/models/jax/modules/fc_stack.py b/rllib/models/jax/modules/fc_stack.py index 54ae9deab730..59ca352a732f 100644 --- a/rllib/models/jax/modules/fc_stack.py +++ b/rllib/models/jax/modules/fc_stack.py @@ -1,8 +1,10 @@ import logging import numpy as np import time +from typing import Callable, Optional, Union -from ray.rllib.models.jax.misc import get_activation_fn, SlimFC +from ray.rllib.models.jax.misc import SlimFC +from ray.rllib.models.utils import get_activation_fn, get_initializer from ray.rllib.utils.framework import try_import_jax jax, flax = try_import_jax() @@ -14,36 +16,41 @@ class FCStack(nn.Module if nn else object): - """Generic fully connected FLAX module.""" - - def __init__(self, in_features, layers, activation=None, prng_key=None): - """Initializes a FCStack instance. - - Args: - in_features (int): Number of input features (input dim). - layers (List[int]): List of Dense layer sizes. 
- activation (Optional[Union[Callable, str]]): An optional activation - function or activation function specifier (str), such as - "relu". Use None or "linear" for no activation. - """ - super().__init__() - - self.prng_key = prng_key or jax.random.PRNGKey(int(time.time())) - activation_fn = get_activation_fn(activation, framework="jax") + """Generic fully connected FLAX module. + + Properties: + in_features (int): Number of input features (input dim). + layers (List[int]): List of Dense layer sizes. + activation (Optional[Union[Callable, str]]): An optional activation + function or activation function specifier (str), such as + "relu". Use None or "linear" for no activation. + initializer (): + """ + + in_features: int + layers: [] + activation: Optional[Union[Callable, str]] = None + initializer: Optional[Union[Callable, str]] = None + use_bias: bool = True + prng_key: Optional[jax.random.PRNGKey] = jax.random.PRNGKey(int(time.time())) + + def setup(self): + self.initializer = get_initializer(self.initializer, framework="jax") # Create all layers. - self._layers = [] - prev_layer_size = in_features - for size in layers: - self._hidden_layers.append( - SlimFC( - in_size=prev_layer_size, - out_size=size, - use_bias=self.use_bias, - initializer=self.initializer, - activation_fn=activation_fn, - prng_key=self.prng_key, - )) + self._hidden_layers = [] + prev_layer_size = self.in_features + for i, size in enumerate(self.layers): + slim_fc = SlimFC( + in_size=prev_layer_size, + out_size=size, + use_bias=self.use_bias, + initializer=self.initializer, + activation=self.activation, + prng_key=self.prng_key, + ) + setattr(self, "fc_{}".format(i), slim_fc) + self._hidden_layers.append(slim_fc) prev_layer_size = size def __call__(self, inputs): diff --git a/rllib/models/tests/test_jax_models.py b/rllib/models/tests/test_jax_models.py index c5179714533c..255ff4b6493d 100644 --- a/rllib/models/tests/test_jax_models.py +++ b/rllib/models/tests/test_jax_models.py @@ -1,193 +1,35 @@ -from functools import partial -import numpy as np -from gym.spaces import Box, Dict, Tuple -from scipy.stats import beta, norm -import tree +from gym.spaces import Box, Discrete import unittest -from ray.rllib.models.jax.jax_action_dist import JAXCategorical -from ray.rllib.models.tf.tf_action_dist import Beta, Categorical, \ - DiagGaussian, GumbelSoftmax, MultiActionDistribution, MultiCategorical, \ - SquashedGaussian -from ray.rllib.models.torch.torch_action_dist import TorchBeta, \ - TorchCategorical, TorchDiagGaussian, TorchMultiActionDistribution, \ - TorchMultiCategorical, TorchSquashedGaussian -from ray.rllib.utils.framework import try_import_tf, try_import_torch -from ray.rllib.utils.numpy import MIN_LOG_NN_OUTPUT, MAX_LOG_NN_OUTPUT, \ - softmax, SMALL_NUMBER, LARGE_INTEGER -from ray.rllib.utils.test_utils import check, framework_iterator +from ray.rllib.models.jax.fcnet import FullyConnectedNetwork +from ray.rllib.utils.framework import try_import_jax -tf1, tf, tfv = try_import_tf() -torch, _ = try_import_torch() + +jax, flax = try_import_jax() +jnp = None +if jax: + import jax.numpy as jnp class TestJAXModels(unittest.TestCase): def test_jax_fcnet(self): - """Tests the MultiActionDistribution (across all frameworks).""" + """Tests the JAX FCNet class.""" batch_size = 1000 - input_space = Tuple([ - Box(-10.0, 10.0, shape=(batch_size, 4)), - Box(-2.0, 2.0, shape=( - batch_size, - 6, - )), - Dict({ - "a": Box(-1.0, 1.0, shape=(batch_size, 4)) - }), - ]) - std_space = Box( - -0.05, 0.05, shape=( - batch_size, - 3, - )) 
- - low, high = -1.0, 1.0 - value_space = Tuple([ - Box(0, 3, shape=(batch_size, ), dtype=np.int32), - Box(-2.0, 2.0, shape=(batch_size, 3), dtype=np.float32), - Dict({ - "a": Box(0.0, 1.0, shape=(batch_size, 2), dtype=np.float32) - }) - ]) - - for fw, sess in framework_iterator(session=True): - if fw == "torch": - cls = TorchMultiActionDistribution - child_distr_cls = [ - TorchCategorical, TorchDiagGaussian, - partial(TorchBeta, low=low, high=high) - ] - else: - cls = MultiActionDistribution - child_distr_cls = [ - Categorical, - DiagGaussian, - partial(Beta, low=low, high=high), - ] - - inputs = list(input_space.sample()) - distr = cls( - np.concatenate([inputs[0], inputs[1], inputs[2]["a"]], axis=1), - model={}, - action_space=value_space, - child_distributions=child_distr_cls, - input_lens=[4, 6, 4]) - - # Adjust inputs for the Beta distr just as Beta itself does. - inputs[2]["a"] = np.clip(inputs[2]["a"], np.log(SMALL_NUMBER), - -np.log(SMALL_NUMBER)) - inputs[2]["a"] = np.log(np.exp(inputs[2]["a"]) + 1.0) + 1.0 - # Sample deterministically. - expected_det = [ - np.argmax(inputs[0], axis=-1), - inputs[1][:, :3], # [:3]=Mean values. - # Mean for a Beta distribution: - # 1 / [1 + (beta/alpha)] * range + low - (1.0 / (1.0 + inputs[2]["a"][:, 2:] / inputs[2]["a"][:, 0:2])) - * (high - low) + low, - ] - out = distr.deterministic_sample() - if sess: - out = sess.run(out) - check(out[0], expected_det[0]) - check(out[1], expected_det[1]) - check(out[2]["a"], expected_det[2]) - - # Stochastic sampling -> expect roughly the mean. - inputs = list(input_space.sample()) - # Fix categorical inputs (not needed for distribution itself, but - # for our expectation calculations). - inputs[0] = softmax(inputs[0], -1) - # Fix std inputs (shouldn't be too large for this test). - inputs[1][:, 3:] = std_space.sample() - # Adjust inputs for the Beta distr just as Beta itself does. - inputs[2]["a"] = np.clip(inputs[2]["a"], np.log(SMALL_NUMBER), - -np.log(SMALL_NUMBER)) - inputs[2]["a"] = np.log(np.exp(inputs[2]["a"]) + 1.0) + 1.0 - distr = cls( - np.concatenate([inputs[0], inputs[1], inputs[2]["a"]], axis=1), - model={}, - action_space=value_space, - child_distributions=child_distr_cls, - input_lens=[4, 6, 4]) - expected_mean = [ - np.mean(np.sum(inputs[0] * np.array([0, 1, 2, 3]), -1)), - inputs[1][:, :3], # [:3]=Mean values. - # Mean for a Beta distribution: - # 1 / [1 + (beta/alpha)] * range + low - (1.0 / (1.0 + inputs[2]["a"][:, 2:] / inputs[2]["a"][:, :2])) * - (high - low) + low, - ] - out = distr.sample() - if sess: - out = sess.run(out) - out = list(out) - if fw == "torch": - out[0] = out[0].numpy() - out[1] = out[1].numpy() - out[2]["a"] = out[2]["a"].numpy() - check(np.mean(out[0]), expected_mean[0], decimals=1) - check(np.mean(out[1], 0), np.mean(expected_mean[1], 0), decimals=1) - check( - np.mean(out[2]["a"], 0), - np.mean(expected_mean[2], 0), - decimals=1) - - # Test log-likelihood outputs. - # Make sure beta-values are within 0.0 and 1.0 for the numpy - # calculation (which doesn't have scaling). - inputs = list(input_space.sample()) - # Adjust inputs for the Beta distr just as Beta itself does. 
- inputs[2]["a"] = np.clip(inputs[2]["a"], np.log(SMALL_NUMBER), - -np.log(SMALL_NUMBER)) - inputs[2]["a"] = np.log(np.exp(inputs[2]["a"]) + 1.0) + 1.0 - distr = cls( - np.concatenate([inputs[0], inputs[1], inputs[2]["a"]], axis=1), - model={}, - action_space=value_space, - child_distributions=child_distr_cls, - input_lens=[4, 6, 4]) - inputs[0] = softmax(inputs[0], -1) - values = list(value_space.sample()) - log_prob_beta = np.log( - beta.pdf(values[2]["a"], inputs[2]["a"][:, :2], - inputs[2]["a"][:, 2:])) - # Now do the up-scaling for [2] (beta values) to be between - # low/high. - values[2]["a"] = values[2]["a"] * (high - low) + low - inputs[1][:, 3:] = np.exp(inputs[1][:, 3:]) - expected_log_llh = np.sum( - np.concatenate([ - np.expand_dims( - np.log( - [i[values[0][j]] - for j, i in enumerate(inputs[0])]), -1), - np.log( - norm.pdf(values[1], inputs[1][:, :3], - inputs[1][:, 3:])), log_prob_beta - ], -1), -1) - - values[0] = np.expand_dims(values[0], -1) - if fw == "torch": - values = tree.map_structure(lambda s: torch.Tensor(s), values) - # Test all flattened input. - concat = np.concatenate(tree.flatten(values), - -1).astype(np.float32) - out = distr.logp(concat) - if sess: - out = sess.run(out) - check(out, expected_log_llh, atol=15) - # Test structured input. - out = distr.logp(values) - if sess: - out = sess.run(out) - check(out, expected_log_llh, atol=15) - # Test flattened input. - out = distr.logp(tree.flatten(values)) - if sess: - out = sess.run(out) - check(out, expected_log_llh, atol=15) + obs_space = Box(-10.0, 10.0, shape=(4, )) + action_space = Discrete(2) + fc_net = FullyConnectedNetwork( + obs_space, + action_space, + num_outputs=2, + model_config={ + "fcnet_hiddens": [10], + "fcnet_activation": "relu", + }, + name="jax_model" + ) + inputs = jnp.array([obs_space.sample()]) + print(fc_net({"obs": inputs})) if __name__ == "__main__": diff --git a/rllib/models/tf/fcnet.py b/rllib/models/tf/fcnet.py index 0f72c546bc15..e556741ddd22 100644 --- a/rllib/models/tf/fcnet.py +++ b/rllib/models/tf/fcnet.py @@ -3,7 +3,8 @@ from ray.rllib.models.tf.misc import normc_initializer from ray.rllib.models.tf.tf_modelv2 import TFModelV2 -from ray.rllib.utils.framework import get_activation_fn, try_import_tf +from ray.rllib.models.utils import get_activation_fn +from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.typing import Dict, TensorType, List, ModelConfigDict tf1, tf, tfv = try_import_tf() diff --git a/rllib/models/tf/layers/noisy_layer.py b/rllib/models/tf/layers/noisy_layer.py index 49b11e0c62e9..4498995e0226 100644 --- a/rllib/models/tf/layers/noisy_layer.py +++ b/rllib/models/tf/layers/noisy_layer.py @@ -1,8 +1,8 @@ import numpy as np -from ray.rllib.utils.framework import get_activation_fn, get_variable, \ - try_import_tf -from ray.rllib.utils.framework import TensorType, TensorShape +from ray.rllib.models.utils import get_activation_fn +from ray.rllib.utils.framework import get_variable, try_import_tf, \ + TensorType, TensorShape tf1, tf, tfv = try_import_tf() diff --git a/rllib/models/tf/visionnet.py b/rllib/models/tf/visionnet.py index e09668b49396..c2a8de5d2c97 100644 --- a/rllib/models/tf/visionnet.py +++ b/rllib/models/tf/visionnet.py @@ -3,8 +3,8 @@ from ray.rllib.models.tf.tf_modelv2 import TFModelV2 from ray.rllib.models.tf.misc import normc_initializer -from ray.rllib.models.utils import get_filter_config -from ray.rllib.utils.framework import get_activation_fn, try_import_tf +from ray.rllib.models.utils import get_activation_fn, get_filter_config 
+from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.typing import ModelConfigDict, TensorType tf1, tf, tfv = try_import_tf() diff --git a/rllib/models/torch/misc.py b/rllib/models/torch/misc.py index 307d7644179e..830e8bc33b5e 100644 --- a/rllib/models/torch/misc.py +++ b/rllib/models/torch/misc.py @@ -2,7 +2,8 @@ import numpy as np from typing import Union, Tuple, Any, List -from ray.rllib.utils.framework import get_activation_fn, try_import_torch +from ray.rllib.models.utils import get_activation_fn +from ray.rllib.utils.framework import try_import_torch from ray.rllib.utils.typing import TensorType torch, nn = try_import_torch() diff --git a/rllib/models/torch/modules/convtranspose2d_stack.py b/rllib/models/torch/modules/convtranspose2d_stack.py index 0eab6dc7cf16..4fc1735a6700 100644 --- a/rllib/models/torch/modules/convtranspose2d_stack.py +++ b/rllib/models/torch/modules/convtranspose2d_stack.py @@ -1,8 +1,8 @@ from typing import Tuple from ray.rllib.models.torch.misc import Reshape -from ray.rllib.models.utils import get_initializer -from ray.rllib.utils.framework import get_activation_fn, try_import_torch +from ray.rllib.models.utils import get_activation_fn, get_initializer +from ray.rllib.utils.framework import try_import_torch torch, nn = try_import_torch() if torch: diff --git a/rllib/models/torch/modules/noisy_layer.py b/rllib/models/torch/modules/noisy_layer.py index ee553c73c89b..f980dba0412a 100644 --- a/rllib/models/torch/modules/noisy_layer.py +++ b/rllib/models/torch/modules/noisy_layer.py @@ -1,7 +1,7 @@ import numpy as np -from ray.rllib.utils.framework import get_activation_fn, try_import_torch -from ray.rllib.utils.framework import TensorType +from ray.rllib.models.utils import get_activation_fn +from ray.rllib.utils.framework import try_import_torch, TensorType torch, nn = try_import_torch() diff --git a/rllib/models/utils.py b/rllib/models/utils.py index 2c9f076f0ebe..ed50ce08c986 100644 --- a/rllib/models/utils.py +++ b/rllib/models/utils.py @@ -1,4 +1,62 @@ -from ray.rllib.utils.framework import try_import_tf, try_import_torch +from typing import Optional + +from ray.rllib.utils.framework import try_import_jax, try_import_tf, \ + try_import_torch + + +def get_activation_fn(name: Optional[str] = None, framework: str = "tf"): + """Returns a framework specific activation function, given a name string. + + Args: + name (Optional[str]): One of "relu" (default), "tanh", "swish", or + "linear" or None. + framework (str): One of "jax", "tf|tfe|tf2" or "torch". + + Returns: + A framework-specific activtion function. e.g. tf.nn.tanh or + torch.nn.ReLU. None if name in ["linear", None]. + + Raises: + ValueError: If name is an unknown activation function. + """ + # Already a callable, return as-is. + if callable(name): + return name + + # Infer the correct activation function from the string specifier. 
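# Illustrative usage of `get_activation_fn`, based on the branches below
# (assumes the respective framework is installed):
#   get_activation_fn("relu", framework="torch")  -> torch.nn.ReLU (module class)
#   get_activation_fn("relu", framework="jax")    -> jax.nn.relu (function)
#   get_activation_fn("tanh", framework="tf")     -> tf.nn.tanh
#   get_activation_fn(None, framework="torch")    -> None (i.e. linear / no activation)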
+ if framework == "torch": + if name in ["linear", None]: + return None + if name == "swish": + from ray.rllib.utils.torch_ops import Swish + return Swish + _, nn = try_import_torch() + if name == "relu": + return nn.ReLU + elif name == "tanh": + return nn.Tanh + elif framework == "jax": + if name in ["linear", None]: + return None + jax, _ = try_import_jax() + if name == "swish": + return jax.nn.swish + if name == "relu": + return jax.nn.relu + elif name == "tanh": + return jax.nn.hard_tanh + else: + assert framework in ["tf", "tfe", "tf2"],\ + "Unsupported framework `{}`!".format(framework) + if name in ["linear", None]: + return None + tf1, tf, tfv = try_import_tf() + fn = getattr(tf.nn, name, None) + if fn is not None: + return fn + + raise ValueError("Unknown activation ({}) for framework={}!".format( + name, framework)) def get_filter_config(shape): @@ -40,7 +98,7 @@ def get_initializer(name, framework="tf"): Args: name (str): One of "xavier_uniform" (default), "xavier_normal". - framework (str): One of "tf" or "torch". + framework (str): One of "jax", "tf|tfe|tf2" or "torch". Returns: A framework-specific initializer function, e.g. @@ -50,14 +108,33 @@ def get_initializer(name, framework="tf"): Raises: ValueError: If name is an unknown initializer. """ + # Already a callable, return as-is. + if callable(name): + return name + + if framework == "jax": + _, flax = try_import_jax() + assert flax is not None,\ + "`flax` not installed. Try `pip install jax flax`." + import flax.linen as nn + if name in [None, "default", "xavier_uniform"]: + return nn.initializers.xavier_uniform() + elif name == "xavier_normal": + return nn.initializers.xavier_normal() if framework == "torch": _, nn = try_import_torch() + assert nn is not None,\ + "`torch` not installed. Try `pip install torch`." if name in [None, "default", "xavier_uniform"]: return nn.init.xavier_uniform_ elif name == "xavier_normal": return nn.init.xavier_normal_ else: + assert framework in ["tf", "tfe", "tf2"],\ + "Unsupported framework `{}`!".format(framework) tf1, tf, tfv = try_import_tf() + assert tf is not None,\ + "`tensorflow` not installed. Try `pip install tensorflow`." 
if name in [None, "default", "xavier_uniform"]: return tf.keras.initializers.GlorotUniform elif name == "xavier_normal": diff --git a/rllib/policy/__init__.py b/rllib/policy/__init__.py index ded33e1cac5f..e1a0f50b4898 100644 --- a/rllib/policy/__init__.py +++ b/rllib/policy/__init__.py @@ -1,5 +1,5 @@ from ray.rllib.policy.policy import Policy -from ray.rllib.policy.jax.jax_policy import JAXPolicy +from ray.rllib.policy.jax_policy import JAXPolicy from ray.rllib.policy.torch_policy import TorchPolicy from ray.rllib.policy.tf_policy import TFPolicy from ray.rllib.policy.policy_template import build_policy_class diff --git a/rllib/policy/jax/__init__.py b/rllib/policy/jax/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/rllib/policy/jax/jax_policy_template.py b/rllib/policy/jax/jax_policy_template.py deleted file mode 100644 index 3d7b03703ee9..000000000000 --- a/rllib/policy/jax/jax_policy_template.py +++ /dev/null @@ -1,370 +0,0 @@ -import gym -from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union - -from ray.rllib.models.catalog import ModelCatalog -from ray.rllib.models.modelv2 import ModelV2 -from ray.rllib.models.jax.jax_action_dist import JAXDistribution -from ray.rllib.models.jax.jax_modelv2 import JAXModelV2 -from ray.rllib.policy.jax.jax_policy import JAXPolicy -from ray.rllib.policy.policy import Policy, LEARNER_STATS_KEY -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.view_requirement import ViewRequirement -from ray.rllib.utils import add_mixins, force_list -from ray.rllib.utils.annotations import override, DeveloperAPI -from ray.rllib.utils.framework import try_import_jax -from ray.rllib.utils.torch_ops import convert_to_non_torch_type -from ray.rllib.utils.typing import TensorType, TrainerConfigDict - -jax, _ = try_import_jax() - - -@DeveloperAPI -def build_jax_policy_class( - name: str, - *, - loss_fn: Optional[Callable[[ - Policy, ModelV2, Type[JAXDistribution], SampleBatch - ], Union[TensorType, List[TensorType]]]], - get_default_config: Optional[Callable[[], TrainerConfigDict]] = None, - stats_fn: Optional[Callable[[Policy, SampleBatch], Dict[ - str, TensorType]]] = None, - postprocess_fn: Optional[Callable[[ - Policy, SampleBatch, Optional[Dict[Any, SampleBatch]], Optional[ - "MultiAgentEpisode"] - ], SampleBatch]] = None, - extra_action_out_fn: Optional[Callable[[ - Policy, Dict[str, TensorType], List[TensorType], ModelV2, - JAXDistribution - ], Dict[str, TensorType]]] = None, - extra_grad_process_fn: Optional[Callable[[ - Policy, "torch.optim.Optimizer", TensorType - ], Dict[str, TensorType]]] = None, - # TODO: (sven) Replace "fetches" with "process". 
- extra_learn_fetches_fn: Optional[Callable[[Policy], Dict[ - str, TensorType]]] = None, - optimizer_fn: Optional[Callable[[Policy, TrainerConfigDict], - "torch.optim.Optimizer"]] = None, - validate_spaces: Optional[Callable[ - [Policy, gym.Space, gym.Space, TrainerConfigDict], None]] = None, - before_init: Optional[Callable[ - [Policy, gym.Space, gym.Space, TrainerConfigDict], None]] = None, - before_loss_init: Optional[Callable[[ - Policy, gym.spaces.Space, gym.spaces.Space, TrainerConfigDict - ], None]] = None, - after_init: Optional[Callable[ - [Policy, gym.Space, gym.Space, TrainerConfigDict], None]] = None, - _after_loss_init: Optional[Callable[[ - Policy, gym.spaces.Space, gym.spaces.Space, TrainerConfigDict - ], None]] = None, - action_sampler_fn: Optional[Callable[[TensorType, List[ - TensorType]], Tuple[TensorType, TensorType]]] = None, - action_distribution_fn: Optional[Callable[[ - Policy, ModelV2, TensorType, TensorType, TensorType - ], Tuple[TensorType, type, List[TensorType]]]] = None, - make_model: Optional[Callable[[ - Policy, gym.spaces.Space, gym.spaces.Space, TrainerConfigDict - ], ModelV2]] = None, - make_model_and_action_dist: Optional[Callable[[ - Policy, gym.spaces.Space, gym.spaces.Space, TrainerConfigDict - ], Tuple[ModelV2, Type[JAXDistribution]]]] = None, - apply_gradients_fn: Optional[Callable[ - [Policy, "torch.optim.Optimizer"], None]] = None, - mixins: Optional[List[type]] = None, - view_requirements_fn: Optional[Callable[[Policy], Dict[ - str, ViewRequirement]]] = None, - get_batch_divisibility_req: Optional[Callable[[Policy], int]] = None -) -> Type[JAXPolicy]: - """Helper function for creating a torch policy class at runtime. - - Args: - name (str): name of the policy (e.g., "PPOTorchPolicy") - loss_fn (Optional[Callable[[Policy, ModelV2, - Type[TorchDistributionWrapper], SampleBatch], Union[TensorType, - List[TensorType]]]]): Callable that returns a loss tensor. - get_default_config (Optional[Callable[[None], TrainerConfigDict]]): - Optional callable that returns the default config to merge with any - overrides. If None, uses only(!) the user-provided - PartialTrainerConfigDict as dict for this Policy. - postprocess_fn (Optional[Callable[[Policy, SampleBatch, - Optional[Dict[Any, SampleBatch]], Optional["MultiAgentEpisode"]], - SampleBatch]]): Optional callable for post-processing experience - batches (called after the super's `postprocess_trajectory` method). - stats_fn (Optional[Callable[[Policy, SampleBatch], - Dict[str, TensorType]]]): Optional callable that returns a dict of - values given the policy and training batch. If None, - will use `TorchPolicy.extra_grad_info()` instead. The stats dict is - used for logging (e.g. in TensorBoard). - extra_action_out_fn (Optional[Callable[[Policy, Dict[str, TensorType], - List[TensorType], ModelV2, TorchDistributionWrapper]], Dict[str, - TensorType]]]): Optional callable that returns a dict of extra - values to include in experiences. If None, no extra computations - will be performed. - extra_grad_process_fn (Optional[Callable[[Policy, - "torch.optim.Optimizer", TensorType], Dict[str, TensorType]]]): - Optional callable that is called after gradients are computed and - returns a processing info dict. If None, will call the - `TorchPolicy.extra_grad_process()` method instead. - # TODO: (sven) dissolve naming mismatch between "learn" and "compute.." 
- extra_learn_fetches_fn (Optional[Callable[[Policy], - Dict[str, TensorType]]]): Optional callable that returns a dict of - extra tensors from the policy after loss evaluation. If None, - will call the `TorchPolicy.extra_compute_grad_fetches()` method - instead. - optimizer_fn (Optional[Callable[[Policy, TrainerConfigDict], - "torch.optim.Optimizer"]]): Optional callable that returns a - torch optimizer given the policy and config. If None, will call - the `TorchPolicy.optimizer()` method instead (which returns a - torch Adam optimizer). - validate_spaces (Optional[Callable[[Policy, gym.Space, gym.Space, - TrainerConfigDict], None]]): Optional callable that takes the - Policy, observation_space, action_space, and config to check for - correctness. If None, no spaces checking will be done. - before_init (Optional[Callable[[Policy, gym.Space, gym.Space, - TrainerConfigDict], None]]): Optional callable to run at the - beginning of `Policy.__init__` that takes the same arguments as - the Policy constructor. If None, this step will be skipped. - before_loss_init (Optional[Callable[[Policy, gym.spaces.Space, - gym.spaces.Space, TrainerConfigDict], None]]): Optional callable to - run prior to loss init. If None, this step will be skipped. - after_init (Optional[Callable[[Policy, gym.Space, gym.Space, - TrainerConfigDict], None]]): DEPRECATED: Use `before_loss_init` - instead. - _after_loss_init (Optional[Callable[[Policy, gym.spaces.Space, - gym.spaces.Space, TrainerConfigDict], None]]): Optional callable to - run after the loss init. If None, this step will be skipped. - This will be deprecated at some point and renamed into `after_init` - to match `build_tf_policy()` behavior. - action_sampler_fn (Optional[Callable[[TensorType, List[TensorType]], - Tuple[TensorType, TensorType]]]): Optional callable returning a - sampled action and its log-likelihood given some (obs and state) - inputs. If None, will either use `action_distribution_fn` or - compute actions by calling self.model, then sampling from the - so parameterized action distribution. - action_distribution_fn (Optional[Callable[[Policy, ModelV2, TensorType, - TensorType, TensorType], Tuple[TensorType, - Type[TorchDistributionWrapper], List[TensorType]]]]): A callable - that takes the Policy, Model, the observation batch, an - explore-flag, a timestep, and an is_training flag and returns a - tuple of a) distribution inputs (parameters), b) a dist-class to - generate an action distribution object from, and c) internal-state - outputs (empty list if not applicable). If None, will either use - `action_sampler_fn` or compute actions by calling self.model, - then sampling from the parameterized action distribution. - make_model (Optional[Callable[[Policy, gym.spaces.Space, - gym.spaces.Space, TrainerConfigDict], ModelV2]]): Optional callable - that takes the same arguments as Policy.__init__ and returns a - model instance. The distribution class will be determined - automatically. Note: Only one of `make_model` or - `make_model_and_action_dist` should be provided. If both are None, - a default Model will be created. - make_model_and_action_dist (Optional[Callable[[Policy, - gym.spaces.Space, gym.spaces.Space, TrainerConfigDict], - Tuple[ModelV2, Type[TorchDistributionWrapper]]]]): Optional - callable that takes the same arguments as Policy.__init__ and - returns a tuple of model instance and torch action distribution - class. - Note: Only one of `make_model` or `make_model_and_action_dist` - should be provided. 
If both are None, a default Model will be - created. - apply_gradients_fn (Optional[Callable[[Policy, - "torch.optim.Optimizer"], None]]): Optional callable that - takes a grads list and applies these to the Model's parameters. - If None, will call the `TorchPolicy.apply_gradients()` method - instead. - mixins (Optional[List[type]]): Optional list of any class mixins for - the returned policy class. These mixins will be applied in order - and will have higher precedence than the TorchPolicy class. - view_requirements_fn (Optional[Callable[[Policy], - Dict[str, ViewRequirement]]]): An optional callable to retrieve - additional train view requirements for this policy. - get_batch_divisibility_req (Optional[Callable[[Policy], int]]): - Optional callable that returns the divisibility requirement for - sample batches. If None, will assume a value of 1. - - Returns: - Type[TorchPolicy]: TorchPolicy child class constructed from the - specified args. - """ - - original_kwargs = locals().copy() - base = add_mixins(JAXPolicy, mixins) - - class policy_cls(base): - def __init__(self, obs_space, action_space, config): - if get_default_config: - config = dict(get_default_config(), **config) - self.config = config - - if validate_spaces: - validate_spaces(self, obs_space, action_space, self.config) - - if before_init: - before_init(self, obs_space, action_space, self.config) - - # Model is customized (use default action dist class). - if make_model: - assert make_model_and_action_dist is None, \ - "Either `make_model` or `make_model_and_action_dist`" \ - " must be None!" - self.model = make_model(self, obs_space, action_space, config) - dist_class, _ = ModelCatalog.get_action_dist( - action_space, self.config["model"], framework="torch") - # Model and action dist class are customized. - elif make_model_and_action_dist: - self.model, dist_class = make_model_and_action_dist( - self, obs_space, action_space, config) - # Use default model and default action dist. - else: - dist_class, logit_dim = ModelCatalog.get_action_dist( - action_space, self.config["model"], framework="torch") - self.model = ModelCatalog.get_model_v2( - obs_space=obs_space, - action_space=action_space, - num_outputs=logit_dim, - model_config=self.config["model"], - framework="torch") - - # Make sure, we passed in a correct Model factory. - assert isinstance(self.model, TorchModelV2), \ - "ERROR: Generated Model must be a TorchModelV2 object!" - - JAXPolicy.__init__( - self, - observation_space=obs_space, - action_space=action_space, - config=config, - model=self.model, - loss=loss_fn, - action_distribution_class=dist_class, - action_sampler_fn=action_sampler_fn, - action_distribution_fn=action_distribution_fn, - max_seq_len=config["model"]["max_seq_len"], - get_batch_divisibility_req=get_batch_divisibility_req, - ) - - # Update this Policy's ViewRequirements (if function given). - if callable(view_requirements_fn): - self.view_requirements.update(view_requirements_fn(self)) - # Merge Model's view requirements into Policy's. - self.view_requirements.update( - self.model.inference_view_requirements) - - _before_loss_init = before_loss_init or after_init - if _before_loss_init: - _before_loss_init(self, self.observation_space, - self.action_space, config) - - # Perform test runs through postprocessing- and loss functions. 
- self._initialize_loss_from_dummy_batch( - auto_remove_unneeded_view_reqs=True, - stats_fn=stats_fn, - ) - - if _after_loss_init: - _after_loss_init(self, obs_space, action_space, config) - - # Got to reset global_timestep again after this fake run-through. - self.global_timestep = 0 - - @override(Policy) - def postprocess_trajectory(self, - sample_batch, - other_agent_batches=None, - episode=None): - # Do all post-processing always with no_grad(). - # Not using this here will introduce a memory leak (issue #6962). - with torch.no_grad(): - # Call super's postprocess_trajectory first. - sample_batch = super().postprocess_trajectory( - sample_batch, other_agent_batches, episode) - if postprocess_fn: - return postprocess_fn(self, sample_batch, - other_agent_batches, episode) - - return sample_batch - - @override(JAXPolicy) - def extra_grad_process(self, optimizer, loss): - """Called after optimizer.zero_grad() and loss.backward() calls. - - Allows for gradient processing before optimizer.step() is called. - E.g. for gradient clipping. - """ - if extra_grad_process_fn: - return extra_grad_process_fn(self, optimizer, loss) - else: - return JAXPolicy.extra_grad_process(self, optimizer, loss) - - @override(JAXPolicy) - def extra_compute_grad_fetches(self): - if extra_learn_fetches_fn: - fetches = convert_to_non_torch_type( - extra_learn_fetches_fn(self)) - # Auto-add empty learner stats dict if needed. - return dict({LEARNER_STATS_KEY: {}}, **fetches) - else: - return JAXPolicy.extra_compute_grad_fetches(self) - - @override(JAXPolicy) - def apply_gradients(self, gradients): - if apply_gradients_fn: - apply_gradients_fn(self, gradients) - else: - JAXPolicy.apply_gradients(self, gradients) - - @override(JAXPolicy) - def extra_action_out(self, input_dict, state_batches, model, - action_dist): - with torch.no_grad(): - if extra_action_out_fn: - stats_dict = extra_action_out_fn( - self, input_dict, state_batches, model, action_dist) - else: - stats_dict = JAXPolicy.extra_action_out( - self, input_dict, state_batches, model, action_dist) - return convert_to_non_torch_type(stats_dict) - - @override(JAXPolicy) - def optimizer(self): - if optimizer_fn: - optimizers = optimizer_fn(self, self.config) - else: - optimizers = JAXPolicy.optimizer(self) - optimizers = force_list(optimizers) - if getattr(self, "exploration", None): - optimizers = self.exploration.get_exploration_optimizer( - optimizers) - return optimizers - - @override(JAXPolicy) - def extra_grad_info(self, train_batch): - with torch.no_grad(): - if stats_fn: - stats_dict = stats_fn(self, train_batch) - else: - stats_dict = JAXPolicy.extra_grad_info(self, train_batch) - return convert_to_non_torch_type(stats_dict) - - def with_updates(**overrides): - """Allows creating a TorchPolicy cls based on settings of another one. - - Keyword Args: - **overrides: The settings (passed into `build_torch_policy`) that - should be different from the class that this method is called - on. - - Returns: - type: A new TorchPolicy sub-class. - - Examples: - >> MySpecialDQNPolicyClass = DQNTorchPolicy.with_updates( - .. name="MySpecialDQNPolicyClass", - .. loss_function=[some_new_loss_function], - .. 
) - """ - return build_jax_policy_class(**dict(original_kwargs, **overrides)) - - policy_cls.with_updates = staticmethod(with_updates) - policy_cls.__name__ = name - policy_cls.__qualname__ = name - return policy_cls diff --git a/rllib/policy/jax/jax_policy.py b/rllib/policy/jax_policy.py similarity index 100% rename from rllib/policy/jax/jax_policy.py rename to rllib/policy/jax_policy.py diff --git a/rllib/policy/policy_template.py b/rllib/policy/policy_template.py index 4d75ff989451..38f278b2ae97 100644 --- a/rllib/policy/policy_template.py +++ b/rllib/policy/policy_template.py @@ -6,7 +6,7 @@ from ray.rllib.models.modelv2 import ModelV2 from ray.rllib.models.torch.torch_action_dist import TorchDistributionWrapper from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 -from ray.rllib.policy.jax.jax_policy import JAXPolicy +from ray.rllib.policy.jax_policy import JAXPolicy from ray.rllib.policy.policy import Policy, LEARNER_STATS_KEY from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.policy.torch_policy import TorchPolicy diff --git a/rllib/policy/torch_policy_template.py b/rllib/policy/torch_policy_template.py index 2da390c56e6e..78777f6ba141 100644 --- a/rllib/policy/torch_policy_template.py +++ b/rllib/policy/torch_policy_template.py @@ -1,6 +1,7 @@ import gym from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union +from ray.util import log_once from ray.rllib.models.modelv2 import ModelV2 from ray.rllib.models.torch.torch_action_dist import TorchDistributionWrapper from ray.rllib.policy.policy import Policy @@ -69,10 +70,11 @@ def build_torch_policy( get_batch_divisibility_req: Optional[Callable[[Policy], int]] = None ) -> Type[TorchPolicy]: - deprecation_warning( - old="build_torch_policy", - new="build_policy_class(framework='torch')", - error=False) + if log_once("deprecation_warning_build_torch_policy"): + deprecation_warning( + old="build_torch_policy", + new="build_policy_class(framework='torch')", + error=False) kwargs = locals().copy() # Set to torch and call new function. 
kwargs["framework"] = "torch" diff --git a/rllib/utils/exploration/curiosity.py b/rllib/utils/exploration/curiosity.py index a9434e1a1174..ec91c53d39f5 100644 --- a/rllib/utils/exploration/curiosity.py +++ b/rllib/utils/exploration/curiosity.py @@ -9,11 +9,12 @@ from ray.rllib.models.torch.misc import SlimFC from ray.rllib.models.torch.torch_action_dist import TorchCategorical, \ TorchMultiCategorical +from ray.rllib.models.utils import get_activation_fn from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils import NullContextManager from ray.rllib.utils.annotations import override from ray.rllib.utils.exploration.exploration import Exploration -from ray.rllib.utils.framework import get_activation_fn, try_import_tf, \ +from ray.rllib.utils.framework import try_import_tf, \ try_import_torch from ray.rllib.utils.from_config import from_config from ray.rllib.utils.tf_ops import get_placeholder, one_hot as tf_one_hot diff --git a/rllib/utils/framework.py b/rllib/utils/framework.py index b5cda9964622..529ed9d0556e 100644 --- a/rllib/utils/framework.py +++ b/rllib/utils/framework.py @@ -4,6 +4,7 @@ import sys from typing import Any, Optional +from ray.rllib.utils.deprecation import deprecation_warning from ray.rllib.utils.typing import TensorStructType, TensorShape, TensorType logger = logging.getLogger(__name__) @@ -251,7 +252,6 @@ def get_variable(value, return value -# TODO: (sven) move to models/utils.py def get_activation_fn(name: Optional[str] = None, framework: str = "tf"): """Returns a framework specific activation function, given a name string. @@ -267,6 +267,9 @@ def get_activation_fn(name: Optional[str] = None, framework: str = "tf"): Raises: ValueError: If name is an unknown activation function. """ + deprecation_warning("rllib/utils/framework.py::get_activation_fn", + "rllib/models/utils.py::get_activation_fn", + error=False) if framework == "torch": if name in ["linear", None]: return None @@ -278,16 +281,6 @@ def get_activation_fn(name: Optional[str] = None, framework: str = "tf"): return nn.ReLU elif name == "tanh": return nn.Tanh - elif framework == "jax": - if name in ["linear", None]: - return None - jax, _ = try_import_jax() - if name == "swish": - return jax.nn.swish - if name == "relu": - return jax.nn.relu - elif name == "tanh": - return jax.nn.hard_tanh else: if name in ["linear", None]: return None From d34facbf26fd08f396b338345cae9d7b84bf0f6e Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sun, 20 Dec 2020 09:35:51 -0500 Subject: [PATCH 05/16] WIP. 
--- rllib/models/jax/fcnet.py | 19 +++++++++++++------ rllib/models/jax/jax_modelv2.py | 12 +++++++++--- rllib/models/jax/misc.py | 15 ++------------- rllib/models/jax/modules/fc_stack.py | 13 ++++++------- rllib/models/tests/test_jax_models.py | 15 +++++++++++++++ 5 files changed, 45 insertions(+), 29 deletions(-) diff --git a/rllib/models/jax/fcnet.py b/rllib/models/jax/fcnet.py index a9789d67472c..a35e0f0a3376 100644 --- a/rllib/models/jax/fcnet.py +++ b/rllib/models/jax/fcnet.py @@ -1,13 +1,18 @@ import logging import numpy as np import time +from typing import Dict, List, Union from ray.rllib.models.jax.jax_modelv2 import JAXModelV2 from ray.rllib.models.jax.modules.fc_stack import FCStack from ray.rllib.utils.annotations import override from ray.rllib.utils.framework import try_import_jax +from ray.rllib.utils.typing import TensorType jax, flax = try_import_jax() +jnp = None +if jax: + import jax.numpy as jnp logger = logging.getLogger(__name__) @@ -31,6 +36,7 @@ def __init__(self, obs_space, action_space, num_outputs, model_config, raise ValueError("`free_log_std` not supported for JAX yet!") self._logits = None + self._logits_params = None # The last layer is adjusted to be of size num_outputs, but it's a # layer with activation. @@ -53,10 +59,6 @@ def __init__(self, obs_space, action_space, num_outputs, model_config, activation=activation, prng_key=self.prng_key, ) - #TODO - import jax.numpy as jnp - in_ = jnp.zeros((10, in_features)) - vars = self._hidden_layers.init(self.prng_key, in_) prev_layer_size = hiddens[-1] if num_outputs: self._logits = FCStack( @@ -65,10 +67,15 @@ def __init__(self, obs_space, action_space, num_outputs, model_config, activation=None, prng_key=self.prng_key, ) + self._logits_params = self._logits.init(self.prng_key, jnp.zeros((1, prev_layer_size))) else: self.num_outputs = ( [int(np.product(obs_space.shape))] + hiddens[-1:])[-1] + # Init hidden layers. + in_ = jnp.zeros((1, in_features)) + self._hidden_layers_params = self._hidden_layers.init(self.prng_key, in_) + self._value_branch_separate = None if not self.vf_share_layers: # Build a parallel set of hidden layers for the value net. 
@@ -91,8 +98,8 @@ def __init__(self, obs_space, action_space, num_outputs, model_config, @override(JAXModelV2) def forward(self, input_dict, state, seq_lens): self._last_flat_in = input_dict["obs_flat"] - self._features = self._hidden_layers(self._last_flat_in) - logits = self._logits(self._features) if self._logits else \ + self._features = self._hidden_layers.apply(self._hidden_layers_params, self._last_flat_in) + logits = self._logits.apply(self._logits_params, self._features) if self._logits else \ self._features return logits, state diff --git a/rllib/models/jax/jax_modelv2.py b/rllib/models/jax/jax_modelv2.py index 96ef98b818ec..f5c73e512032 100644 --- a/rllib/models/jax/jax_modelv2.py +++ b/rllib/models/jax/jax_modelv2.py @@ -9,9 +9,9 @@ jax, flax = try_import_jax() -nn = None +fd = None if flax: - import flax.linen as nn + from flax.core.frozen_dict import FrozenDict as fd @PublicAPI @@ -43,7 +43,13 @@ def __init__(self, obs_space: gym.spaces.Space, @override(ModelV2) def variables(self, as_dict: bool = False ) -> Union[List[TensorType], Dict[str, TensorType]]: - return self.variables + params = fd({ + k: v["params"] for k, v in self.__dict__.items() if + isinstance(v, fd) and "params" in v + }) + if as_dict: + return params + return list(params.values()) @PublicAPI @override(ModelV2) diff --git a/rllib/models/jax/misc.py b/rllib/models/jax/misc.py index 09415f7a0747..cd4b64eac57a 100644 --- a/rllib/models/jax/misc.py +++ b/rllib/models/jax/misc.py @@ -22,8 +22,6 @@ class SlimFC(nn.Module if nn else object): activation (str): An activation string specifier, e.g. "relu". use_bias (bool): Whether to add biases to the dot product or not. #bias_init (float): - prng_key (Optional[jax.random.PRNGKey]): An optional PRNG key to - use for initialization. If None, create a new random one. """ in_size: int @@ -31,34 +29,25 @@ class SlimFC(nn.Module if nn else object): initializer: Optional[Union[Callable, str]] = None activation: Optional[Union[Callable, str]] = None use_bias: bool = True - prng_key: Optional[jax.random.PRNGKey] = None def setup(self): # By default, use Glorot unform initializer. if self.initializer is None: self.initializer = "xavier_uniform" - if self.prng_key is None: - self.prng_key = jax.random.PRNGKey(int(time.time())) - #_, self.prng_key = jax.random.split(self.prng_key) - # Activation function (if any; default=None (linear)). self.initializer_fn = get_initializer(self.initializer, framework="jax") self.activation_fn = get_activation_fn(self.activation, framework="jax") # Create the flax dense layer. - self._dense = nn.Dense( + self.dense = nn.Dense( self.out_size, use_bias=self.use_bias, kernel_init=self.initializer_fn, ) - # Initialize it. - in_ = jnp.zeros((self.in_size, )) - #_, self.prng_key = jax.random.split(self.prng_key) - self._params = self._dense.init(self.prng_key, in_) def __call__(self, x): - out = self._dense.apply(self._params, x) + out = self.dense(x) if self.activation_fn: out = self.activation_fn(out) return out diff --git a/rllib/models/jax/modules/fc_stack.py b/rllib/models/jax/modules/fc_stack.py index 59ca352a732f..d31627d5927c 100644 --- a/rllib/models/jax/modules/fc_stack.py +++ b/rllib/models/jax/modules/fc_stack.py @@ -38,23 +38,22 @@ def setup(self): self.initializer = get_initializer(self.initializer, framework="jax") # Create all layers. 
- self._hidden_layers = [] + hidden_layers = [] prev_layer_size = self.in_features for i, size in enumerate(self.layers): - slim_fc = SlimFC( + #setattr(self, "fc_{}".format(i), slim_fc) + hidden_layers.append(SlimFC( in_size=prev_layer_size, out_size=size, use_bias=self.use_bias, initializer=self.initializer, activation=self.activation, - prng_key=self.prng_key, - ) - setattr(self, "fc_{}".format(i), slim_fc) - self._hidden_layers.append(slim_fc) + )) prev_layer_size = size + self.hidden_layers = hidden_layers def __call__(self, inputs): x = inputs - for layer in self._hidden_layers: + for layer in self.hidden_layers: x = layer(x) return x diff --git a/rllib/models/tests/test_jax_models.py b/rllib/models/tests/test_jax_models.py index 255ff4b6493d..478f8cf266cc 100644 --- a/rllib/models/tests/test_jax_models.py +++ b/rllib/models/tests/test_jax_models.py @@ -2,6 +2,8 @@ import unittest from ray.rllib.models.jax.fcnet import FullyConnectedNetwork +from ray.rllib.models.jax.misc import SlimFC +from ray.rllib.models.jax.modules.fc_stack import FCStack from ray.rllib.utils.framework import try_import_jax @@ -13,6 +15,18 @@ class TestJAXModels(unittest.TestCase): + def test_jax_slimfc(self): + slimfc = SlimFC(5, 2) + prng = jax.random.PRNGKey(0) + params = slimfc.init(prng, jnp.zeros((1, 5))) + assert params + + def test_jax_fcstack(self): + fc_stack = FCStack(5, [2, 2], "relu") + prng = jax.random.PRNGKey(0) + params = fc_stack.init(prng, jnp.zeros((1, 5))) + assert params + def test_jax_fcnet(self): """Tests the JAX FCNet class.""" batch_size = 1000 @@ -30,6 +44,7 @@ def test_jax_fcnet(self): ) inputs = jnp.array([obs_space.sample()]) print(fc_net({"obs": inputs})) + fc_net.variables() if __name__ == "__main__": From f54309c70c2ae784ee90e4486fc4adfbe2ad4b89 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Mon, 21 Dec 2020 12:30:30 -0500 Subject: [PATCH 06/16] WIP. --- rllib/agents/ppo/ppo.py | 2 +- rllib/agents/ppo/ppo_jax_policy.py | 17 +++---- rllib/models/jax/fcnet.py | 13 +++-- rllib/models/jax/jax_modelv2.py | 11 ++--- rllib/policy/jax_policy.py | 77 +++++------------------------- rllib/policy/torch_policy.py | 2 +- rllib/utils/jax_ops.py | 26 ++++++++++ 7 files changed, 63 insertions(+), 85 deletions(-) diff --git a/rllib/agents/ppo/ppo.py b/rllib/agents/ppo/ppo.py index 2ee5b6f9584e..86014d0c6365 100644 --- a/rllib/agents/ppo/ppo.py +++ b/rllib/agents/ppo/ppo.py @@ -130,7 +130,7 @@ def validate_config(config: TrainerConfigDict) -> None: "trajectory). Consider setting batch_mode=complete_episodes.") # Multi-gpu not supported for PyTorch and tf-eager. - if config["framework"] in ["tf2", "tfe", "torch"]: + if config["framework"] != "tf": config["simple_optimizer"] = True # Performance warning, if "simple" optimizer used with (static-graph) tf. elif config["simple_optimizer"]: diff --git a/rllib/agents/ppo/ppo_jax_policy.py b/rllib/agents/ppo/ppo_jax_policy.py index 99c928b8e53d..ec3d466c328a 100644 --- a/rllib/agents/ppo/ppo_jax_policy.py +++ b/rllib/agents/ppo/ppo_jax_policy.py @@ -139,20 +139,21 @@ def __init__(self, obs_space, action_space, config): # observation. if config["use_gae"]: - def value(ob, prev_action, prev_reward, *state): - model_out, _ = self.model({ - SampleBatch.CUR_OBS: jnp.asarray([ob]), - SampleBatch.PREV_ACTIONS: jnp.asarray([prev_action]), - SampleBatch.PREV_REWARDS: jnp.asarray([prev_reward]), - "is_training": False, - }, [jnp.asarray([s]) for s in state], jnp.asarray([1])) + # Input dict is provided to us automatically via the Model's + # requirements. 
It's a single-timestep (last one in trajectory) + # input_dict. + assert config["_use_trajectory_view_api"] + + def value(**input_dict): + model_out, _ = self.model.from_batch( + input_dict, is_training=False) # [0] = remove the batch dim. return self.model.value_function()[0] # When not doing GAE, we do not require the value function's output. else: - def value(ob, prev_action, prev_reward, *state): + def value(*args, **kwargs): return 0.0 self._value = value diff --git a/rllib/models/jax/fcnet.py b/rllib/models/jax/fcnet.py index a35e0f0a3376..8ddfab5e3627 100644 --- a/rllib/models/jax/fcnet.py +++ b/rllib/models/jax/fcnet.py @@ -80,16 +80,21 @@ def __init__(self, obs_space, action_space, num_outputs, model_config, if not self.vf_share_layers: # Build a parallel set of hidden layers for the value net. self._value_branch_separate = FCStack( - in_features=int(np.product(obs_space.shape)), + in_features=in_features, layers=hiddens, activation=activation, prng_key=self.prng_key, ) + in_ = jnp.zeros((1, in_features)) + self._value_branch_separate_params = self._value_branch_separate.init(self.prng_key, in_) + self._value_branch = FCStack( in_features=prev_layer_size, layers=[1], prng_key=self.prng_key, ) + in_ = jnp.zeros((1, prev_layer_size)) + self._value_branch_params = self._value_branch.init(self.prng_key, in_) # Holds the current "base" output (before logits layer). self._features = None # Holds the last input, in case value branch is separate. @@ -107,7 +112,7 @@ def forward(self, input_dict, state, seq_lens): def value_function(self): assert self._features is not None, "must call forward() first" if self._value_branch_separate: - x = self._value_branch_separate(self._last_flat_in) - return self._value_branch(x).squeeze(1) + x = self._value_branch_separate.apply(self._value_branch_separate_params, self._last_flat_in) + return self._value_branch.apply(self._value_branch_params, x).squeeze(1) else: - return self._value_branch(self._features).squeeze(1) + return self._value_branch.apply(self._value_branch_params, self._features).squeeze(1) diff --git a/rllib/models/jax/jax_modelv2.py b/rllib/models/jax/jax_modelv2.py index f5c73e512032..212ee69677f4 100644 --- a/rllib/models/jax/jax_modelv2.py +++ b/rllib/models/jax/jax_modelv2.py @@ -1,5 +1,6 @@ import gym import time +import tree from typing import Dict, List, Union from ray.rllib.models.modelv2 import ModelV2 @@ -43,17 +44,15 @@ def __init__(self, obs_space: gym.spaces.Space, @override(ModelV2) def variables(self, as_dict: bool = False ) -> Union[List[TensorType], Dict[str, TensorType]]: - params = fd({ - k: v["params"] for k, v in self.__dict__.items() if - isinstance(v, fd) and "params" in v - }) + params = fd({k: v["params"]._dict for k, v in self.__dict__.items() if + isinstance(v, fd) and "params" in v})._dict if as_dict: return params - return list(params.values()) + return tree.flatten(params) @PublicAPI @override(ModelV2) def trainable_variables( self, as_dict: bool = False ) -> Union[List[TensorType], Dict[str, TensorType]]: - return self.variables + return self.variables(as_dict=as_dict) diff --git a/rllib/policy/jax_policy.py b/rllib/policy/jax_policy.py index fa61cf54e82d..3a8ff512b829 100644 --- a/rllib/policy/jax_policy.py +++ b/rllib/policy/jax_policy.py @@ -14,16 +14,18 @@ from ray.rllib.utils import force_list from ray.rllib.utils.annotations import override, DeveloperAPI from ray.rllib.utils.framework import try_import_jax +from ray.rllib.utils.jax_ops import convert_to_non_jax_type from ray.rllib.utils.schedules 
import ConstantSchedule, PiecewiseSchedule -from ray.rllib.utils.torch_ops import convert_to_non_torch_type, \ - convert_to_torch_tensor from ray.rllib.utils.tracking_dict import UsageTrackingDict from ray.rllib.utils.typing import ModelGradients, ModelWeights, \ TensorType, TrainerConfigDict jax, flax = try_import_jax() +jnp = None +fd = None if jax: import jax.numpy as jnp + from flax.core.frozen_dict import FrozenDict as fd logger = logging.getLogger(__name__) @@ -221,7 +223,7 @@ def compute_actions_from_input_dict( # Update our global timestep by the batch size. self.global_timestep += len(input_dict[SampleBatch.CUR_OBS]) - return actions, state_out, extra_fetches + return convert_to_non_jax_type((actions, state_out, extra_fetches)) @override(Policy) @DeveloperAPI @@ -384,16 +386,17 @@ def apply_gradients(self, gradients: ModelGradients) -> None: @override(Policy) @DeveloperAPI def get_weights(self) -> ModelWeights: + cpu = jax.devices("cpu")[0] return { - k: v.cpu().detach().numpy() - for k, v in self.model.state_dict().items() + k: jax.device_put(v, cpu) + for k, v in self.model.variables(as_dict=True).items() } @override(Policy) @DeveloperAPI def set_weights(self, weights: ModelWeights) -> None: - weights = convert_to_torch_tensor(weights, device=self.device) - self.model.load_state_dict(weights) + for k, v in weights.items(): + setattr(self.model, k, fd({"params": v})) @override(Policy) @DeveloperAPI @@ -418,7 +421,7 @@ def get_state(self) -> Union[Dict[str, TensorType], List[TensorType]]: state = super().get_state() state["_optimizer_variables"] = [] for i, o in enumerate(self._optimizers): - optim_state_dict = convert_to_non_torch_type(o.state_dict()) + optim_state_dict = convert_to_non_jax_type(o.state_dict()) state["_optimizer_variables"].append(optim_state_dict) return state @@ -543,61 +546,5 @@ def _lazy_tensor_dict(self, data): #def _lazy_numpy_dict(self, postprocessed_batch): # train_batch = UsageTrackingDict(postprocessed_batch) # train_batch.set_get_interceptor( - # functools.partial(convert_to_non_torch_type)) + # functools.partial(convert_to_non_jax_type)) # return train_batch - - -# TODO: (sven) Unify hyperparam annealing procedures across RLlib (tf/torch) -# and for all possible hyperparams, not just lr. 
-@DeveloperAPI -class LearningRateSchedule: - """Mixin for TFPolicy that adds a learning rate schedule.""" - - @DeveloperAPI - def __init__(self, lr, lr_schedule): - self.cur_lr = lr - if lr_schedule is None: - self.lr_schedule = ConstantSchedule(lr, framework=None) - else: - self.lr_schedule = PiecewiseSchedule( - lr_schedule, outside_value=lr_schedule[-1][-1], framework=None) - - @override(Policy) - def on_global_var_update(self, global_vars): - super().on_global_var_update(global_vars) - self.cur_lr = self.lr_schedule.value(global_vars["timestep"]) - for opt in self._optimizers: - for p in opt.param_groups: - p["lr"] = self.cur_lr - - -@DeveloperAPI -class EntropyCoeffSchedule: - """Mixin for TorchPolicy that adds entropy coeff decay.""" - - @DeveloperAPI - def __init__(self, entropy_coeff, entropy_coeff_schedule): - self.entropy_coeff = entropy_coeff - - if entropy_coeff_schedule is None: - self.entropy_coeff_schedule = ConstantSchedule( - entropy_coeff, framework=None) - else: - # Allows for custom schedule similar to lr_schedule format - if isinstance(entropy_coeff_schedule, list): - self.entropy_coeff_schedule = PiecewiseSchedule( - entropy_coeff_schedule, - outside_value=entropy_coeff_schedule[-1][-1], - framework=None) - else: - # Implements previous version but enforces outside_value - self.entropy_coeff_schedule = PiecewiseSchedule( - [[0, entropy_coeff], [entropy_coeff_schedule, 0.0]], - outside_value=0.0, - framework=None) - - @override(Policy) - def on_global_var_update(self, global_vars): - super(EntropyCoeffSchedule, self).on_global_var_update(global_vars) - self.entropy_coeff = self.entropy_coeff_schedule.value( - global_vars["timestep"]) diff --git a/rllib/policy/torch_policy.py b/rllib/policy/torch_policy.py index c27a7603dff2..28618618b4bb 100644 --- a/rllib/policy/torch_policy.py +++ b/rllib/policy/torch_policy.py @@ -604,7 +604,7 @@ def _lazy_numpy_dict(self, postprocessed_batch): # and for all possible hyperparams, not just lr. @DeveloperAPI class LearningRateSchedule: - """Mixin for TFPolicy that adds a learning rate schedule.""" + """Mixin for TorchPolicy that adds a learning rate schedule.""" @DeveloperAPI def __init__(self, lr, lr_schedule): diff --git a/rllib/utils/jax_ops.py b/rllib/utils/jax_ops.py index 04a0a1581a7d..d775d520f5b4 100644 --- a/rllib/utils/jax_ops.py +++ b/rllib/utils/jax_ops.py @@ -1,3 +1,6 @@ +import numpy as np +import tree + from ray.rllib.utils.framework import try_import_jax jax, _ = try_import_jax() @@ -6,6 +9,29 @@ import jax.numpy as jnp +def convert_to_non_jax_type(stats): + """Converts values in `stats` to non-JAX numpy or python types. + + Args: + stats (any): Any (possibly nested) struct, the values in which will be + converted and returned as a new struct with all JAX DeviceArrays + being converted to numpy types. + + Returns: + Any: A new struct with the same structure as `stats`, but with all + values converted to non-JAX Tensor types. + """ + + # The mapping function used to numpyize JAX DeviceArrays. + def mapping(item): + if isinstance(item, jnp.DeviceArray): + return np.array(item) + else: + return item + + return tree.map_structure(mapping, stats) + + def explained_variance(y, pred): y_var = jnp.var(y, axis=[0]) diff_var = jnp.var(y - pred, axis=[0]) From 48d619d58b306bda2812dccd240a3c92b4a9517b Mon Sep 17 00:00:00 2001 From: sven1977 Date: Mon, 21 Dec 2020 17:25:49 -0500 Subject: [PATCH 07/16] WIP. 
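This commit starts wiring real gradient computation into JAXPolicy: the PPO loss gains an explicit vars argument, the policy builds self._gradient_loss = jax.grad(self._loss, argnums=4), and compute_gradients() returns a gradient pytree for the model variables instead of relying on a torch-style loss.backward(). A small sketch of differentiating a loss with respect to one positional argument via argnums; the function, batch and parameter names below are illustrative and do not match the actual RLlib signatures.

    import jax
    import jax.numpy as jnp

    def loss_fn(policy, model, dist_class, batch, params):
        preds = jnp.dot(batch["obs"], params["w"])
        return jnp.mean((preds - batch["target"]) ** 2)

    # Gradient w.r.t. the fifth positional argument (the params pytree).
    grad_fn = jax.grad(loss_fn, argnums=4)

    params = {"w": jnp.ones((4, 1))}
    batch = {"obs": jnp.ones((8, 4)), "target": jnp.zeros((8, 1))}
    # Returns a dict with the same structure as `params`.
    grads = grad_fn(None, None, None, batch, params)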
--- rllib/agents/ppo/ppo_jax_policy.py | 4 +- rllib/policy/jax_policy.py | 73 ++++++++++++------------------ 2 files changed, 31 insertions(+), 46 deletions(-) diff --git a/rllib/agents/ppo/ppo_jax_policy.py b/rllib/agents/ppo/ppo_jax_policy.py index ec3d466c328a..8f8f6f8ee767 100644 --- a/rllib/agents/ppo/ppo_jax_policy.py +++ b/rllib/agents/ppo/ppo_jax_policy.py @@ -35,7 +35,9 @@ def ppo_surrogate_loss( policy: Policy, model: ModelV2, dist_class: Type[TorchDistributionWrapper], - train_batch: SampleBatch) -> Union[TensorType, List[TensorType]]: + train_batch: SampleBatch, + vars=None,#TODO: test +) -> Union[TensorType, List[TensorType]]: """Constructs the loss for Proximal Policy Objective. Args: diff --git a/rllib/policy/jax_policy.py b/rllib/policy/jax_policy.py index 3a8ff512b829..ced959a8a27d 100644 --- a/rllib/policy/jax_policy.py +++ b/rllib/policy/jax_policy.py @@ -112,6 +112,7 @@ def __init__( self.exploration = self._create_exploration() self.unwrapped_model = model # used to support DistributedDataParallel self._loss = loss + self._gradient_loss = jax.grad(self._loss, argnums=4) self._optimizers = force_list(self.optimizer()) self.dist_class = action_distribution_class @@ -288,7 +289,7 @@ def learn_on_batch( # Step the optimizer(s). for i, opt in enumerate(self._optimizers): - opt.step() + opt.apply_gradients(grads) if self.model: fetches["model"] = self.model.metrics() @@ -309,20 +310,20 @@ def compute_gradients(self, train_batch = self._lazy_tensor_dict(postprocessed_batch) # Calculate the actual policy loss. - loss_out = force_list( - self._loss(self, self.model, self.dist_class, train_batch)) + all_grads = force_list( + self._gradient_loss(self, self.model, self.dist_class, train_batch, self.model.variables())) # Call Model's custom-loss with Policy loss outputs and train_batch. - if self.model: - loss_out = self.model.custom_loss(loss_out, train_batch) + #if self.model: + # loss_out = self.model.custom_loss(loss_out, train_batch) # Give Exploration component that chance to modify the loss (or add # its own terms). - if hasattr(self, "exploration"): - loss_out = self.exploration.get_exploration_loss( - loss_out, train_batch) + #if hasattr(self, "exploration"): + # loss_out = self.exploration.get_exploration_loss( + # loss_out, train_batch) - assert len(loss_out) == len(self._optimizers) + #assert len(loss_out) == len(self._optimizers) # assert not any(torch.isnan(l) for l in loss_out) fetches = self.extra_compute_grad_fetches() @@ -330,42 +331,24 @@ def compute_gradients(self, # Loop through all optimizers. grad_info = {"allreduce_latency": 0.0} - all_grads = [] - for i, opt in enumerate(self._optimizers): - # Erase gradients in all vars of this optimizer. - opt.zero_grad() - # Recompute gradients of loss over all variables. - loss_out[i].backward(retain_graph=(i < len(self._optimizers) - 1)) - grad_info.update(self.extra_grad_process(opt, loss_out[i])) - - grads = [] - # Note that return values are just references; - # Calling zero_grad would modify the values. - for param_group in opt.param_groups: - for p in param_group["params"]: - if p.grad is not None: - grads.append(p.grad) - all_grads.append(p.grad.data.cpu().numpy()) - else: - all_grads.append(None) - - if self.distributed_world_size: - start = time.time() - if torch.cuda.is_available(): - # Sadly, allreduce_coalesced does not work with CUDA yet. 
- for g in grads: - torch.distributed.all_reduce( - g, op=torch.distributed.ReduceOp.SUM) - else: - torch.distributed.all_reduce_coalesced( - grads, op=torch.distributed.ReduceOp.SUM) - - for param_group in opt.param_groups: - for p in param_group["params"]: - if p.grad is not None: - p.grad /= self.distributed_world_size - - grad_info["allreduce_latency"] += time.time() - start + #all_grads = [] + #for i, opt in enumerate(self._optimizers): + # # Erase gradients in all vars of this optimizer. + # opt.zero_grad() + # # Recompute gradients of loss over all variables. + # loss_out[i].backward(retain_graph=(i < len(self._optimizers) - 1)) + # grad_info.update(self.extra_grad_process(opt, loss_out[i])) + + # grads = [] + # # Note that return values are just references; + # # Calling zero_grad would modify the values. + # for param_group in opt.param_groups: + # for p in param_group["params"]: + # if p.grad is not None: + # grads.append(p.grad) + # all_grads.append(p.grad.data.cpu().numpy()) + # else: + # all_grads.append(None) grad_info["allreduce_latency"] /= len(self._optimizers) grad_info.update(self.extra_grad_info(train_batch)) From 9b3187251f7e8b94c17b3bad9611007f3762b43b Mon Sep 17 00:00:00 2001 From: sven1977 Date: Tue, 22 Dec 2020 06:37:26 -0500 Subject: [PATCH 08/16] WIP. --- rllib/policy/jax_policy.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/rllib/policy/jax_policy.py b/rllib/policy/jax_policy.py index ced959a8a27d..633a4c5095d2 100644 --- a/rllib/policy/jax_policy.py +++ b/rllib/policy/jax_policy.py @@ -498,9 +498,10 @@ def optimizer( The local FLAX optimizer(s) to use for this Policy. """ if hasattr(self, "config"): - return flax.optim.Adam(learning_rate=self.config["lr"]) + return flax.optim.Optimizer(flax.optim.Adam( + learning_rate=self.config["lr"])) else: - return flax.optim.Adam() + return flax.optim.Optimizer(flax.optim.Adam()) @override(Policy) @DeveloperAPI From 548b6b5b4692a940c2415f281c0f01eb550c7540 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 24 Dec 2020 14:51:59 -0500 Subject: [PATCH 09/16] Fix and LINT. 
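Besides lint fixes, this commit completes the switch to flax's optimizer API: optimizer() builds the Adam optimizer definition and wraps the current model weights into a flax.optim.Optimizer, and learn_on_batch() / apply_gradients() reassign the optimizer after each step, since apply_gradient() returns a new Optimizer rather than mutating state in place. A rough sketch of that (since deprecated) flax.optim workflow, with illustrative parameter shapes; the create() call here is the convenience form of the init_state() + Optimizer(...) construction used in the patch.

    import flax
    import jax.numpy as jnp

    params = {"w": jnp.zeros((4, 2)), "b": jnp.zeros((2,))}
    optimizer_def = flax.optim.Adam(learning_rate=5e-5)
    # Wraps `params` as optimizer.target.
    optimizer = optimizer_def.create(params)

    grads = {"w": jnp.ones((4, 2)), "b": jnp.ones((2,))}
    # Functional update: a new Optimizer is returned, nothing is mutated.
    optimizer = optimizer.apply_gradient(grads)
    new_params = optimizer.target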
--- rllib/BUILD | 7 ++ rllib/agents/ppo/ppo_jax_policy.py | 18 ++-- rllib/agents/ppo/ppo_torch_policy.py | 4 +- rllib/agents/ppo/tests/test_ppo.py | 21 ++-- rllib/agents/slateq/slateq_torch_policy.py | 1 - rllib/examples/custom_torch_policy.py | 4 +- rllib/models/jax/fcnet.py | 28 ++--- rllib/models/jax/jax_modelv2.py | 8 +- rllib/models/jax/misc.py | 8 +- rllib/models/jax/modules/fc_stack.py | 22 ++-- rllib/models/tests/test_jax_models.py | 6 +- rllib/policy/jax_policy.py | 102 +++++++----------- rllib/policy/policy_template.py | 1 - rllib/policy/torch_policy.py | 8 +- .../utils/exploration/stochastic_sampling.py | 4 +- rllib/utils/framework.py | 7 +- rllib/utils/jax_ops.py | 3 +- 17 files changed, 121 insertions(+), 131 deletions(-) diff --git a/rllib/BUILD b/rllib/BUILD index c645c27a0aec..bf74635f5329 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -1089,6 +1089,13 @@ py_test( srcs = ["models/tests/test_distributions.py"] ) +py_test( + name = "test_jax_models", + tags = ["models"], + size = "small", + srcs = ["models/tests/test_jax_models.py"] +) + # -------------------------------------------------------------------- # Evaluation components # rllib/evaluation/ diff --git a/rllib/agents/ppo/ppo_jax_policy.py b/rllib/agents/ppo/ppo_jax_policy.py index 8f8f6f8ee767..c9eee64cfacc 100644 --- a/rllib/agents/ppo/ppo_jax_policy.py +++ b/rllib/agents/ppo/ppo_jax_policy.py @@ -3,7 +3,6 @@ """ import gym import logging -import numpy as np from typing import List, Type, Union import ray @@ -33,10 +32,11 @@ def ppo_surrogate_loss( - policy: Policy, model: ModelV2, + policy: Policy, + model: ModelV2, dist_class: Type[TorchDistributionWrapper], train_batch: SampleBatch, - vars=None,#TODO: test + vars=None, ) -> Union[TensorType, List[TensorType]]: """Constructs the loss for Proximal Policy Objective. 
@@ -93,13 +93,13 @@ def reduce_mean_valid(t): if policy.config["use_gae"]: prev_value_fn_out = train_batch[SampleBatch.VF_PREDS] value_fn_out = model.value_function() - vf_loss1 = jnp.square( - value_fn_out - train_batch[Postprocessing.VALUE_TARGETS]) + vf_loss1 = jnp.square(value_fn_out - + train_batch[Postprocessing.VALUE_TARGETS]) vf_clipped = prev_value_fn_out + jnp.clip( value_fn_out - prev_value_fn_out, -policy.config["vf_clip_param"], policy.config["vf_clip_param"]) - vf_loss2 = jnp.square( - vf_clipped - train_batch[Postprocessing.VALUE_TARGETS]) + vf_loss2 = jnp.square(vf_clipped - + train_batch[Postprocessing.VALUE_TARGETS]) vf_loss = jnp.maximum(vf_loss1, vf_loss2) mean_vf_loss = reduce_mean_valid(vf_loss) total_loss = reduce_mean_valid( @@ -117,8 +117,8 @@ def reduce_mean_valid(t): policy._mean_policy_loss = mean_policy_loss policy._mean_vf_loss = mean_vf_loss policy._vf_explained_var = explained_variance( - train_batch[Postprocessing.VALUE_TARGETS], - policy.model.value_function()) + train_batch[Postprocessing.VALUE_TARGETS], + policy.model.value_function()) policy._mean_entropy = mean_entropy policy._mean_kl = mean_kl diff --git a/rllib/agents/ppo/ppo_torch_policy.py b/rllib/agents/ppo/ppo_torch_policy.py index 96d924a3d5d9..1763435417f7 100644 --- a/rllib/agents/ppo/ppo_torch_policy.py +++ b/rllib/agents/ppo/ppo_torch_policy.py @@ -111,8 +111,8 @@ def reduce_mean_valid(t): policy._mean_policy_loss = mean_policy_loss policy._mean_vf_loss = mean_vf_loss policy._vf_explained_var = explained_variance( - train_batch[Postprocessing.VALUE_TARGETS], - policy.model.value_function()) + train_batch[Postprocessing.VALUE_TARGETS], + policy.model.value_function()) policy._mean_entropy = mean_entropy policy._mean_kl = mean_kl diff --git a/rllib/agents/ppo/tests/test_ppo.py b/rllib/agents/ppo/tests/test_ppo.py index a4e849f07186..1820a2b8fc9e 100644 --- a/rllib/agents/ppo/tests/test_ppo.py +++ b/rllib/agents/ppo/tests/test_ppo.py @@ -44,6 +44,12 @@ def _check_lr_torch(policy, policy_id): for p in opt.param_groups: assert p["lr"] == policy.cur_lr, "LR scheduling error!" + @staticmethod + def _check_lr_jax(policy, policy_id): + for j, opt in enumerate(policy._optimizers): + assert opt.optimizer_def.hyper_params.learning_rate == \ + policy.cur_lr, "LR scheduling error!" + @staticmethod def _check_lr_tf(policy, policy_id): lr = policy.cur_lr @@ -57,14 +63,16 @@ def _check_lr_tf(policy, policy_id): assert lr == optim_lr, "LR scheduling error!" 
def on_train_result(self, *, trainer, result: dict, **kwargs): - trainer.workers.foreach_policy(self._check_lr_torch if trainer.config[ - "framework"] == "torch" else self._check_lr_tf) + fw = trainer.config["framework"] + trainer.workers.foreach_policy( + self._check_lr_tf if fw.startswith("tf") else self._check_lr_torch + if fw == "torch" else self._check_lr_jax) class TestPPO(unittest.TestCase): @classmethod def setUpClass(cls): - ray.init(local_mode=True)#TODO + ray.init() @classmethod def tearDownClass(cls): @@ -84,10 +92,11 @@ def test_ppo_compilation_and_lr_schedule(self): config["train_batch_size"] = 128 num_iterations = 2 - for _ in framework_iterator(config, frameworks="jax"):#TODO - for env in ["CartPole-v0"]:#, "MsPacmanNoFrameskip-v4"]: + for _ in framework_iterator( + config, frameworks=("tf2", "tf", "torch", "jax")): + for env in ["CartPole-v0", "MsPacmanNoFrameskip-v4"]: print("Env={}".format(env)) - for lstm in [False]:#True, False]: + for lstm in [True, False]: print("LSTM={}".format(lstm)) config["model"]["use_lstm"] = lstm config["model"]["lstm_use_prev_action"] = lstm diff --git a/rllib/agents/slateq/slateq_torch_policy.py b/rllib/agents/slateq/slateq_torch_policy.py index d6bb0af67983..19638d65767a 100644 --- a/rllib/agents/slateq/slateq_torch_policy.py +++ b/rllib/agents/slateq/slateq_torch_policy.py @@ -406,7 +406,6 @@ def postprocess_fn_add_next_actions_for_sarsa(policy: Policy, SlateQTorchPolicy = build_policy_class( name="SlateQTorchPolicy", framework="torch", - get_default_config=lambda: ray.rllib.agents.slateq.slateq.DEFAULT_CONFIG, # build model, loss functions, and optimizers diff --git a/rllib/examples/custom_torch_policy.py b/rllib/examples/custom_torch_policy.py index 7e2937c11060..3c88e89f1646 100644 --- a/rllib/examples/custom_torch_policy.py +++ b/rllib/examples/custom_torch_policy.py @@ -21,9 +21,7 @@ def policy_gradient_loss(policy, model, dist_class, train_batch): # MyTorchPolicy = build_policy_class( - name="MyTorchPolicy", - framework="torch", - loss_fn=policy_gradient_loss) + name="MyTorchPolicy", framework="torch", loss_fn=policy_gradient_loss) # MyTrainer = build_trainer( diff --git a/rllib/models/jax/fcnet.py b/rllib/models/jax/fcnet.py index 8ddfab5e3627..a0f748283d2d 100644 --- a/rllib/models/jax/fcnet.py +++ b/rllib/models/jax/fcnet.py @@ -1,13 +1,10 @@ import logging import numpy as np -import time -from typing import Dict, List, Union from ray.rllib.models.jax.jax_modelv2 import JAXModelV2 from ray.rllib.models.jax.modules.fc_stack import FCStack from ray.rllib.utils.annotations import override from ray.rllib.utils.framework import try_import_jax -from ray.rllib.utils.typing import TensorType jax, flax = try_import_jax() jnp = None @@ -67,14 +64,16 @@ def __init__(self, obs_space, action_space, num_outputs, model_config, activation=None, prng_key=self.prng_key, ) - self._logits_params = self._logits.init(self.prng_key, jnp.zeros((1, prev_layer_size))) + self._logits_params = self._logits.init( + self.prng_key, jnp.zeros((1, prev_layer_size))) else: self.num_outputs = ( [int(np.product(obs_space.shape))] + hiddens[-1:])[-1] # Init hidden layers. 
in_ = jnp.zeros((1, in_features)) - self._hidden_layers_params = self._hidden_layers.init(self.prng_key, in_) + self._hidden_layers_params = self._hidden_layers.init( + self.prng_key, in_) self._value_branch_separate = None if not self.vf_share_layers: @@ -86,7 +85,8 @@ def __init__(self, obs_space, action_space, num_outputs, model_config, prng_key=self.prng_key, ) in_ = jnp.zeros((1, in_features)) - self._value_branch_separate_params = self._value_branch_separate.init(self.prng_key, in_) + self._value_branch_separate_params = \ + self._value_branch_separate.init(self.prng_key, in_) self._value_branch = FCStack( in_features=prev_layer_size, @@ -103,16 +103,20 @@ def __init__(self, obs_space, action_space, num_outputs, model_config, @override(JAXModelV2) def forward(self, input_dict, state, seq_lens): self._last_flat_in = input_dict["obs_flat"] - self._features = self._hidden_layers.apply(self._hidden_layers_params, self._last_flat_in) - logits = self._logits.apply(self._logits_params, self._features) if self._logits else \ - self._features + self._features = self._hidden_layers.apply(self._hidden_layers_params, + self._last_flat_in) + logits = self._logits.apply(self._logits_params, self._features) if \ + self._logits else self._features return logits, state @override(JAXModelV2) def value_function(self): assert self._features is not None, "must call forward() first" if self._value_branch_separate: - x = self._value_branch_separate.apply(self._value_branch_separate_params, self._last_flat_in) - return self._value_branch.apply(self._value_branch_params, x).squeeze(1) + x = self._value_branch_separate.apply( + self._value_branch_separate_params, self._last_flat_in) + return self._value_branch.apply(self._value_branch_params, + x).squeeze(1) else: - return self._value_branch.apply(self._value_branch_params, self._features).squeeze(1) + return self._value_branch.apply(self._value_branch_params, + self._features).squeeze(1) diff --git a/rllib/models/jax/jax_modelv2.py b/rllib/models/jax/jax_modelv2.py index 212ee69677f4..6a8828f76f87 100644 --- a/rllib/models/jax/jax_modelv2.py +++ b/rllib/models/jax/jax_modelv2.py @@ -8,7 +8,6 @@ from ray.rllib.utils.framework import try_import_jax from ray.rllib.utils.typing import ModelConfigDict, TensorType - jax, flax = try_import_jax() fd = None if flax: @@ -44,8 +43,11 @@ def __init__(self, obs_space: gym.spaces.Space, @override(ModelV2) def variables(self, as_dict: bool = False ) -> Union[List[TensorType], Dict[str, TensorType]]: - params = fd({k: v["params"]._dict for k, v in self.__dict__.items() if - isinstance(v, fd) and "params" in v})._dict + params = fd({ + k: v["params"]._dict + for k, v in self.__dict__.items() + if isinstance(v, fd) and "params" in v + })._dict if as_dict: return params return tree.flatten(params) diff --git a/rllib/models/jax/misc.py b/rllib/models/jax/misc.py index cd4b64eac57a..b69ac50cfad1 100644 --- a/rllib/models/jax/misc.py +++ b/rllib/models/jax/misc.py @@ -1,4 +1,3 @@ -import time from typing import Callable, Optional, Union from ray.rllib.models.utils import get_activation_fn, get_initializer @@ -8,7 +7,6 @@ nn = jnp = None if flax: import flax.linen as nn - import jax.numpy as jnp class SlimFC(nn.Module if nn else object): @@ -36,8 +34,10 @@ def setup(self): self.initializer = "xavier_uniform" # Activation function (if any; default=None (linear)). 
- self.initializer_fn = get_initializer(self.initializer, framework="jax") - self.activation_fn = get_activation_fn(self.activation, framework="jax") + self.initializer_fn = get_initializer( + self.initializer, framework="jax") + self.activation_fn = get_activation_fn( + self.activation, framework="jax") # Create the flax dense layer. self.dense = nn.Dense( diff --git a/rllib/models/jax/modules/fc_stack.py b/rllib/models/jax/modules/fc_stack.py index d31627d5927c..de858d67bda2 100644 --- a/rllib/models/jax/modules/fc_stack.py +++ b/rllib/models/jax/modules/fc_stack.py @@ -1,10 +1,9 @@ import logging -import numpy as np import time from typing import Callable, Optional, Union from ray.rllib.models.jax.misc import SlimFC -from ray.rllib.models.utils import get_activation_fn, get_initializer +from ray.rllib.models.utils import get_initializer from ray.rllib.utils.framework import try_import_jax jax, flax = try_import_jax() @@ -32,7 +31,8 @@ class FCStack(nn.Module if nn else object): activation: Optional[Union[Callable, str]] = None initializer: Optional[Union[Callable, str]] = None use_bias: bool = True - prng_key: Optional[jax.random.PRNGKey] = jax.random.PRNGKey(int(time.time())) + prng_key: Optional[jax.random.PRNGKey] = jax.random.PRNGKey( + int(time.time())) def setup(self): self.initializer = get_initializer(self.initializer, framework="jax") @@ -41,14 +41,14 @@ def setup(self): hidden_layers = [] prev_layer_size = self.in_features for i, size in enumerate(self.layers): - #setattr(self, "fc_{}".format(i), slim_fc) - hidden_layers.append(SlimFC( - in_size=prev_layer_size, - out_size=size, - use_bias=self.use_bias, - initializer=self.initializer, - activation=self.activation, - )) + hidden_layers.append( + SlimFC( + in_size=prev_layer_size, + out_size=size, + use_bias=self.use_bias, + initializer=self.initializer, + activation=self.activation, + )) prev_layer_size = size self.hidden_layers = hidden_layers diff --git a/rllib/models/tests/test_jax_models.py b/rllib/models/tests/test_jax_models.py index 478f8cf266cc..5376ca94c63f 100644 --- a/rllib/models/tests/test_jax_models.py +++ b/rllib/models/tests/test_jax_models.py @@ -6,7 +6,6 @@ from ray.rllib.models.jax.modules.fc_stack import FCStack from ray.rllib.utils.framework import try_import_jax - jax, flax = try_import_jax() jnp = None if jax: @@ -14,7 +13,6 @@ class TestJAXModels(unittest.TestCase): - def test_jax_slimfc(self): slimfc = SlimFC(5, 2) prng = jax.random.PRNGKey(0) @@ -29,7 +27,6 @@ def test_jax_fcstack(self): def test_jax_fcnet(self): """Tests the JAX FCNet class.""" - batch_size = 1000 obs_space = Box(-10.0, 10.0, shape=(4, )) action_space = Discrete(2) fc_net = FullyConnectedNetwork( @@ -40,8 +37,7 @@ def test_jax_fcnet(self): "fcnet_hiddens": [10], "fcnet_activation": "relu", }, - name="jax_model" - ) + name="jax_model") inputs = jnp.array([obs_space.sample()]) print(fc_net({"obs": inputs})) fc_net.variables() diff --git a/rllib/policy/jax_policy.py b/rllib/policy/jax_policy.py index 633a4c5095d2..60269d4e72d8 100644 --- a/rllib/policy/jax_policy.py +++ b/rllib/policy/jax_policy.py @@ -2,7 +2,6 @@ import gym import numpy as np import logging -import time from typing import Callable, Dict, List, Optional, Tuple, Type, Union from ray.rllib.models.modelv2 import ModelV2 @@ -15,7 +14,6 @@ from ray.rllib.utils.annotations import override, DeveloperAPI from ray.rllib.utils.framework import try_import_jax from ray.rllib.utils.jax_ops import convert_to_non_jax_type -from ray.rllib.utils.schedules import ConstantSchedule, 
PiecewiseSchedule from ray.rllib.utils.tracking_dict import UsageTrackingDict from ray.rllib.utils.typing import ModelGradients, ModelWeights, \ TensorType, TrainerConfigDict @@ -97,13 +95,7 @@ def __init__( """ self.framework = "jax" super().__init__(observation_space, action_space, config) - #if torch.cuda.is_available(): - # logger.info("TorchPolicy running on GPU.") - # self.device = torch.device("cuda") - #else: - # logger.info("TorchPolicy running on CPU.") - # self.device = torch.device("cpu") - self.model = model#.to(self.device) + self.model = model # Auto-update model's inference view requirements, if recurrent. self._update_model_inference_view_requirements_from_init_state() # Combine view_requirements for Model and Policy. @@ -127,6 +119,7 @@ def __init__( callable(get_batch_divisibility_req) else \ (get_batch_divisibility_req or 1) + @override(Policy) def compute_actions( self, obs_batch: Union[List[TensorType], TensorType], @@ -139,7 +132,22 @@ def compute_actions( timestep: Optional[int] = None, **kwargs) -> \ Tuple[TensorType, List[TensorType], Dict[str, TensorType]]: - raise NotImplementedError + + input_dict = self._lazy_tensor_dict({ + SampleBatch.CUR_OBS: np.asarray(obs_batch), + "is_training": False, + }) + if prev_action_batch is not None: + input_dict[SampleBatch.PREV_ACTIONS] = \ + np.asarray(prev_action_batch) + if prev_reward_batch is not None: + input_dict[SampleBatch.PREV_REWARDS] = \ + np.asarray(prev_reward_batch) + for i, s in enumerate(state_batches or []): + input_dict["state_in_".format(i)] = s + + return self.compute_actions_from_input_dict(input_dict, explore, + timestep, **kwargs) @override(Policy) def compute_actions_from_input_dict( @@ -268,8 +276,7 @@ def compute_log_likelihoods( # Default action-dist inputs calculation. else: dist_class = self.dist_class - dist_inputs, _ = self.model(input_dict, state_batches, - seq_lens) + dist_inputs, _ = self.model(input_dict, state_batches, seq_lens) action_dist = dist_class(dist_inputs, self.model) log_likelihoods = action_dist.logp(input_dict[SampleBatch.ACTIONS]) @@ -288,8 +295,9 @@ def learn_on_batch( grads, fetches = self.compute_gradients(postprocessed_batch) # Step the optimizer(s). - for i, opt in enumerate(self._optimizers): - opt.apply_gradients(grads) + for i in range(len(self._optimizers)): + opt = self._optimizers[i] + self._optimizers[i] = opt.apply_gradient(grads[i]) if self.model: fetches["model"] = self.model.metrics() @@ -311,19 +319,12 @@ def compute_gradients(self, # Calculate the actual policy loss. all_grads = force_list( - self._gradient_loss(self, self.model, self.dist_class, train_batch, self.model.variables())) - - # Call Model's custom-loss with Policy loss outputs and train_batch. - #if self.model: - # loss_out = self.model.custom_loss(loss_out, train_batch) - - # Give Exploration component that chance to modify the loss (or add - # its own terms). - #if hasattr(self, "exploration"): - # loss_out = self.exploration.get_exploration_loss( - # loss_out, train_batch) - - #assert len(loss_out) == len(self._optimizers) + self._gradient_loss( + self, + self.model, + self.dist_class, + train_batch, + self.model.variables(as_dict=True))) # assert not any(torch.isnan(l) for l in loss_out) fetches = self.extra_compute_grad_fetches() @@ -331,25 +332,6 @@ def compute_gradients(self, # Loop through all optimizers. grad_info = {"allreduce_latency": 0.0} - #all_grads = [] - #for i, opt in enumerate(self._optimizers): - # # Erase gradients in all vars of this optimizer. 
- # opt.zero_grad() - # # Recompute gradients of loss over all variables. - # loss_out[i].backward(retain_graph=(i < len(self._optimizers) - 1)) - # grad_info.update(self.extra_grad_process(opt, loss_out[i])) - - # grads = [] - # # Note that return values are just references; - # # Calling zero_grad would modify the values. - # for param_group in opt.param_groups: - # for p in param_group["params"]: - # if p.grad is not None: - # grads.append(p.grad) - # all_grads.append(p.grad.data.cpu().numpy()) - # else: - # all_grads.append(None) - grad_info["allreduce_latency"] /= len(self._optimizers) grad_info.update(self.extra_grad_info(train_batch)) @@ -360,11 +342,9 @@ def compute_gradients(self, def apply_gradients(self, gradients: ModelGradients) -> None: # TODO(sven): Not supported for multiple optimizers yet. assert len(self._optimizers) == 1 - for g, p in zip(gradients, self.model.parameters()): - if g is not None: - p.grad = torch.from_numpy(g).to(self.device) - self._optimizers[0].step() + # Step the optimizer(s). + self._optimizers[0] = self._optimizers[0].apply_gradient(gradients) @override(Policy) @DeveloperAPI @@ -416,15 +396,13 @@ def set_state(self, state: object) -> None: optimizer_vars = state.pop("_optimizer_variables", None) if optimizer_vars: assert len(optimizer_vars) == len(self._optimizers) - for o, s in zip(self._optimizers, optimizer_vars): - optim_state_dict = convert_to_torch_tensor( - s, device=self.device) - o.load_state_dict(optim_state_dict) + for i, (o, s) in enumerate(zip(self._optimizers, optimizer_vars)): + self._optimizers[i].optimizer_def.state = s # Then the Policy's (NN) weights. super().set_state(state) @DeveloperAPI - def extra_grad_process(self, optimizer: "torch.optim.Optimizer", + def extra_grad_process(self, optimizer: "jax.optim.Optimizer", loss: TensorType): """Called after each optimizer.zero_grad() + loss.backward() call. @@ -498,10 +476,12 @@ def optimizer( The local FLAX optimizer(s) to use for this Policy. 
""" if hasattr(self, "config"): - return flax.optim.Optimizer(flax.optim.Adam( - learning_rate=self.config["lr"])) + adam = flax.optim.Adam(learning_rate=self.config["lr"]) else: - return flax.optim.Optimizer(flax.optim.Adam()) + adam = flax.optim.Adam() + weights = self.get_weights() + adam_state = adam.init_state(weights) + return flax.optim.Optimizer(adam, adam_state, target=weights) @override(Policy) @DeveloperAPI @@ -526,9 +506,3 @@ def import_model_from_h5(self, import_file: str) -> None: def _lazy_tensor_dict(self, data): tensor_dict = UsageTrackingDict(data) return tensor_dict - - #def _lazy_numpy_dict(self, postprocessed_batch): - # train_batch = UsageTrackingDict(postprocessed_batch) - # train_batch.set_get_interceptor( - # functools.partial(convert_to_non_jax_type)) - # return train_batch diff --git a/rllib/policy/policy_template.py b/rllib/policy/policy_template.py index 38f278b2ae97..6f7b48cdb29c 100644 --- a/rllib/policy/policy_template.py +++ b/rllib/policy/policy_template.py @@ -398,7 +398,6 @@ def with_updates(**overrides): """ return build_policy_class(**dict(original_kwargs, **overrides)) - policy_cls.with_updates = staticmethod(with_updates) policy_cls.__name__ = name policy_cls.__qualname__ = name diff --git a/rllib/policy/torch_policy.py b/rllib/policy/torch_policy.py index 28618618b4bb..95c9b5c4a2c0 100644 --- a/rllib/policy/torch_policy.py +++ b/rllib/policy/torch_policy.py @@ -619,9 +619,11 @@ def __init__(self, lr, lr_schedule): def on_global_var_update(self, global_vars): super().on_global_var_update(global_vars) self.cur_lr = self.lr_schedule.value(global_vars["timestep"]) - for opt in self._optimizers: - for p in opt.param_groups: - p["lr"] = self.cur_lr + for i in range(len(self._optimizers)): + opt = self._optimizers[i] + new_hyperparams = opt.optimizer_def.update_hyper_params( + learning_rate=self.cur_lr) + opt.optimizer_def.hyper_params = new_hyperparams @DeveloperAPI diff --git a/rllib/utils/exploration/stochastic_sampling.py b/rllib/utils/exploration/stochastic_sampling.py index c3f735231ed8..7875c8c2f62f 100644 --- a/rllib/utils/exploration/stochastic_sampling.py +++ b/rllib/utils/exploration/stochastic_sampling.py @@ -67,8 +67,8 @@ def get_exploration_action(self, timestep: Union[int, TensorType], explore: bool = True): if self.framework in ["torch", "jax"]: - return self._get_exploration_action(action_distribution, - timestep, explore) + return self._get_exploration_action(action_distribution, timestep, + explore) else: return self._get_tf_exploration_action_op(action_distribution, timestep, explore) diff --git a/rllib/utils/framework.py b/rllib/utils/framework.py index 362b7400cfed..ca1dece44bff 100644 --- a/rllib/utils/framework.py +++ b/rllib/utils/framework.py @@ -268,9 +268,10 @@ def get_activation_fn(name: Optional[str] = None, framework: str = "tf"): Raises: ValueError: If name is an unknown activation function. 
""" - deprecation_warning("rllib/utils/framework.py::get_activation_fn", - "rllib/models/utils.py::get_activation_fn", - error=False) + deprecation_warning( + "rllib/utils/framework.py::get_activation_fn", + "rllib/models/utils.py::get_activation_fn", + error=False) if framework == "torch": if name in ["linear", None]: return None diff --git a/rllib/utils/jax_ops.py b/rllib/utils/jax_ops.py index d775d520f5b4..f37589956c4d 100644 --- a/rllib/utils/jax_ops.py +++ b/rllib/utils/jax_ops.py @@ -49,8 +49,7 @@ def sequence_mask(lengths, maxlen=None, dtype=None, time_major=False): if maxlen is None: maxlen = int(lengths.max()) - mask = ~(jnp.ones( - (len(lengths), maxlen)).cumsum(axis=1).t() > lengths) + mask = ~(jnp.ones((len(lengths), maxlen)).cumsum(axis=1).t() > lengths) if not time_major: mask = mask.t() mask.type(dtype or jnp.bool_) From 46f483a320a32aa720ccc40d561f7fc62735327a Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 24 Dec 2020 14:58:52 -0500 Subject: [PATCH 10/16] Fix and LINT. --- rllib/agents/ppo/ppo_jax_policy.py | 4 ++-- rllib/agents/ppo/tests/test_ppo.py | 12 +++++++++--- rllib/policy/jax_policy.py | 27 +++++++++++++++++++++++++++ rllib/policy/torch_policy.py | 10 ++++------ 4 files changed, 42 insertions(+), 11 deletions(-) diff --git a/rllib/agents/ppo/ppo_jax_policy.py b/rllib/agents/ppo/ppo_jax_policy.py index c9eee64cfacc..d30b95b8dfe7 100644 --- a/rllib/agents/ppo/ppo_jax_policy.py +++ b/rllib/agents/ppo/ppo_jax_policy.py @@ -14,11 +14,11 @@ from ray.rllib.evaluation.postprocessing import Postprocessing from ray.rllib.models.modelv2 import ModelV2 from ray.rllib.models.torch.torch_action_dist import TorchDistributionWrapper +from ray.rllib.policy.jax_policy import LearningRateSchedule from ray.rllib.policy.policy import Policy from ray.rllib.policy.policy_template import build_policy_class from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.torch_policy import EntropyCoeffSchedule, \ - LearningRateSchedule +from ray.rllib.policy.torch_policy import EntropyCoeffSchedule from ray.rllib.utils.framework import try_import_jax from ray.rllib.utils.jax_ops import explained_variance, sequence_mask from ray.rllib.utils.typing import TensorType, TrainerConfigDict diff --git a/rllib/agents/ppo/tests/test_ppo.py b/rllib/agents/ppo/tests/test_ppo.py index 1820a2b8fc9e..8c51a3b87b1c 100644 --- a/rllib/agents/ppo/tests/test_ppo.py +++ b/rllib/agents/ppo/tests/test_ppo.py @@ -92,11 +92,17 @@ def test_ppo_compilation_and_lr_schedule(self): config["train_batch_size"] = 128 num_iterations = 2 - for _ in framework_iterator( + for fw in framework_iterator( config, frameworks=("tf2", "tf", "torch", "jax")): - for env in ["CartPole-v0", "MsPacmanNoFrameskip-v4"]: + envs = ["CartPole-v0"] + if fw != "jax": + envs.append("MsPacmanNoFrameskip-v4") + for env in envs: print("Env={}".format(env)) - for lstm in [True, False]: + lstms = [False] + if fw != "jax": + lstms.append(True) + for lstm in lstms: print("LSTM={}".format(lstm)) config["model"]["use_lstm"] = lstm config["model"]["lstm_use_prev_action"] = lstm diff --git a/rllib/policy/jax_policy.py b/rllib/policy/jax_policy.py index 60269d4e72d8..aea6a54ebbbd 100644 --- a/rllib/policy/jax_policy.py +++ b/rllib/policy/jax_policy.py @@ -14,6 +14,7 @@ from ray.rllib.utils.annotations import override, DeveloperAPI from ray.rllib.utils.framework import try_import_jax from ray.rllib.utils.jax_ops import convert_to_non_jax_type +from ray.rllib.utils.schedules import ConstantSchedule, PiecewiseSchedule from 
ray.rllib.utils.tracking_dict import UsageTrackingDict from ray.rllib.utils.typing import ModelGradients, ModelWeights, \ TensorType, TrainerConfigDict @@ -506,3 +507,29 @@ def import_model_from_h5(self, import_file: str) -> None: def _lazy_tensor_dict(self, data): tensor_dict = UsageTrackingDict(data) return tensor_dict + + +# TODO: (sven) Unify hyperparam annealing procedures across RLlib (tf/torch) +# and for all possible hyperparams, not just lr. +@DeveloperAPI +class LearningRateSchedule: + """Mixin for TorchPolicy that adds a learning rate schedule.""" + + @DeveloperAPI + def __init__(self, lr, lr_schedule): + self.cur_lr = lr + if lr_schedule is None: + self.lr_schedule = ConstantSchedule(lr, framework=None) + else: + self.lr_schedule = PiecewiseSchedule( + lr_schedule, outside_value=lr_schedule[-1][-1], framework=None) + + @override(Policy) + def on_global_var_update(self, global_vars): + super().on_global_var_update(global_vars) + self.cur_lr = self.lr_schedule.value(global_vars["timestep"]) + for i in range(len(self._optimizers)): + opt = self._optimizers[i] + new_hyperparams = opt.optimizer_def.update_hyper_params( + learning_rate=self.cur_lr) + opt.optimizer_def.hyper_params = new_hyperparams diff --git a/rllib/policy/torch_policy.py b/rllib/policy/torch_policy.py index dadc90d9b340..10e875d50adb 100644 --- a/rllib/policy/torch_policy.py +++ b/rllib/policy/torch_policy.py @@ -617,7 +617,7 @@ def _lazy_numpy_dict(self, postprocessed_batch): # and for all possible hyperparams, not just lr. @DeveloperAPI class LearningRateSchedule: - """Mixin for TorchPolicy that adds a learning rate schedule.""" + """Mixin for TFPolicy that adds a learning rate schedule.""" @DeveloperAPI def __init__(self, lr, lr_schedule): @@ -632,11 +632,9 @@ def __init__(self, lr, lr_schedule): def on_global_var_update(self, global_vars): super().on_global_var_update(global_vars) self.cur_lr = self.lr_schedule.value(global_vars["timestep"]) - for i in range(len(self._optimizers)): - opt = self._optimizers[i] - new_hyperparams = opt.optimizer_def.update_hyper_params( - learning_rate=self.cur_lr) - opt.optimizer_def.hyper_params = new_hyperparams + for opt in self._optimizers: + for p in opt.param_groups: + p["lr"] = self.cur_lr @DeveloperAPI From b254335ac8af8e3c5b083829092bf3262c9f1717 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sun, 27 Dec 2020 09:46:39 -0500 Subject: [PATCH 11/16] wip --- rllib/agents/ppo/tests/test_ppo.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/rllib/agents/ppo/tests/test_ppo.py b/rllib/agents/ppo/tests/test_ppo.py index 8c51a3b87b1c..c552c34d1b90 100644 --- a/rllib/agents/ppo/tests/test_ppo.py +++ b/rllib/agents/ppo/tests/test_ppo.py @@ -72,7 +72,7 @@ def on_train_result(self, *, trainer, result: dict, **kwargs): class TestPPO(unittest.TestCase): @classmethod def setUpClass(cls): - ray.init() + ray.init(local_mode=True)#TODO @classmethod def tearDownClass(cls): @@ -90,10 +90,10 @@ def test_ppo_compilation_and_lr_schedule(self): config["model"]["lstm_cell_size"] = 10 config["model"]["max_seq_len"] = 20 config["train_batch_size"] = 128 - num_iterations = 2 + num_iterations = 12#TODO: = 2 for fw in framework_iterator( - config, frameworks=("tf2", "tf", "torch", "jax")): + config, frameworks=("jax", "tf2", "tf", "torch")): envs = ["CartPole-v0"] if fw != "jax": envs.append("MsPacmanNoFrameskip-v4") @@ -109,7 +109,7 @@ def test_ppo_compilation_and_lr_schedule(self): config["model"]["lstm_use_prev_reward"] = lstm trainer = ppo.PPOTrainer(config=config, 
env=env) for i in range(num_iterations): - trainer.train() + print(trainer.train())#TODO: no print check_compute_single_action( trainer, include_prev_action_reward=True, From 58ef15dd54032788a1965f28f44708a41776f3c4 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sun, 27 Dec 2020 10:11:20 -0500 Subject: [PATCH 12/16] wip --- rllib/agents/ppo/tests/test_ppo.py | 6 +++--- rllib/models/jax/fcnet.py | 16 ++++++++++------ 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/rllib/agents/ppo/tests/test_ppo.py b/rllib/agents/ppo/tests/test_ppo.py index c552c34d1b90..83ba13e880ce 100644 --- a/rllib/agents/ppo/tests/test_ppo.py +++ b/rllib/agents/ppo/tests/test_ppo.py @@ -72,7 +72,7 @@ def on_train_result(self, *, trainer, result: dict, **kwargs): class TestPPO(unittest.TestCase): @classmethod def setUpClass(cls): - ray.init(local_mode=True)#TODO + ray.init(local_mode=True) #TODO @classmethod def tearDownClass(cls): @@ -90,7 +90,7 @@ def test_ppo_compilation_and_lr_schedule(self): config["model"]["lstm_cell_size"] = 10 config["model"]["max_seq_len"] = 20 config["train_batch_size"] = 128 - num_iterations = 12#TODO: = 2 + num_iterations = 12 #TODO: = 2 for fw in framework_iterator( config, frameworks=("jax", "tf2", "tf", "torch")): @@ -109,7 +109,7 @@ def test_ppo_compilation_and_lr_schedule(self): config["model"]["lstm_use_prev_reward"] = lstm trainer = ppo.PPOTrainer(config=config, env=env) for i in range(num_iterations): - print(trainer.train())#TODO: no print + print(trainer.train()) #TODO: no print check_compute_single_action( trainer, include_prev_action_reward=True, diff --git a/rllib/models/jax/fcnet.py b/rllib/models/jax/fcnet.py index a0f748283d2d..51bf662c0c1f 100644 --- a/rllib/models/jax/fcnet.py +++ b/rllib/models/jax/fcnet.py @@ -34,6 +34,7 @@ def __init__(self, obs_space, action_space, num_outputs, model_config, self._logits = None self._logits_params = None + self._hidden_layers = None # The last layer is adjusted to be of size num_outputs, but it's a # layer with activation. @@ -71,9 +72,11 @@ def __init__(self, obs_space, action_space, num_outputs, model_config, [int(np.product(obs_space.shape))] + hiddens[-1:])[-1] # Init hidden layers. - in_ = jnp.zeros((1, in_features)) - self._hidden_layers_params = self._hidden_layers.init( - self.prng_key, in_) + self._hidden_layers_params = None + if self._hidden_layers: + in_ = jnp.zeros((1, in_features)) + self._hidden_layers_params = self._hidden_layers.init( + self.prng_key, in_) self._value_branch_separate = None if not self.vf_share_layers: @@ -102,9 +105,10 @@ def __init__(self, obs_space, action_space, num_outputs, model_config, @override(JAXModelV2) def forward(self, input_dict, state, seq_lens): - self._last_flat_in = input_dict["obs_flat"] - self._features = self._hidden_layers.apply(self._hidden_layers_params, - self._last_flat_in) + self._last_flat_in = self._features = input_dict["obs_flat"] + if self._hidden_layers: + self._features = self._hidden_layers.apply( + self._hidden_layers_params, self._last_flat_in) logits = self._logits.apply(self._logits_params, self._features) if \ self._logits else self._features return logits, state From 683c1fc4ffcdf444caaba2e64fdc5d0457d4498a Mon Sep 17 00:00:00 2001 From: sven1977 Date: Mon, 28 Dec 2020 08:52:27 -0500 Subject: [PATCH 13/16] Fixes and LINT. 
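Besides reverting the debugging leftovers in test_ppo.py, this commit simplifies the JAX PPO loss: the RNN masking branch is removed (the JAX framework is not exercised with LSTMs in the tests yet), and because the loss now also runs under jax.grad with the model variables passed in through vars, the stats it stores on the policy come back as traced values, whose concrete numbers are recovered via .primal. An alternative way to surface concrete stats from a differentiated loss, shown purely as an illustration and not what this patch does, is jax.value_and_grad with has_aux=True:

    import jax
    import jax.numpy as jnp

    def loss_fn(params, batch):
        err = jnp.dot(batch["obs"], params["w"]) - batch["target"]
        loss = jnp.mean(err ** 2)
        stats = {"mean_abs_err": jnp.mean(jnp.abs(err))}
        return loss, stats

    grad_fn = jax.value_and_grad(loss_fn, has_aux=True)
    params = {"w": jnp.zeros((4, 1))}
    batch = {"obs": jnp.ones((2, 4)), "target": jnp.ones((2, 1))}
    # `stats` comes back as concrete arrays, `grads` mirrors `params`.
    (loss, stats), grads = grad_fn(params, batch)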
--- rllib/BUILD | 10 ++++++ rllib/agents/ppo/ppo_jax_policy.py | 52 ++++++++++++----------------- rllib/agents/ppo/tests/test_ppo.py | 6 ++-- rllib/models/jax/jax_modelv2.py | 6 ++-- rllib/policy/jax_policy.py | 17 +++++----- rllib/tests/run_regression_tests.py | 19 ++++++++--- rllib/utils/deprecation.py | 4 +-- 7 files changed, 63 insertions(+), 51 deletions(-) diff --git a/rllib/BUILD b/rllib/BUILD index 7fb98b7c9641..533bf2a7dac2 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -270,6 +270,16 @@ py_test( ) # PPO +py_test( + name = "run_regression_tests_cartpole_ppo_jax", + main = "tests/run_regression_tests.py", + tags = ["learning_tests_torch", "learning_tests_cartpole"], + size = "medium", + srcs = ["tests/run_regression_tests.py"], + data = ["tuned_examples/ppo/cartpole-ppo.yaml"], + args = ["--yaml-dir=tuned_examples/ppo", "--framework=jax"] +) + py_test( name = "run_regression_tests_cartpole_ppo_tf", main = "tests/run_regression_tests.py", diff --git a/rllib/agents/ppo/ppo_jax_policy.py b/rllib/agents/ppo/ppo_jax_policy.py index d30b95b8dfe7..c97c5b59d3dd 100644 --- a/rllib/agents/ppo/ppo_jax_policy.py +++ b/rllib/agents/ppo/ppo_jax_policy.py @@ -20,7 +20,7 @@ from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.policy.torch_policy import EntropyCoeffSchedule from ray.rllib.utils.framework import try_import_jax -from ray.rllib.utils.jax_ops import explained_variance, sequence_mask +from ray.rllib.utils.jax_ops import explained_variance from ray.rllib.utils.typing import TensorType, TrainerConfigDict jax, flax = try_import_jax() @@ -50,27 +50,13 @@ def ppo_surrogate_loss( Union[TensorType, List[TensorType]]: A single loss tensor or a list of loss tensors. """ + if vars: + for k, v in vars.items(): + setattr(model, k, v) + logits, state = model.from_batch(train_batch, is_training=True) curr_action_dist = dist_class(logits, model) - # RNN case: Mask away 0-padded chunks at end of time axis. - if state: - max_seq_len = jnp.maximum(train_batch["seq_lens"]) - mask = sequence_mask( - train_batch["seq_lens"], - max_seq_len, - time_major=model.is_time_major()) - mask = jnp.reshape(mask, [-1]) - num_valid = jnp.sum(mask) - - def reduce_mean_valid(t): - return jnp.sum(t[mask]) / num_valid - - # non-RNN case: No masking. 
- else: - mask = None - reduce_mean_valid = jnp.mean - prev_action_dist = dist_class(train_batch[SampleBatch.ACTION_DIST_INPUTS], model) @@ -78,17 +64,17 @@ def reduce_mean_valid(t): curr_action_dist.logp(train_batch[SampleBatch.ACTIONS]) - train_batch[SampleBatch.ACTION_LOGP]) action_kl = prev_action_dist.kl(curr_action_dist) - mean_kl = reduce_mean_valid(action_kl) + mean_kl = jnp.mean(action_kl) curr_entropy = curr_action_dist.entropy() - mean_entropy = reduce_mean_valid(curr_entropy) + mean_entropy = jnp.mean(curr_entropy) surrogate_loss = jnp.minimum( train_batch[Postprocessing.ADVANTAGES] * logp_ratio, train_batch[Postprocessing.ADVANTAGES] * jnp.clip( logp_ratio, 1 - policy.config["clip_param"], 1 + policy.config["clip_param"])) - mean_policy_loss = reduce_mean_valid(-surrogate_loss) + mean_policy_loss = jnp.mean(-surrogate_loss) if policy.config["use_gae"]: prev_value_fn_out = train_batch[SampleBatch.VF_PREDS] @@ -101,16 +87,14 @@ def reduce_mean_valid(t): vf_loss2 = jnp.square(vf_clipped - train_batch[Postprocessing.VALUE_TARGETS]) vf_loss = jnp.maximum(vf_loss1, vf_loss2) - mean_vf_loss = reduce_mean_valid(vf_loss) - total_loss = reduce_mean_valid( - -surrogate_loss + policy.kl_coeff * action_kl + - policy.config["vf_loss_coeff"] * vf_loss - - policy.entropy_coeff * curr_entropy) + mean_vf_loss = jnp.mean(vf_loss) + total_loss = jnp.mean(-surrogate_loss + policy.kl_coeff * action_kl + + policy.config["vf_loss_coeff"] * vf_loss - + policy.entropy_coeff * curr_entropy) else: mean_vf_loss = 0.0 - total_loss = reduce_mean_valid(-surrogate_loss + - policy.kl_coeff * action_kl - - policy.entropy_coeff * curr_entropy) + total_loss = jnp.mean(-surrogate_loss + policy.kl_coeff * action_kl - + policy.entropy_coeff * curr_entropy) # Store stats in policy for stats_fn. 
policy._total_loss = total_loss @@ -122,6 +106,14 @@ def reduce_mean_valid(t): policy._mean_entropy = mean_entropy policy._mean_kl = mean_kl + if vars: + policy._total_loss = policy._total_loss.primal + policy._mean_policy_loss = policy._mean_policy_loss.primal + policy._mean_vf_loss = policy._mean_vf_loss.primal + policy._vf_explained_var = policy._vf_explained_var.primal + policy._mean_entropy = policy._mean_entropy.primal + policy._mean_kl = policy._mean_kl.primal + return total_loss diff --git a/rllib/agents/ppo/tests/test_ppo.py b/rllib/agents/ppo/tests/test_ppo.py index 83ba13e880ce..62afb6337289 100644 --- a/rllib/agents/ppo/tests/test_ppo.py +++ b/rllib/agents/ppo/tests/test_ppo.py @@ -72,7 +72,7 @@ def on_train_result(self, *, trainer, result: dict, **kwargs): class TestPPO(unittest.TestCase): @classmethod def setUpClass(cls): - ray.init(local_mode=True) #TODO + ray.init() @classmethod def tearDownClass(cls): @@ -90,7 +90,7 @@ def test_ppo_compilation_and_lr_schedule(self): config["model"]["lstm_cell_size"] = 10 config["model"]["max_seq_len"] = 20 config["train_batch_size"] = 128 - num_iterations = 12 #TODO: = 2 + num_iterations = 2 for fw in framework_iterator( config, frameworks=("jax", "tf2", "tf", "torch")): @@ -109,7 +109,7 @@ def test_ppo_compilation_and_lr_schedule(self): config["model"]["lstm_use_prev_reward"] = lstm trainer = ppo.PPOTrainer(config=config, env=env) for i in range(num_iterations): - print(trainer.train()) #TODO: no print + trainer.train() check_compute_single_action( trainer, include_prev_action_reward=True, diff --git a/rllib/models/jax/jax_modelv2.py b/rllib/models/jax/jax_modelv2.py index 6a8828f76f87..e081a9b55b36 100644 --- a/rllib/models/jax/jax_modelv2.py +++ b/rllib/models/jax/jax_modelv2.py @@ -43,11 +43,11 @@ def __init__(self, obs_space: gym.spaces.Space, @override(ModelV2) def variables(self, as_dict: bool = False ) -> Union[List[TensorType], Dict[str, TensorType]]: - params = fd({ - k: v["params"]._dict + params = { + k: v for k, v in self.__dict__.items() if isinstance(v, fd) and "params" in v - })._dict + } if as_dict: return params return tree.flatten(params) diff --git a/rllib/policy/jax_policy.py b/rllib/policy/jax_policy.py index aea6a54ebbbd..195b6c74ae03 100644 --- a/rllib/policy/jax_policy.py +++ b/rllib/policy/jax_policy.py @@ -21,10 +21,8 @@ jax, flax = try_import_jax() jnp = None -fd = None if jax: import jax.numpy as jnp - from flax.core.frozen_dict import FrozenDict as fd logger = logging.getLogger(__name__) @@ -300,6 +298,10 @@ def learn_on_batch( opt = self._optimizers[i] self._optimizers[i] = opt.apply_gradient(grads[i]) + # Update model's params. + for k, v in self._optimizers[0].target.items(): + setattr(self.model, k, v) + if self.model: fetches["model"] = self.model.metrics() return fetches @@ -317,15 +319,12 @@ def compute_gradients(self, ) train_batch = self._lazy_tensor_dict(postprocessed_batch) + model_params = self.model.variables(as_dict=True) # Calculate the actual policy loss. 
all_grads = force_list( - self._gradient_loss( - self, - self.model, - self.dist_class, - train_batch, - self.model.variables(as_dict=True))) + self._gradient_loss(self, self.model, self.dist_class, train_batch, + model_params)) # assert not any(torch.isnan(l) for l in loss_out) fetches = self.extra_compute_grad_fetches() @@ -360,7 +359,7 @@ def get_weights(self) -> ModelWeights: @DeveloperAPI def set_weights(self, weights: ModelWeights) -> None: for k, v in weights.items(): - setattr(self.model, k, fd({"params": v})) + setattr(self.model, k, v) @override(Policy) @DeveloperAPI diff --git a/rllib/tests/run_regression_tests.py b/rllib/tests/run_regression_tests.py index 9a2c3313779b..3f42147e4071 100644 --- a/rllib/tests/run_regression_tests.py +++ b/rllib/tests/run_regression_tests.py @@ -25,17 +25,25 @@ import ray from ray.tune import run_experiments from ray.rllib import _register_all +from ray.rllib.utils.deprecation import deprecation_warning parser = argparse.ArgumentParser() parser.add_argument( - "--torch", - action="store_true", - help="Runs all tests with PyTorch enabled.") + "--framework", + choices=["jax", "tf2", "tf", "tfe", "torch"], + default="tf", + help="The deep learning framework to use.") parser.add_argument( "--yaml-dir", type=str, help="The directory in which to find all yamls to test.") +# Obsoleted arg, use --framework=torch instead. +parser.add_argument( + "--torch", + action="store_true", + help="Runs all tests with PyTorch enabled.") + if __name__ == "__main__": args = parser.parse_args() @@ -69,8 +77,11 @@ # Add torch option to exp configs. for exp in experiments.values(): + exp["config"]["framework"] = args.framework if args.torch: + deprecation_warning(old="--torch", new="--framework=torch") exp["config"]["framework"] = "torch" + args.framework = "torch" # Print out the actual config. print("== Test config ==") @@ -82,7 +93,7 @@ for i in range(3): try: ray.init(num_cpus=5) - trials = run_experiments(experiments, resume=False, verbose=1) + trials = run_experiments(experiments, resume=False, verbose=2) finally: ray.shutdown() _register_all() diff --git a/rllib/utils/deprecation.py b/rllib/utils/deprecation.py index 8f3828b6a15b..05788059bed1 100644 --- a/rllib/utils/deprecation.py +++ b/rllib/utils/deprecation.py @@ -15,8 +15,8 @@ def deprecation_warning(old, new=None, error=None): Args: old (str): A description of the "thing" that is to be deprecated. new (Optional[str]): A description of the new "thing" that replaces it. - error (Optional[bool,Exception]): Whether or which exception to throw. - If True, throw ValueError. + error (Optional[Union[bool,Exception]]): Whether or which exception to + throw. If True, throw ValueError. """ msg = "`{}` has been deprecated.{}".format( old, (" Use `{}` instead.".format(new) if new else "")) From 01072ed60f2ef667b128abebd54859fd2bb98fa9 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 31 Dec 2020 11:20:46 -0500 Subject: [PATCH 14/16] WIP. 
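Editor's note: the JAXPolicy.learn_on_batch path reworked in patch 13 above applies each returned gradient with the flax optimizer's apply_gradient() and then copies optimizer.target back onto the model. A hedged sketch of that flow follows, assuming the legacy flax.optim API (newer Flax versions use Optax instead); the names here are illustrative, not the PR's code.

import jax
import jax.numpy as jnp
from flax import optim  # assumes the legacy flax.optim API

def loss_fn(params, batch):
    preds = batch["x"] @ params["w"]
    return jnp.mean((preds - batch["y"]) ** 2)

params = {"w": jnp.zeros((3, 1))}
optimizer = optim.Adam(learning_rate=1e-3).create(params)

batch = {"x": jnp.ones((4, 3)), "y": jnp.ones((4, 1))}
grads = jax.grad(loss_fn)(optimizer.target, batch)
# apply_gradient() is functional: it returns a new optimizer whose .target
# holds the updated parameters, which the policy then copies onto the model.
optimizer = optimizer.apply_gradient(grads)
updated_params = optimizer.target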
--- rllib/agents/ppo/tests/test_ppo.py | 2 +- rllib/policy/jax_policy.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/rllib/agents/ppo/tests/test_ppo.py b/rllib/agents/ppo/tests/test_ppo.py index 62afb6337289..9902e4e94d25 100644 --- a/rllib/agents/ppo/tests/test_ppo.py +++ b/rllib/agents/ppo/tests/test_ppo.py @@ -109,7 +109,7 @@ def test_ppo_compilation_and_lr_schedule(self): config["model"]["lstm_use_prev_reward"] = lstm trainer = ppo.PPOTrainer(config=config, env=env) for i in range(num_iterations): - trainer.train() + print(trainer.train()) check_compute_single_action( trainer, include_prev_action_reward=True, diff --git a/rllib/policy/jax_policy.py b/rllib/policy/jax_policy.py index 195b6c74ae03..251f1a0a5771 100644 --- a/rllib/policy/jax_policy.py +++ b/rllib/policy/jax_policy.py @@ -96,9 +96,9 @@ def __init__( super().__init__(observation_space, action_space, config) self.model = model # Auto-update model's inference view requirements, if recurrent. - self._update_model_inference_view_requirements_from_init_state() + self._update_model_view_requirements_from_init_state() # Combine view_requirements for Model and Policy. - self.view_requirements.update(self.model.inference_view_requirements) + self.view_requirements.update(self.model.view_requirements) self.exploration = self._create_exploration() self.unwrapped_model = model # used to support DistributedDataParallel From 2528f4720e2ec0bfbc35e10f9e35053e1107815e Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 1 Jan 2021 19:46:06 -0500 Subject: [PATCH 15/16] wip. --- rllib/agents/ppo/tests/test_ppo.py | 14 ++--- rllib/models/jax/fcnet.py | 46 ++++++++++----- rllib/policy/jax_policy.py | 92 +++++++++++++++-------------- rllib/policy/torch_policy.py | 93 ++++++++++++++++-------------- 4 files changed, 138 insertions(+), 107 deletions(-) diff --git a/rllib/agents/ppo/tests/test_ppo.py b/rllib/agents/ppo/tests/test_ppo.py index 9902e4e94d25..998096202dd1 100644 --- a/rllib/agents/ppo/tests/test_ppo.py +++ b/rllib/agents/ppo/tests/test_ppo.py @@ -72,7 +72,7 @@ def on_train_result(self, *, trainer, result: dict, **kwargs): class TestPPO(unittest.TestCase): @classmethod def setUpClass(cls): - ray.init() + ray.init(local_mode=True) @classmethod def tearDownClass(cls): @@ -90,18 +90,18 @@ def test_ppo_compilation_and_lr_schedule(self): config["model"]["lstm_cell_size"] = 10 config["model"]["max_seq_len"] = 20 config["train_batch_size"] = 128 - num_iterations = 2 + num_iterations = 1#TODO:2 for fw in framework_iterator( - config, frameworks=("jax", "tf2", "tf", "torch")): + config, frameworks=("jax", "torch")):#TODO, "tf2", "tf", "torch")): envs = ["CartPole-v0"] - if fw != "jax": - envs.append("MsPacmanNoFrameskip-v4") + #if fw != "jax": + # envs.append("MsPacmanNoFrameskip-v4") for env in envs: print("Env={}".format(env)) lstms = [False] - if fw != "jax": - lstms.append(True) + #if fw != "jax": + # lstms.append(True) for lstm in lstms: print("LSTM={}".format(lstm)) config["model"]["use_lstm"] = lstm diff --git a/rllib/models/jax/fcnet.py b/rllib/models/jax/fcnet.py index 51bf662c0c1f..4a6eab7f8f9e 100644 --- a/rllib/models/jax/fcnet.py +++ b/rllib/models/jax/fcnet.py @@ -105,22 +105,38 @@ def __init__(self, obs_space, action_space, num_outputs, model_config, @override(JAXModelV2) def forward(self, input_dict, state, seq_lens): - self._last_flat_in = self._features = input_dict["obs_flat"] - if self._hidden_layers: - self._features = self._hidden_layers.apply( - self._hidden_layers_params, self._last_flat_in) - 
logits = self._logits.apply(self._logits_params, self._features) if \ - self._logits else self._features - return logits, state + #self.jit_forward = jax.jit(lambda i, s, sl: self.forward_(i, s, sl)) + + if not hasattr(self, "forward_"): + def forward_(flat_in): + self._last_flat_in = self._features = flat_in + if self._hidden_layers: + self._features = self._hidden_layers.apply( + self._hidden_layers_params, self._last_flat_in) + logits = self._logits.apply(self._logits_params, self._features) if \ + self._logits else self._features + return logits, state + + self.forward_ = jax.jit(forward_) + + return self.forward_(input_dict["obs_flat"]) @override(JAXModelV2) def value_function(self): assert self._features is not None, "must call forward() first" - if self._value_branch_separate: - x = self._value_branch_separate.apply( - self._value_branch_separate_params, self._last_flat_in) - return self._value_branch.apply(self._value_branch_params, - x).squeeze(1) - else: - return self._value_branch.apply(self._value_branch_params, - self._features).squeeze(1) + + if not hasattr(self, "value_function_"): + def value_function_(): + if self._value_branch_separate: + x = self._value_branch_separate.apply( + self._value_branch_separate_params, self._last_flat_in) + return self._value_branch.apply(self._value_branch_params, + x).squeeze(1) + else: + return self._value_branch.apply(self._value_branch_params, + self._features).squeeze(1) + + self.value_function_ = jax.jit(value_function_) + + return self.value_function_() + \ No newline at end of file diff --git a/rllib/policy/jax_policy.py b/rllib/policy/jax_policy.py index 251f1a0a5771..103827f24912 100644 --- a/rllib/policy/jax_policy.py +++ b/rllib/policy/jax_policy.py @@ -157,6 +157,9 @@ def compute_actions_from_input_dict( **kwargs) -> \ Tuple[TensorType, List[TensorType], Dict[str, TensorType]]: + import time#TODO + start = time.time() + explore = explore if explore is not None else self.config["explore"] timestep = timestep if timestep is not None else self.global_timestep @@ -168,49 +171,49 @@ def compute_actions_from_input_dict( seq_lens = np.array([1] * len(input_dict["obs"])) \ if state_batches else None - if self.action_sampler_fn: - action_dist = dist_inputs = None - state_out = state_batches - actions, logp, state_out = self.action_sampler_fn( - self, - self.model, - input_dict, - state_out, - explore=explore, - timestep=timestep) - else: - # Call the exploration before_compute_actions hook. - self.exploration.before_compute_actions( - explore=explore, timestep=timestep) - if self.action_distribution_fn: - dist_inputs, dist_class, state_out = \ - self.action_distribution_fn( - self, - self.model, - input_dict[SampleBatch.CUR_OBS], - explore=explore, - timestep=timestep, - is_training=False) - else: - dist_class = self.dist_class - dist_inputs, state_out = self.model(input_dict, state_batches, - seq_lens) - - if not (isinstance(dist_class, functools.partial) - or issubclass(dist_class, JAXDistribution)): - raise ValueError( - "`dist_class` ({}) not a JAXDistribution " - "subclass! Make sure your `action_distribution_fn` or " - "`make_model_and_action_dist` return a correct " - "distribution class.".format(dist_class.__name__)) - action_dist = dist_class(dist_inputs, self.model) - - # Get the exploration action from the forward results. 
- actions, logp = \ - self.exploration.get_exploration_action( - action_distribution=action_dist, - timestep=timestep, - explore=explore) + #if self.action_sampler_fn: + # action_dist = dist_inputs = None + # state_out = state_batches + # actions, logp, state_out = self.action_sampler_fn( + # self, + # self.model, + # input_dict, + # state_out, + # explore=explore, + # timestep=timestep) + #else: + # Call the exploration before_compute_actions hook. + #self.exploration.before_compute_actions( + # explore=explore, timestep=timestep) + #if self.action_distribution_fn: + # dist_inputs, dist_class, state_out = \ + # self.action_distribution_fn( + # self, + # self.model, + # input_dict[SampleBatch.CUR_OBS], + # explore=explore, + # timestep=timestep, + # is_training=False) + #else: + dist_class = self.dist_class + dist_inputs, state_out = self.model(input_dict, state_batches, + seq_lens) + + #if not (isinstance(dist_class, functools.partial) + # or issubclass(dist_class, JAXDistribution)): + # raise ValueError( + # "`dist_class` ({}) not a JAXDistribution " + # "subclass! Make sure your `action_distribution_fn` or " + # "`make_model_and_action_dist` return a correct " + # "distribution class.".format(dist_class.__name__)) + action_dist = dist_class(dist_inputs, self.model) + + # Get the exploration action from the forward results. + actions, logp = \ + self.exploration.get_exploration_action( + action_distribution=action_dist, + timestep=timestep, + explore=explore) input_dict[SampleBatch.ACTIONS] = actions @@ -231,6 +234,9 @@ def compute_actions_from_input_dict( # Update our global timestep by the batch size. self.global_timestep += len(input_dict[SampleBatch.CUR_OBS]) + #TODO + print("action pass={}".format(time.time() - start)) + return convert_to_non_jax_type((actions, state_out, extra_fetches)) @override(Policy) diff --git a/rllib/policy/torch_policy.py b/rllib/policy/torch_policy.py index f81ac03ab872..97d357d640d7 100644 --- a/rllib/policy/torch_policy.py +++ b/rllib/policy/torch_policy.py @@ -180,6 +180,10 @@ def compute_actions_from_input_dict( **kwargs) -> \ Tuple[TensorType, List[TensorType], Dict[str, TensorType]]: + import time + #TODO + start = time.time() + explore = explore if explore is not None else self.config["explore"] timestep = timestep if timestep is not None else self.global_timestep @@ -194,9 +198,14 @@ def compute_actions_from_input_dict( seq_lens = np.array([1] * len(input_dict["obs"])) \ if state_batches else None - return self._compute_action_helper(input_dict, state_batches, + ret = self._compute_action_helper(input_dict, state_batches, seq_lens, explore, timestep) + #TODO + print("action pass={}".format(time.time() - start)) + + return ret + def _compute_action_helper(self, input_dict, state_batches, seq_lens, explore, timestep): """Shared forward pass logic (w/ and w/o trajectory view API). @@ -210,48 +219,48 @@ def _compute_action_helper(self, input_dict, state_batches, seq_lens, if self.model: self.model.eval() - if self.action_sampler_fn: - action_dist = dist_inputs = None - actions, logp, state_out = self.action_sampler_fn( - self, - self.model, - input_dict, - state_batches, - explore=explore, - timestep=timestep) - else: - # Call the exploration before_compute_actions hook. 
- self.exploration.before_compute_actions( - explore=explore, timestep=timestep) - if self.action_distribution_fn: - dist_inputs, dist_class, state_out = \ - self.action_distribution_fn( - self, - self.model, - input_dict[SampleBatch.CUR_OBS], - explore=explore, - timestep=timestep, - is_training=False) - else: - dist_class = self.dist_class - dist_inputs, state_out = self.model(input_dict, state_batches, - seq_lens) - - if not (isinstance(dist_class, functools.partial) - or issubclass(dist_class, TorchDistributionWrapper)): - raise ValueError( - "`dist_class` ({}) not a TorchDistributionWrapper " - "subclass! Make sure your `action_distribution_fn` or " - "`make_model_and_action_dist` return a correct " - "distribution class.".format(dist_class.__name__)) - action_dist = dist_class(dist_inputs, self.model) + #if self.action_sampler_fn: + # action_dist = dist_inputs = None + # actions, logp, state_out = self.action_sampler_fn( + # self, + # self.model, + # input_dict, + # state_batches, + # explore=explore, + # timestep=timestep) + #else: + # Call the exploration before_compute_actions hook. + #self.exploration.before_compute_actions( + # explore=explore, timestep=timestep) + #if self.action_distribution_fn: + # dist_inputs, dist_class, state_out = \ + # self.action_distribution_fn( + # self, + # self.model, + # input_dict[SampleBatch.CUR_OBS], + # explore=explore, + # timestep=timestep, + # is_training=False) + #else: + dist_class = self.dist_class + dist_inputs, state_out = self.model(input_dict, state_batches, + seq_lens) - # Get the exploration action from the forward results. - actions, logp = \ - self.exploration.get_exploration_action( - action_distribution=action_dist, - timestep=timestep, - explore=explore) + #if not (isinstance(dist_class, functools.partial) + # or issubclass(dist_class, TorchDistributionWrapper)): + # raise ValueError( + # "`dist_class` ({}) not a TorchDistributionWrapper " + # "subclass! Make sure your `action_distribution_fn` or " + # "`make_model_and_action_dist` return a correct " + # "distribution class.".format(dist_class.__name__)) + action_dist = dist_class(dist_inputs, self.model) + + # Get the exploration action from the forward results. + actions, logp = \ + self.exploration.get_exploration_action( + action_distribution=action_dist, + timestep=timestep, + explore=explore) input_dict[SampleBatch.ACTIONS] = actions From eaedcb09d9a04fd4fb1c09150fc2da6e59a4e003 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 28 Jan 2021 11:57:13 +0100 Subject: [PATCH 16/16] wip. 
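Editor's note on patch 16 below: it folds the PPO loss into a jitted inner function, takes gradients with jax.grad(..., argnums=1), and stashes loss statistics on the policy object as side effects (hence the earlier .primal unwrapping of tracer values). An alternative that keeps the loss pure is to return the statistics as auxiliary outputs and use jax.value_and_grad(..., has_aux=True). The sketch below shows that pattern under those assumptions; it is not the PR's implementation.

import jax
import jax.numpy as jnp

def loss_fn(params, batch):
    # Illustrative toy loss; stats ride along as auxiliary outputs instead of
    # being stashed on the policy object.
    logits = batch["obs"] @ params["w"]
    probs = jax.nn.softmax(logits)
    entropy = -jnp.mean(jnp.sum(probs * jax.nn.log_softmax(logits), axis=-1))
    policy_loss = -jnp.mean(
        batch["advantages"] * jnp.sum(probs * logits, axis=-1))
    total_loss = policy_loss - 0.01 * entropy
    return total_loss, {"policy_loss": policy_loss, "entropy": entropy}

params = {"w": jnp.ones((4, 3)) * 0.1}
batch = {"obs": jnp.ones((8, 4)), "advantages": jnp.ones(8)}

grad_fn = jax.jit(jax.value_and_grad(loss_fn, has_aux=True))
(total_loss, stats), grads = grad_fn(params, batch)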
--- rllib/agents/ppo/ppo.py | 5 +- rllib/agents/ppo/ppo_jax_policy.py | 154 +++++++++++++++------------- rllib/agents/ppo/tests/test_ppo.py | 4 +- rllib/evaluation/postprocessing.py | 7 +- rllib/execution/rollout_ops.py | 5 +- rllib/models/jax/fcnet.py | 46 +++++---- rllib/models/jax/jax_action_dist.py | 32 ++++-- rllib/models/jax/jax_modelv2.py | 6 ++ rllib/models/modelv2.py | 10 +- rllib/policy/jax_policy.py | 18 ++-- rllib/policy/policy.py | 1 + rllib/utils/jax_ops.py | 35 +++++++ rllib/utils/sgd.py | 5 +- 13 files changed, 214 insertions(+), 114 deletions(-) diff --git a/rllib/agents/ppo/ppo.py b/rllib/agents/ppo/ppo.py index 25a66b21e76c..807b824b4a23 100644 --- a/rllib/agents/ppo/ppo.py +++ b/rllib/agents/ppo/ppo.py @@ -190,9 +190,10 @@ def update(pi, pi_id): assert "kl" not in fetches, ( "kl should be nested under policy id key", fetches) if pi_id in fetches: - assert "kl" in fetches[pi_id], (fetches, pi_id) + #assert "kl" in fetches[pi_id], (fetches, pi_id) # Make the actual `Policy.update_kl()` call. - pi.update_kl(fetches[pi_id]["kl"]) + if "kl" in fetches[pi_id]:#TODO + pi.update_kl(fetches[pi_id]["kl"]) else: logger.warning("No data for {}, not updating kl".format(pi_id)) diff --git a/rllib/agents/ppo/ppo_jax_policy.py b/rllib/agents/ppo/ppo_jax_policy.py index c97c5b59d3dd..8e97eb6e0af0 100644 --- a/rllib/agents/ppo/ppo_jax_policy.py +++ b/rllib/agents/ppo/ppo_jax_policy.py @@ -11,7 +11,8 @@ setup_config from ray.rllib.agents.ppo.ppo_torch_policy import vf_preds_fetches, \ KLCoeffMixin, kl_and_loss_stats -from ray.rllib.evaluation.postprocessing import Postprocessing +from ray.rllib.evaluation.postprocessing import compute_gae_for_sample_batch, \ + Postprocessing from ray.rllib.models.modelv2 import ModelV2 from ray.rllib.models.torch.torch_action_dist import TorchDistributionWrapper from ray.rllib.policy.jax_policy import LearningRateSchedule @@ -50,71 +51,86 @@ def ppo_surrogate_loss( Union[TensorType, List[TensorType]]: A single loss tensor or a list of loss tensors. 
""" - if vars: - for k, v in vars.items(): - setattr(model, k, v) - - logits, state = model.from_batch(train_batch, is_training=True) - curr_action_dist = dist_class(logits, model) - - prev_action_dist = dist_class(train_batch[SampleBatch.ACTION_DIST_INPUTS], - model) - - logp_ratio = jnp.exp( - curr_action_dist.logp(train_batch[SampleBatch.ACTIONS]) - - train_batch[SampleBatch.ACTION_LOGP]) - action_kl = prev_action_dist.kl(curr_action_dist) - mean_kl = jnp.mean(action_kl) - - curr_entropy = curr_action_dist.entropy() - mean_entropy = jnp.mean(curr_entropy) - - surrogate_loss = jnp.minimum( - train_batch[Postprocessing.ADVANTAGES] * logp_ratio, - train_batch[Postprocessing.ADVANTAGES] * jnp.clip( - logp_ratio, 1 - policy.config["clip_param"], - 1 + policy.config["clip_param"])) - mean_policy_loss = jnp.mean(-surrogate_loss) - - if policy.config["use_gae"]: - prev_value_fn_out = train_batch[SampleBatch.VF_PREDS] - value_fn_out = model.value_function() - vf_loss1 = jnp.square(value_fn_out - - train_batch[Postprocessing.VALUE_TARGETS]) - vf_clipped = prev_value_fn_out + jnp.clip( - value_fn_out - prev_value_fn_out, -policy.config["vf_clip_param"], - policy.config["vf_clip_param"]) - vf_loss2 = jnp.square(vf_clipped - - train_batch[Postprocessing.VALUE_TARGETS]) - vf_loss = jnp.maximum(vf_loss1, vf_loss2) - mean_vf_loss = jnp.mean(vf_loss) - total_loss = jnp.mean(-surrogate_loss + policy.kl_coeff * action_kl + - policy.config["vf_loss_coeff"] * vf_loss - - policy.entropy_coeff * curr_entropy) - else: - mean_vf_loss = 0.0 - total_loss = jnp.mean(-surrogate_loss + policy.kl_coeff * action_kl - - policy.entropy_coeff * curr_entropy) + def loss_(train_batch, vars=None): + #if vars: + # for k, v in vars.items(): + # setattr(model, k, v) + + logits, value_out, state = model.forward_(train_batch["obs"]) + curr_action_dist = dist_class(logits, None) + + prev_action_dist = dist_class( + train_batch[SampleBatch.ACTION_DIST_INPUTS], None) + + logp_ratio = jnp.exp( + curr_action_dist.logp(train_batch[SampleBatch.ACTIONS]) - + train_batch[SampleBatch.ACTION_LOGP]) + action_kl = prev_action_dist.kl(curr_action_dist) + policy._mean_kl = jnp.mean(action_kl) + + curr_entropy = curr_action_dist.entropy() + policy._mean_entropy = jnp.mean(curr_entropy) + + surrogate_loss = jnp.minimum( + train_batch[Postprocessing.ADVANTAGES] * logp_ratio, + train_batch[Postprocessing.ADVANTAGES] * jnp.clip( + logp_ratio, 1 - policy.config["clip_param"], + 1 + policy.config["clip_param"])) + policy._mean_policy_loss = jnp.mean(-surrogate_loss) + + if policy.config["use_gae"]: + prev_value_fn_out = train_batch[SampleBatch.VF_PREDS] + #value_fn_out = model.value_function() + vf_loss1 = jnp.square(value_out - + train_batch[Postprocessing.VALUE_TARGETS]) + vf_clipped = prev_value_fn_out + jnp.clip( + value_out - prev_value_fn_out, -policy.config["vf_clip_param"], + policy.config["vf_clip_param"]) + vf_loss2 = jnp.square(vf_clipped - + train_batch[Postprocessing.VALUE_TARGETS]) + vf_loss = jnp.maximum(vf_loss1, vf_loss2) + policy._mean_vf_loss = jnp.mean(vf_loss) + total_loss = jnp.mean(-surrogate_loss + policy.kl_coeff * action_kl + + policy.config["vf_loss_coeff"] * vf_loss - + policy.entropy_coeff * curr_entropy) + else: + policy._mean_vf_loss = 0.0 + total_loss = jnp.mean(-surrogate_loss + policy.kl_coeff * action_kl - + policy.entropy_coeff * curr_entropy) + #policy._value_out = value_out + #policy._total_loss = total_loss + + #policy._vf_explained_var = explained_variance( + # train_batch[Postprocessing.VALUE_TARGETS], + # 
value_out) # policy.model.value_function() + + #if vars: + # policy._total_loss = policy._total_loss.primal + # policy._mean_policy_loss = policy._mean_policy_loss.primal + # policy._mean_vf_loss = policy._mean_vf_loss.primal + # policy._vf_explained_var = policy._vf_explained_var.primal + # policy._mean_entropy = policy._mean_entropy.primal + # policy._mean_kl = policy._mean_kl.primal + + return total_loss #, mean_policy_loss, mean_vf_loss, value_out, mean_entropy, mean_kl + + if not hasattr(policy, "jit_loss"): + policy.jit_loss = jax.jit(loss_) + policy.gradient_loss = jax.grad(policy.jit_loss, argnums=1)# 4 + + #policy._total_loss = policy.jit_loss(train_batch["obs"], vars) # Store stats in policy for stats_fn. - policy._total_loss = total_loss - policy._mean_policy_loss = mean_policy_loss - policy._mean_vf_loss = mean_vf_loss - policy._vf_explained_var = explained_variance( - train_batch[Postprocessing.VALUE_TARGETS], - policy.model.value_function()) - policy._mean_entropy = mean_entropy - policy._mean_kl = mean_kl - - if vars: - policy._total_loss = policy._total_loss.primal - policy._mean_policy_loss = policy._mean_policy_loss.primal - policy._mean_vf_loss = policy._mean_vf_loss.primal - policy._vf_explained_var = policy._vf_explained_var.primal - policy._mean_entropy = policy._mean_entropy.primal - policy._mean_kl = policy._mean_kl.primal - - return total_loss + #policy._total_loss = total_loss + #policy._mean_policy_loss = mean_policy_loss + #policy._mean_vf_loss = mean_vf_loss + #policy._vf_explained_var = explained_variance( + # train_batch[Postprocessing.VALUE_TARGETS], + # policy._value_out) #policy.model.value_function() + #policy._mean_entropy = mean_entropy + #policy._mean_kl = mean_kl + + ret = policy.gradient_loss({k: train_batch[k] for k, v in train_batch.items()}, vars) class ValueNetworkMixin: @@ -139,10 +155,10 @@ def __init__(self, obs_space, action_space, config): assert config["_use_trajectory_view_api"] def value(**input_dict): - model_out, _ = self.model.from_batch( + _, value_out, _ = self.model.from_batch( input_dict, is_training=False) # [0] = remove the batch dim. - return self.model.value_function()[0] + return value_out[0] # When not doing GAE, we do not require the value function's output. 
else: @@ -178,9 +194,9 @@ def setup_mixins(policy: Policy, obs_space: gym.spaces.Space, framework="jax", get_default_config=lambda: ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, loss_fn=ppo_surrogate_loss, - stats_fn=kl_and_loss_stats, - extra_action_out_fn=vf_preds_fetches, - postprocess_fn=postprocess_ppo_gae, + #stats_fn=kl_and_loss_stats, + #extra_action_out_fn=vf_preds_fetches, + postprocess_fn=compute_gae_for_sample_batch, extra_grad_process_fn=apply_grad_clipping, before_init=setup_config, before_loss_init=setup_mixins, diff --git a/rllib/agents/ppo/tests/test_ppo.py b/rllib/agents/ppo/tests/test_ppo.py index 2e6144247f43..ded2ace48490 100644 --- a/rllib/agents/ppo/tests/test_ppo.py +++ b/rllib/agents/ppo/tests/test_ppo.py @@ -73,7 +73,7 @@ def on_train_result(self, *, trainer, result: dict, **kwargs): class TestPPO(unittest.TestCase): @classmethod def setUpClass(cls): - ray.init(local_mode=True) + ray.init(local_mode=True)#TODO @classmethod def tearDownClass(cls): @@ -94,7 +94,7 @@ def test_ppo_compilation_and_lr_schedule(self): num_iterations = 1#TODO:2 for fw in framework_iterator( - config, frameworks=("jax", "torch")):#TODO, "tf2", "tf", "torch")): + config, frameworks=("jax")):#TODO, "tf2", "tf", "torch")): envs = ["CartPole-v0"] #if fw != "jax": # envs.append("MsPacmanNoFrameskip-v4") diff --git a/rllib/evaluation/postprocessing.py b/rllib/evaluation/postprocessing.py index 7d1801cf6566..c29a33fdfff2 100644 --- a/rllib/evaluation/postprocessing.py +++ b/rllib/evaluation/postprocessing.py @@ -40,8 +40,11 @@ def compute_advantages(rollout: SampleBatch, processed rewards. """ - assert SampleBatch.VF_PREDS in rollout or not use_critic, \ - "use_critic=True but values not found" + try:#TODO + assert SampleBatch.VF_PREDS in rollout or not use_critic, \ + "use_critic=True but values not found" + except Exception as e: + raise e assert use_critic or not use_gae, \ "Can't use gae without using a value function" diff --git a/rllib/execution/rollout_ops.py b/rllib/execution/rollout_ops.py index baaa2635795c..3237d89f88da 100644 --- a/rllib/execution/rollout_ops.py +++ b/rllib/execution/rollout_ops.py @@ -246,7 +246,10 @@ def __call__(self, samples: SampleBatchType) -> SampleBatchType: for policy_id in samples.policy_batches: batch = samples.policy_batches[policy_id] for field in self.fields: - batch[field] = standardized(batch[field]) + try:#TODO + batch[field] = standardized(batch[field]) + except Exception as e: + raise e if wrapped: samples = samples.policy_batches[DEFAULT_POLICY_ID] diff --git a/rllib/models/jax/fcnet.py b/rllib/models/jax/fcnet.py index 4a6eab7f8f9e..74b59fb1d08c 100644 --- a/rllib/models/jax/fcnet.py +++ b/rllib/models/jax/fcnet.py @@ -109,34 +109,44 @@ def forward(self, input_dict, state, seq_lens): if not hasattr(self, "forward_"): def forward_(flat_in): - self._last_flat_in = self._features = flat_in + self._last_flat_in = self._features = flat_in#input_dict["obs_flat"]#flat_in if self._hidden_layers: self._features = self._hidden_layers.apply( self._hidden_layers_params, self._last_flat_in) logits = self._logits.apply(self._logits_params, self._features) if \ self._logits else self._features - return logits, state - self.forward_ = jax.jit(forward_) - - return self.forward_(input_dict["obs_flat"]) - - @override(JAXModelV2) - def value_function(self): - assert self._features is not None, "must call forward() first" - - if not hasattr(self, "value_function_"): - def value_function_(): if self._value_branch_separate: x = self._value_branch_separate.apply( 
self._value_branch_separate_params, self._last_flat_in) - return self._value_branch.apply(self._value_branch_params, + value_out = self._value_branch.apply(self._value_branch_params, x).squeeze(1) else: - return self._value_branch.apply(self._value_branch_params, - self._features).squeeze(1) + value_out = self._value_branch.apply(self._value_branch_params, + self._features).squeeze(1) + + return logits, value_out, state + + self.forward_ = forward_ + self.jit_forward = jax.jit(forward_) + + return self.jit_forward(input_dict["obs_flat"]) + + #@override(JAXModelV2) + #def value_function(self): + # assert self._features is not None, "must call forward() first" + + # if not hasattr(self, "value_function_"): + # def value_function_(): + # if self._value_branch_separate: + # x = self._value_branch_separate.apply( + # self._value_branch_separate_params, self._last_flat_in) + # return self._value_branch.apply(self._value_branch_params, + # x).squeeze(1) + # else: + # return self._value_branch.apply(self._value_branch_params, + # self._features).squeeze(1) - self.value_function_ = jax.jit(value_function_) + # self.value_function_ = jax.jit(value_function_) - return self.value_function_() - \ No newline at end of file + # return self.value_function_() diff --git a/rllib/models/jax/jax_action_dist.py b/rllib/models/jax/jax_action_dist.py index dd8309cb73de..3e9d07653734 100644 --- a/rllib/models/jax/jax_action_dist.py +++ b/rllib/models/jax/jax_action_dist.py @@ -8,6 +8,9 @@ jax, flax = try_import_jax() tfp = try_import_tfp() +jnp = None +if jax: + from jax import numpy as jnp class JAXDistribution(ActionDistribution): @@ -33,13 +36,6 @@ def entropy(self) -> TensorType: def kl(self, other: ActionDistribution) -> TensorType: return self.dist.kl_divergence(other.dist) - @override(ActionDistribution) - def sample(self) -> TensorType: - # Update the state of our PRNG. - _, self.prng_key = jax.random.split(self.prng_key) - self.last_sample = jax.random.categorical(self.prng_key, self.inputs) - return self.last_sample - @override(ActionDistribution) def sampled_action_logp(self) -> TensorType: assert self.last_sample is not None @@ -59,11 +55,33 @@ def __init__(self, inputs, model=None, temperature=1.0): self.dist = tfp.experimental.substrates.jax.distributions.Categorical( logits=self.inputs) + @override(ActionDistribution) + def sample(self) -> TensorType: + # Update the state of our PRNG. 
+ _, self.prng_key = jax.random.split(self.prng_key) + self.last_sample = jax.random.categorical(self.prng_key, self.inputs) + return self.last_sample + @override(ActionDistribution) def deterministic_sample(self): self.last_sample = self.inputs.argmax(axis=1) return self.last_sample + @override(JAXDistribution) + def entropy(self) -> TensorType: + m = jnp.max(self.inputs, axis=-1, keepdims=True) + x = self.inputs - m + sum_exp_x = jnp.sum(jnp.exp(x), axis=-1) + lse_logits = m[..., 0] + jnp.log(sum_exp_x) + is_inf_logits = jnp.isinf(self.inputs).astype(jnp.float32) + is_negative_logits = (self.inputs < 0).astype(jnp.float32) + masked_logits = jnp.where( + (is_inf_logits * is_negative_logits).astype(jnp.bool_), + jnp.array(1.0).astype(self.inputs.dtype), self.inputs) + + return lse_logits - jnp.sum( + jnp.multiply(masked_logits, jnp.exp(x)), axis=-1) / sum_exp_x + @staticmethod @override(ActionDistribution) def required_model_output_shape(action_space, model_config): diff --git a/rllib/models/jax/jax_modelv2.py b/rllib/models/jax/jax_modelv2.py index e081a9b55b36..0f647581877e 100644 --- a/rllib/models/jax/jax_modelv2.py +++ b/rllib/models/jax/jax_modelv2.py @@ -58,3 +58,9 @@ def trainable_variables( self, as_dict: bool = False ) -> Union[List[TensorType], Dict[str, TensorType]]: return self.variables(as_dict=as_dict) + + @PublicAPI + @override(ModelV2) + def value_function(self) -> TensorType: + raise ValueError("JAXModelV2 does not have a separate " + "`value_function()` call!") diff --git a/rllib/models/modelv2.py b/rllib/models/modelv2.py index 70ad50202421..e272dc0d5b0c 100644 --- a/rllib/models/modelv2.py +++ b/rllib/models/modelv2.py @@ -210,11 +210,11 @@ def __call__( with self.context(): res = self.forward(restored, state or [], seq_lens) if ((not isinstance(res, list) and not isinstance(res, tuple)) - or len(res) != 2): + or len(res) not in [2, 3]): raise ValueError( - "forward() must return a tuple of (output, state) tensors, " - "got {}".format(res)) - outputs, state = res + "forward() must return a tuple of (output, [value-out]?, " + "state) tensors, got {}".format(res)) + outputs, state = res[0], res[-1] try: shape = outputs.shape @@ -229,7 +229,7 @@ def __call__( raise ValueError("State output is not a list: {}".format(state)) self._last_output = outputs - return outputs, state + return res @PublicAPI def from_batch(self, train_batch: SampleBatch, diff --git a/rllib/policy/jax_policy.py b/rllib/policy/jax_policy.py index 103827f24912..bcc5ceef7887 100644 --- a/rllib/policy/jax_policy.py +++ b/rllib/policy/jax_policy.py @@ -13,7 +13,8 @@ from ray.rllib.utils import force_list from ray.rllib.utils.annotations import override, DeveloperAPI from ray.rllib.utils.framework import try_import_jax -from ray.rllib.utils.jax_ops import convert_to_non_jax_type +from ray.rllib.utils.jax_ops import convert_to_jax_device_array, \ + convert_to_non_jax_type from ray.rllib.utils.schedules import ConstantSchedule, PiecewiseSchedule from ray.rllib.utils.tracking_dict import UsageTrackingDict from ray.rllib.utils.typing import ModelGradients, ModelWeights, \ @@ -103,7 +104,7 @@ def __init__( self.exploration = self._create_exploration() self.unwrapped_model = model # used to support DistributedDataParallel self._loss = loss - self._gradient_loss = jax.grad(self._loss, argnums=4) + #self._gradient_loss = jax.grad(self._loss, argnums=4) self._optimizers = force_list(self.optimizer()) self.dist_class = action_distribution_class @@ -196,8 +197,8 @@ def compute_actions_from_input_dict( # 
is_training=False) #else: dist_class = self.dist_class - dist_inputs, state_out = self.model(input_dict, state_batches, - seq_lens) + dist_inputs, value_out, state_out = self.model( + input_dict, state_batches, seq_lens) #if not (isinstance(dist_class, functools.partial) # or issubclass(dist_class, JAXDistribution)): @@ -220,6 +221,7 @@ def compute_actions_from_input_dict( # Add default and custom fetches. extra_fetches = self.extra_action_out(input_dict, state_batches, self.model, action_dist) + extra_fetches[SampleBatch.VF_PREDS] = value_out # Action-dist inputs. if dist_inputs is not None: @@ -329,10 +331,10 @@ def compute_gradients(self, # Calculate the actual policy loss. all_grads = force_list( - self._gradient_loss(self, self.model, self.dist_class, train_batch, - model_params)) + self.gradient_loss({k: train_batch[k] for k in train_batch.keys() if k != "infos"}, model_params))#self, self.model, self.dist_class, train_batch, + #model_params)) - # assert not any(torch.isnan(l) for l in loss_out) + #remove: assert not any(torch.isnan(l) for l in loss_out) fetches = self.extra_compute_grad_fetches() # Loop through all optimizers. @@ -511,6 +513,8 @@ def import_model_from_h5(self, import_file: str) -> None: def _lazy_tensor_dict(self, data): tensor_dict = UsageTrackingDict(data) + tensor_dict.set_get_interceptor( + functools.partial(convert_to_jax_device_array, device=None)) return tensor_dict diff --git a/rllib/policy/policy.py b/rllib/policy/policy.py index 577ac3d68c75..380ef93b163d 100644 --- a/rllib/policy/policy.py +++ b/rllib/policy/policy.py @@ -588,6 +588,7 @@ def _get_default_view_requirements(self): SampleBatch.EPS_ID: ViewRequirement(), SampleBatch.UNROLL_ID: ViewRequirement(), SampleBatch.AGENT_INDEX: ViewRequirement(), + SampleBatch.VF_PREDS: ViewRequirement(), "t": ViewRequirement(), } diff --git a/rllib/utils/jax_ops.py b/rllib/utils/jax_ops.py index f37589956c4d..a51d6b57492a 100644 --- a/rllib/utils/jax_ops.py +++ b/rllib/utils/jax_ops.py @@ -9,6 +9,41 @@ import jax.numpy as jnp +def convert_to_jax_device_array(x, device=None): + """Converts any struct to jax.numpy.DeviceArray. + + x (any): Any (possibly nested) struct, the values in which will be + converted and returned as a new struct with all leaves converted + to torch tensors. + + Returns: + Any: A new struct with the same structure as `stats`, but with all + values converted to jax.numpy.DeviceArray types. + """ + + def mapping(item): + # Already JAX DeviceArray -> make sure it's on right device. + if isinstance(item, jnp.DeviceArray): + return item if device is None else item.to(device) + # Numpy arrays. + if isinstance(item, np.ndarray): + # np.object_ type (e.g. info dicts in train batch): leave as-is. + if item.dtype == np.object_: + return item + # Already numpy: Wrap as torch tensor. + else: + tensor = jnp.array(item) + # Everything else: Convert to numpy, then wrap as torch tensor. + else: + tensor = jnp.asarray(item) + # Floatify all float64 tensors. + if tensor.dtype == jnp.double: + tensor = tensor.astype(jnp.float32) + return tensor if device is None else tensor.to(device) + + return tree.map_structure(mapping, x) + + def convert_to_non_jax_type(stats): """Converts values in `stats` to non-JAX numpy or python types. 
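Editor's note on the new convert_to_jax_device_array helper above: its comments still read "wrap as torch tensor" (carried over from the torch utility), and JAX arrays have no .to(device) method; explicit placement normally goes through jax.device_put. A minimal sketch of that variant follows, for illustration only, not as the PR's code.

import numpy as np
import tree  # dm-tree, the same structure-mapping helper RLlib uses
import jax
import jax.numpy as jnp

def to_jax(x, device=None):
    # Illustrative variant of the helper; not the PR's implementation.
    def mapping(item):
        if isinstance(item, np.ndarray) and item.dtype == np.object_:
            return item  # leave e.g. info dicts untouched
        arr = jnp.asarray(item)
        if arr.dtype == jnp.float64:
            arr = arr.astype(jnp.float32)
        # Explicit placement in JAX goes through device_put, not .to().
        return jax.device_put(arr, device) if device is not None else arr
    return tree.map_structure(mapping, x)

batch = {"obs": np.random.rand(8, 4), "rewards": np.zeros(8)}
jax_batch = to_jax(batch)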
diff --git a/rllib/utils/sgd.py b/rllib/utils/sgd.py index b5b72d44d37c..678c0afbc5a4 100644 --- a/rllib/utils/sgd.py +++ b/rllib/utils/sgd.py @@ -27,7 +27,10 @@ def averaged(kv, axis=None): out = {} for k, v in kv.items(): if v[0] is not None and not isinstance(v[0], dict): - out[k] = np.mean(v, axis=axis) + try:#TODO + out[k] = np.mean(v, axis=axis) + except Exception as e: + raise e else: out[k] = v[0] return out
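Editor's closing note: the hand-rolled JAXCategorical.entropy() added in patch 16 implements the logits-based identity H = logsumexp(z) - sum(softmax(z) * z), with masking of -inf logits. A quick sanity-check sketch comparing that form against the direct -sum(p * log p) definition, with illustrative values only:

import jax.numpy as jnp
from jax.nn import log_softmax, softmax
from jax.scipy.special import logsumexp

# Illustrative logits only.
logits = jnp.array([[2.0, 0.5, -1.0], [0.0, 0.0, 0.0]])

# Logits-based form: H = logsumexp(z) - sum(softmax(z) * z).
h_logits = logsumexp(logits, axis=-1) - jnp.sum(
    softmax(logits, axis=-1) * logits, axis=-1)

# Direct definition: H = -sum(p * log p).
h_direct = -jnp.sum(
    softmax(logits, axis=-1) * log_softmax(logits, axis=-1), axis=-1)

assert jnp.allclose(h_logits, h_direct)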