diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index ea970c265..b45ae3192 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -32,8 +32,6 @@ jobs:
pip install .[extra,tests,docs]
# Use headless version
pip install opencv-python-headless
- # Tmp fix: ROM missing in the newest atari-py version
- pip install atari-py==0.2.5
- name: Build the doc
run: |
make doc
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 63f9eafa0..45ca8f56a 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -7,8 +7,6 @@ type-check:
pytest:
script:
- python --version
- # Fix to get atari ROMs
- - pip install atari-py==0.2.5
# MKL_THREADING_LAYER=GNU to avoid MKL_THREADING_LAYER=INTEL incompatibility error
- MKL_THREADING_LAYER=GNU make pytest
diff --git a/README.md b/README.md
index 54577e452..2a0701c1a 100644
--- a/README.md
+++ b/README.md
@@ -87,7 +87,7 @@ Documentation is available online: [https://sb3-contrib.readthedocs.io/](https:/
**Note:** Stable-Baselines3 supports PyTorch >= 1.8.1.
### Prerequisites
-Stable Baselines3 requires python 3.7+.
+Stable Baselines3 requires Python 3.7+.
#### Windows 10
diff --git a/docs/guide/callbacks.rst b/docs/guide/callbacks.rst
index 279664171..19bccb22c 100644
--- a/docs/guide/callbacks.rst
+++ b/docs/guide/callbacks.rst
@@ -174,7 +174,7 @@ and optionally a prefix for the checkpoints (``rl_model`` by default).
checkpoint_callback = CheckpointCallback(save_freq=1000, save_path='./logs/',
name_prefix='rl_model')
- model = SAC('MlpPolicy', 'Pendulum-v0')
+ model = SAC('MlpPolicy', 'Pendulum-v1')
model.learn(2000, callback=checkpoint_callback)
@@ -206,13 +206,13 @@ It will save the best model if ``best_model_save_path`` folder is specified and
from stable_baselines3.common.callbacks import EvalCallback
# Separate evaluation env
- eval_env = gym.make('Pendulum-v0')
+ eval_env = gym.make('Pendulum-v1')
# Use deterministic actions for evaluation
eval_callback = EvalCallback(eval_env, best_model_save_path='./logs/',
log_path='./logs/', eval_freq=500,
deterministic=True, render=False)
- model = SAC('MlpPolicy', 'Pendulum-v0')
+ model = SAC('MlpPolicy', 'Pendulum-v1')
model.learn(5000, callback=eval_callback)
@@ -234,13 +234,13 @@ Alternatively, you can pass directly a list of callbacks to the ``learn()`` meth
checkpoint_callback = CheckpointCallback(save_freq=1000, save_path='./logs/')
# Separate evaluation env
- eval_env = gym.make('Pendulum-v0')
+ eval_env = gym.make('Pendulum-v1')
eval_callback = EvalCallback(eval_env, best_model_save_path='./logs/best_model',
log_path='./logs/results', eval_freq=500)
# Create the callback list
callback = CallbackList([checkpoint_callback, eval_callback])
- model = SAC('MlpPolicy', 'Pendulum-v0')
+ model = SAC('MlpPolicy', 'Pendulum-v1')
# Equivalent to:
# model.learn(5000, callback=[checkpoint_callback, eval_callback])
model.learn(5000, callback=callback)
@@ -263,12 +263,12 @@ It must be used with the :ref:`EvalCallback` and use the event triggered by a ne
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
# Separate evaluation env
- eval_env = gym.make('Pendulum-v0')
+ eval_env = gym.make('Pendulum-v1')
# Stop training when the model reaches the reward threshold
callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-200, verbose=1)
eval_callback = EvalCallback(eval_env, callback_on_new_best=callback_on_best, verbose=1)
- model = SAC('MlpPolicy', 'Pendulum-v0', verbose=1)
+ model = SAC('MlpPolicy', 'Pendulum-v1', verbose=1)
# Almost infinite number of timesteps, but the training will stop
# early as soon as the reward threshold is reached
model.learn(int(1e10), callback=eval_callback)
@@ -299,7 +299,7 @@ An :ref:`EventCallback` that will trigger its child callback every ``n_steps`` t
checkpoint_on_event = CheckpointCallback(save_freq=1, save_path='./logs/')
event_callback = EveryNTimesteps(n_steps=500, callback=checkpoint_on_event)
- model = PPO('MlpPolicy', 'Pendulum-v0', verbose=1)
+ model = PPO('MlpPolicy', 'Pendulum-v1', verbose=1)
model.learn(int(2e4), callback=event_callback)
@@ -328,7 +328,7 @@ and in total for ``max_episodes * n_envs`` episodes.
# Stops training when the model reaches the maximum number of episodes
callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=5, verbose=1)
- model = A2C('MlpPolicy', 'Pendulum-v0', verbose=1)
+ model = A2C('MlpPolicy', 'Pendulum-v1', verbose=1)
# Almost infinite number of timesteps, but the training will stop
# early as soon as the max number of episodes is reached
model.learn(int(1e10), callback=callback_max_episodes)
diff --git a/docs/guide/custom_policy.rst b/docs/guide/custom_policy.rst
index d17f913c5..1b8f9fb7f 100644
--- a/docs/guide/custom_policy.rst
+++ b/docs/guide/custom_policy.rst
@@ -407,5 +407,5 @@ you only need to specify ``net_arch=[256, 256]`` (here, two hidden layers of 256
# Custom critic architecture with two layers of 400 and 300 units
policy_kwargs = dict(net_arch=dict(pi=[64, 64], qf=[400, 300]))
# Create the agent
- model = SAC("MlpPolicy", "Pendulum-v0", policy_kwargs=policy_kwargs, verbose=1)
+ model = SAC("MlpPolicy", "Pendulum-v1", policy_kwargs=policy_kwargs, verbose=1)
model.learn(5000)
diff --git a/docs/guide/examples.rst b/docs/guide/examples.rst
index 733279bc3..a5b56b249 100644
--- a/docs/guide/examples.rst
+++ b/docs/guide/examples.rst
@@ -321,7 +321,7 @@ Atari Games
Training an RL agent on Atari games is straightforward thanks to the ``make_atari_env`` helper function.
It will do `all the preprocessing `_
-and multiprocessing for you.
+and multiprocessing for you. To install the Atari environments and ROMs, run ``pip install gym[atari,accept-rom-license]``, or install Stable Baselines3 with ``pip install stable-baselines3[extra]``, which includes this and other optional dependencies.
.. image:: ../_static/img/colab-badge.svg
:target: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/sb3/atari_games.ipynb
@@ -564,7 +564,7 @@ Behind the scene, SB3 uses an :ref:`EvalCallback `.
# Create the model, the training environment
# and the test environment (for evaluation)
- model = SAC('MlpPolicy', 'Pendulum-v0', verbose=1,
+ model = SAC('MlpPolicy', 'Pendulum-v1', verbose=1,
learning_rate=1e-3, create_eval_env=True)
# Evaluate the model every 1000 steps on 5 test episodes
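For reference, a minimal sketch of how ``make_atari_env`` is typically used once the ROMs are available via ``gym[atari,accept-rom-license]`` (illustrative only; the environment id and hyperparameters here are arbitrary)::

    from stable_baselines3 import A2C
    from stable_baselines3.common.env_util import make_atari_env
    from stable_baselines3.common.vec_env import VecFrameStack

    # make_atari_env applies the standard Atari preprocessing wrappers
    # and vectorizes the environment (4 copies here)
    env = make_atari_env("BreakoutNoFrameskip-v4", n_envs=4, seed=0)
    # Stack 4 consecutive frames, as is common for Atari agents
    env = VecFrameStack(env, n_stack=4)

    model = A2C("CnnPolicy", env, verbose=1)
    model.learn(total_timesteps=10_000)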
diff --git a/docs/guide/export.rst b/docs/guide/export.rst
index d6fe72a78..b6884c19d 100644
--- a/docs/guide/export.rst
+++ b/docs/guide/export.rst
@@ -62,7 +62,7 @@ For PPO, assuming a shared feature extractor.
action_hidden, value_hidden = self.extractor(observation)
return self.action_net(action_hidden), self.value_net(value_hidden)
- # Example: model = PPO("MlpPolicy", "Pendulum-v0")
+ # Example: model = PPO("MlpPolicy", "Pendulum-v1")
model = PPO.load("PathToTrainedModel.zip")
model.policy.to("cpu")
onnxable_model = OnnxablePolicy(model.policy.mlp_extractor, model.policy.action_net, model.policy.value_net)
diff --git a/docs/guide/tensorboard.rst b/docs/guide/tensorboard.rst
index 0929b9eef..18f1cebc4 100644
--- a/docs/guide/tensorboard.rst
+++ b/docs/guide/tensorboard.rst
@@ -61,7 +61,7 @@ Here is a simple example on how to log both additional tensor or arbitrary scala
from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import BaseCallback
- model = SAC("MlpPolicy", "Pendulum-v0", tensorboard_log="/tmp/sac/", verbose=1)
+ model = SAC("MlpPolicy", "Pendulum-v1", tensorboard_log="/tmp/sac/", verbose=1)
class TensorboardCallback(BaseCallback):
@@ -104,7 +104,7 @@ Here is an example of how to render an image to TensorBoard at regular intervals
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.logger import Image
- model = SAC("MlpPolicy", "Pendulum-v0", tensorboard_log="/tmp/sac/", verbose=1)
+ model = SAC("MlpPolicy", "Pendulum-v1", tensorboard_log="/tmp/sac/", verbose=1)
class ImageRecorderCallback(BaseCallback):
@@ -141,7 +141,7 @@ Here is an example of how to store a plot in TensorBoard at regular intervals:
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.logger import Figure
- model = SAC("MlpPolicy", "Pendulum-v0", tensorboard_log="/tmp/sac/", verbose=1)
+ model = SAC("MlpPolicy", "Pendulum-v1", tensorboard_log="/tmp/sac/", verbose=1)
class FigureRecorderCallback(BaseCallback):
@@ -251,7 +251,7 @@ can get direct access to the underlying SummaryWriter in a callback:
- model = SAC("MlpPolicy", "Pendulum-v0", tensorboard_log="/tmp/sac/", verbose=1)
+ model = SAC("MlpPolicy", "Pendulum-v1", tensorboard_log="/tmp/sac/", verbose=1)
class SummaryWriterCallback(BaseCallback):
diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst
index 39830590d..07a0cf339 100644
--- a/docs/misc/changelog.rst
+++ b/docs/misc/changelog.rst
@@ -119,6 +119,7 @@ Release 1.3.0 (2021-10-23)
Breaking Changes:
^^^^^^^^^^^^^^^^^
+- Support for Python 3.6 was removed.
- ``sde_net_arch`` argument in policies is deprecated and will be removed in a future version.
- ``_get_latent`` (``ActorCriticPolicy``) was removed
- All logging keys now use underscores instead of spaces (@timokau). Concretely this changes:
@@ -127,6 +128,7 @@ Breaking Changes:
- ``rollout/exploration rate`` to ``rollout/exploration_rate`` and
- ``rollout/success rate`` to ``rollout/success_rate``.
+
New Features:
^^^^^^^^^^^^^
- Added methods ``get_distribution`` and ``predict_values`` for ``ActorCriticPolicy`` for A2C/PPO/TRPO (@cyprienc)
@@ -145,6 +147,7 @@ Bug Fixes:
Deprecations:
^^^^^^^^^^^^^
+- Switched minimum Gym version to 0.21.0.
Others:
^^^^^^^
diff --git a/docs/modules/ddpg.rst b/docs/modules/ddpg.rst
index 24d265f00..c484a1c93 100644
--- a/docs/modules/ddpg.rst
+++ b/docs/modules/ddpg.rst
@@ -67,7 +67,7 @@ This example is only to demonstrate the use of the library and its functions, an
from stable_baselines3 import DDPG
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
- env = gym.make("Pendulum-v0")
+ env = gym.make("Pendulum-v1")
# The noise objects for DDPG
n_actions = env.action_space.shape[-1]
diff --git a/docs/modules/sac.rst b/docs/modules/sac.rst
index a1156fd8c..e7f9057d5 100644
--- a/docs/modules/sac.rst
+++ b/docs/modules/sac.rst
@@ -73,7 +73,7 @@ This example is only to demonstrate the use of the library and its functions, an
from stable_baselines3 import SAC
- env = gym.make("Pendulum-v0")
+ env = gym.make("Pendulum-v1")
model = SAC("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=10000, log_interval=4)
diff --git a/docs/modules/td3.rst b/docs/modules/td3.rst
index 3bc93d7a9..d039ae71c 100644
--- a/docs/modules/td3.rst
+++ b/docs/modules/td3.rst
@@ -67,7 +67,7 @@ This example is only to demonstrate the use of the library and its functions, an
from stable_baselines3 import TD3
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
- env = gym.make("Pendulum-v0")
+ env = gym.make("Pendulum-v1")
# The noise objects for TD3
n_actions = env.action_space.shape[-1]
diff --git a/setup.py b/setup.py
index 3e0f788af..eabf30c66 100644
--- a/setup.py
+++ b/setup.py
@@ -73,7 +73,7 @@
packages=[package for package in find_packages() if package.startswith("stable_baselines3")],
package_data={"stable_baselines3": ["py.typed", "version.txt"]},
install_requires=[
- "gym>=0.17,<0.20", # gym 0.20 breaks atari-py behavior
+ "gym>=0.21", # Remember to also update gym version in "extra" below when this changes
"numpy",
"torch>=1.8.1",
# For saving models
@@ -116,7 +116,7 @@
# For render
"opencv-python",
# For atari games,
- "atari_py==0.2.6",
+ "gym[atari,accept-rom-license]>=0.21",
"pillow",
# Tensorboard support
"tensorboard>=2.2.0",
diff --git a/tests/test_callbacks.py b/tests/test_callbacks.py
index 56fc14109..e1f6d3869 100644
--- a/tests/test_callbacks.py
+++ b/tests/test_callbacks.py
@@ -75,7 +75,7 @@ def test_callbacks(tmp_path, model_class):
if model_class in [A2C, PPO]:
max_episodes = 1
n_envs = 2
- # Pendulum-v0 has a timelimit of 200 timesteps
+ # Pendulum-v1 has a timelimit of 200 timesteps
max_episode_length = 200
envs = make_vec_env(env_name, n_envs=n_envs, seed=0)
@@ -99,7 +99,7 @@ def select_env(model_class) -> str:
if model_class is DQN:
return "CartPole-v0"
else:
- return "Pendulum-v0"
+ return "Pendulum-v1"
def test_eval_callback_vec_env():
diff --git a/tests/test_custom_policy.py b/tests/test_custom_policy.py
index 02fa7b4a0..e2d98fbd5 100644
--- a/tests/test_custom_policy.py
+++ b/tests/test_custom_policy.py
@@ -25,7 +25,7 @@ def test_flexible_mlp(model_class, net_arch):
@pytest.mark.parametrize("net_arch", [[], [4], [4, 4], dict(qf=[8], pi=[8, 4])])
@pytest.mark.parametrize("model_class", [SAC, TD3])
def test_custom_offpolicy(model_class, net_arch):
- _ = model_class("MlpPolicy", "Pendulum-v0", policy_kwargs=dict(net_arch=net_arch), learning_starts=100).learn(300)
+ _ = model_class("MlpPolicy", "Pendulum-v1", policy_kwargs=dict(net_arch=net_arch), learning_starts=100).learn(300)
@pytest.mark.parametrize("model_class", [A2C, PPO, SAC, TD3])
@@ -38,12 +38,12 @@ def test_custom_optimizer(model_class, optimizer_kwargs):
kwargs = dict(n_steps=64)
policy_kwargs = dict(optimizer_class=th.optim.AdamW, optimizer_kwargs=optimizer_kwargs, net_arch=[32])
- _ = model_class("MlpPolicy", "Pendulum-v0", policy_kwargs=policy_kwargs, **kwargs).learn(300)
+ _ = model_class("MlpPolicy", "Pendulum-v1", policy_kwargs=policy_kwargs, **kwargs).learn(300)
def test_tf_like_rmsprop_optimizer():
policy_kwargs = dict(optimizer_class=RMSpropTFLike, net_arch=[32])
- _ = A2C("MlpPolicy", "Pendulum-v0", policy_kwargs=policy_kwargs).learn(500)
+ _ = A2C("MlpPolicy", "Pendulum-v1", policy_kwargs=policy_kwargs).learn(500)
def test_dqn_custom_policy():
diff --git a/tests/test_deterministic.py b/tests/test_deterministic.py
index 3712fc21a..4c92d269f 100644
--- a/tests/test_deterministic.py
+++ b/tests/test_deterministic.py
@@ -13,7 +13,7 @@ def test_deterministic_training_common(algo):
rewards = [[], []]
# Smaller network
kwargs = {"policy_kwargs": dict(net_arch=[64])}
- env_id = "Pendulum-v0"
+ env_id = "Pendulum-v1"
if algo in [TD3, SAC]:
kwargs.update({"action_noise": NormalActionNoise(0.0, 0.1), "learning_starts": 100, "train_freq": 4})
else:
diff --git a/tests/test_distributions.py b/tests/test_distributions.py
index b894dd478..3652b1850 100644
--- a/tests/test_distributions.py
+++ b/tests/test_distributions.py
@@ -43,7 +43,7 @@ def test_squashed_gaussian(model_class):
"""
Test run with squashed Gaussian (notably entropy computation)
"""
- model = model_class("MlpPolicy", "Pendulum-v0", use_sde=True, n_steps=64, policy_kwargs=dict(squash_output=True))
+ model = model_class("MlpPolicy", "Pendulum-v1", use_sde=True, n_steps=64, policy_kwargs=dict(squash_output=True))
model.learn(500)
gaussian_mean = th.rand(N_SAMPLES, N_ACTIONS)
@@ -57,10 +57,10 @@ def test_squashed_gaussian(model_class):
@pytest.fixture()
def dummy_model_distribution_obs_and_actions() -> Tuple[A2C, np.array, np.array]:
"""
- Fixture creating a Pendulum-v0 gym env, an A2C model and sampling 10 random observations and actions from the env
+ Fixture creating a Pendulum-v1 gym env, an A2C model and sampling 10 random observations and actions from the env
:return: A2C model, random observations, random actions
"""
- env = gym.make("Pendulum-v0")
+ env = gym.make("Pendulum-v1")
model = A2C("MlpPolicy", env, seed=23)
random_obs = np.array([env.observation_space.sample() for _ in range(10)])
random_actions = np.array([env.action_space.sample() for _ in range(10)])
diff --git a/tests/test_env_checker.py b/tests/test_env_checker.py
index 6364bd4ba..0b0a82d8f 100644
--- a/tests/test_env_checker.py
+++ b/tests/test_env_checker.py
@@ -11,14 +11,14 @@ class ActionDictTestEnv(gym.Env):
observation_space = Box(low=-1.0, high=2.0, shape=(3,), dtype=np.float32)
def step(self, action):
- observation = np.array([1.0, 1.5, 0.5])
+ observation = np.array([1.0, 1.5, 0.5], dtype=self.observation_space.dtype)
reward = 1
done = True
info = {}
return observation, reward, done, info
def reset(self):
- return np.array([1.0, 1.5, 0.5])
+ return np.array([1.0, 1.5, 0.5], dtype=self.observation_space.dtype)
def render(self, mode="human"):
pass
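The explicit ``dtype`` added above avoids NumPy's ``float64`` default, which would not match the ``float32`` declared in ``observation_space``. A quick illustration of the difference (hypothetical snippet, not part of the test suite)::

    import numpy as np
    from gym.spaces import Box

    space = Box(low=-1.0, high=2.0, shape=(3,), dtype=np.float32)
    obs_default = np.array([1.0, 1.5, 0.5])                   # float64 by default
    obs_typed = np.array([1.0, 1.5, 0.5], dtype=space.dtype)  # float32, matches the space
    print(obs_default.dtype, obs_typed.dtype)  # float64 float32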
diff --git a/tests/test_envs.py b/tests/test_envs.py
index d0434773a..b859ed703 100644
--- a/tests/test_envs.py
+++ b/tests/test_envs.py
@@ -27,7 +27,7 @@
]
-@pytest.mark.parametrize("env_id", ["CartPole-v0", "Pendulum-v0"])
+@pytest.mark.parametrize("env_id", ["CartPole-v0", "Pendulum-v1"])
def test_env(env_id):
"""
Check that environments integrated in Gym pass the test.
@@ -38,9 +38,9 @@ def test_env(env_id):
with pytest.warns(None) as record:
check_env(env)
- # Pendulum-v0 will produce a warning because the action space is
+ # Pendulum-v1 will produce a warning because the action space is
# in [-2, 2] and not [-1, 1]
- if env_id == "Pendulum-v0":
+ if env_id == "Pendulum-v1":
assert len(record) == 1
else:
# The other environments must pass without warning
diff --git a/tests/test_predict.py b/tests/test_predict.py
index 436547b83..853f4d11d 100644
--- a/tests/test_predict.py
+++ b/tests/test_predict.py
@@ -43,7 +43,7 @@ def test_auto_wrap(model_class):
if model_class is DQN:
env_name = "CartPole-v0"
else:
- env_name = "Pendulum-v0"
+ env_name = "Pendulum-v1"
env = gym.make(env_name)
eval_env = gym.make(env_name)
model = model_class("MlpPolicy", env)
@@ -51,7 +51,7 @@ def test_auto_wrap(model_class):
@pytest.mark.parametrize("model_class", MODEL_LIST)
-@pytest.mark.parametrize("env_id", ["Pendulum-v0", "CartPole-v1"])
+@pytest.mark.parametrize("env_id", ["Pendulum-v1", "CartPole-v1"])
@pytest.mark.parametrize("device", ["cpu", "cuda", "auto"])
def test_predict(model_class, env_id, device):
if device == "cuda" and not th.cuda.is_available():
diff --git a/tests/test_run.py b/tests/test_run.py
index 67b31c482..223776dfb 100644
--- a/tests/test_run.py
+++ b/tests/test_run.py
@@ -17,7 +17,7 @@ def test_deterministic_pg(model_class, action_noise):
"""
model = model_class(
"MlpPolicy",
- "Pendulum-v0",
+ "Pendulum-v1",
policy_kwargs=dict(net_arch=[64, 64]),
learning_starts=100,
verbose=1,
@@ -28,13 +28,13 @@ def test_deterministic_pg(model_class, action_noise):
model.learn(total_timesteps=300, eval_freq=250)
-@pytest.mark.parametrize("env_id", ["CartPole-v1", "Pendulum-v0"])
+@pytest.mark.parametrize("env_id", ["CartPole-v1", "Pendulum-v1"])
def test_a2c(env_id):
model = A2C("MlpPolicy", env_id, seed=0, policy_kwargs=dict(net_arch=[16]), verbose=1, create_eval_env=True)
model.learn(total_timesteps=1000, eval_freq=500)
-@pytest.mark.parametrize("env_id", ["CartPole-v1", "Pendulum-v0"])
+@pytest.mark.parametrize("env_id", ["CartPole-v1", "Pendulum-v1"])
@pytest.mark.parametrize("clip_range_vf", [None, 0.2, -0.2])
def test_ppo(env_id, clip_range_vf):
if clip_range_vf is not None and clip_range_vf < 0:
@@ -67,7 +67,7 @@ def test_ppo(env_id, clip_range_vf):
def test_sac(ent_coef):
model = SAC(
"MlpPolicy",
- "Pendulum-v0",
+ "Pendulum-v1",
policy_kwargs=dict(net_arch=[64, 64]),
learning_starts=100,
verbose=1,
@@ -84,7 +84,7 @@ def test_n_critics(n_critics):
# Test SAC with different number of critics, for TD3, n_critics=1 corresponds to DDPG
model = SAC(
"MlpPolicy",
- "Pendulum-v0",
+ "Pendulum-v1",
policy_kwargs=dict(net_arch=[64, 64], n_critics=n_critics),
learning_starts=100,
buffer_size=10000,
@@ -112,7 +112,7 @@ def test_train_freq(tmp_path, train_freq):
model = SAC(
"MlpPolicy",
- "Pendulum-v0",
+ "Pendulum-v1",
policy_kwargs=dict(net_arch=[64, 64], n_critics=1),
learning_starts=100,
buffer_size=10000,
@@ -133,7 +133,7 @@ def test_train_freq_fail(train_freq):
with pytest.raises(ValueError):
model = SAC(
"MlpPolicy",
- "Pendulum-v0",
+ "Pendulum-v1",
policy_kwargs=dict(net_arch=[64, 64], n_critics=1),
learning_starts=100,
buffer_size=10000,
@@ -147,7 +147,7 @@ def test_train_freq_fail(train_freq):
def test_offpolicy_multi_env(model_class):
kwargs = {}
if model_class in [SAC, TD3, DDPG]:
- env_id = "Pendulum-v0"
+ env_id = "Pendulum-v1"
policy_kwargs = dict(net_arch=[64], n_critics=1)
# Check auto-conversion to VectorizedActionNoise
kwargs = dict(action_noise=NormalActionNoise(np.zeros(1), 0.1 * np.ones(1)))
diff --git a/tests/test_save_load.py b/tests/test_save_load.py
index 69a3f4816..7d810c70e 100644
--- a/tests/test_save_load.py
+++ b/tests/test_save_load.py
@@ -269,7 +269,7 @@ def test_exclude_include_saved_params(tmp_path, model_class):
def test_save_load_pytorch_var(tmp_path):
- model = SAC("MlpPolicy", "Pendulum-v0", seed=3, policy_kwargs=dict(net_arch=[64], n_critics=1))
+ model = SAC("MlpPolicy", "Pendulum-v1", seed=3, policy_kwargs=dict(net_arch=[64], n_critics=1))
model.learn(200)
save_path = str(tmp_path / "sac_pendulum")
model.save(save_path)
@@ -286,7 +286,7 @@ def test_save_load_pytorch_var(tmp_path):
assert not th.allclose(log_ent_coef_before, log_ent_coef_after)
# With a fixed entropy coef
- model = SAC("MlpPolicy", "Pendulum-v0", seed=3, ent_coef=0.01, policy_kwargs=dict(net_arch=[64], n_critics=1))
+ model = SAC("MlpPolicy", "Pendulum-v1", seed=3, ent_coef=0.01, policy_kwargs=dict(net_arch=[64], n_critics=1))
model.learn(200)
save_path = str(tmp_path / "sac_pendulum")
model.save(save_path)
diff --git a/tests/test_sde.py b/tests/test_sde.py
index 17ac1501d..0a650a57c 100644
--- a/tests/test_sde.py
+++ b/tests/test_sde.py
@@ -65,7 +65,7 @@ def test_state_dependent_noise(model_class, use_expln):
kwargs = {"learning_starts": 0} if model_class == SAC else {"n_steps": 64}
model = model_class(
"MlpPolicy",
- "Pendulum-v0",
+ "Pendulum-v1",
use_sde=True,
seed=None,
create_eval_env=True,
diff --git a/tests/test_spaces.py b/tests/test_spaces.py
index deb09c4e4..54994b2b5 100644
--- a/tests/test_spaces.py
+++ b/tests/test_spaces.py
@@ -53,10 +53,10 @@ def test_identity_spaces(model_class, env):
@pytest.mark.parametrize("model_class", [A2C, DDPG, DQN, PPO, SAC, TD3])
-@pytest.mark.parametrize("env", ["Pendulum-v0", "CartPole-v1"])
+@pytest.mark.parametrize("env", ["Pendulum-v1", "CartPole-v1"])
def test_action_spaces(model_class, env):
if model_class in [SAC, DDPG, TD3]:
- supported_action_space = env == "Pendulum-v0"
+ supported_action_space = env == "Pendulum-v1"
elif model_class == DQN:
supported_action_space = env == "CartPole-v1"
elif model_class in [A2C, PPO]:
diff --git a/tests/test_tensorboard.py b/tests/test_tensorboard.py
index 3f755a7aa..20f58b912 100644
--- a/tests/test_tensorboard.py
+++ b/tests/test_tensorboard.py
@@ -7,8 +7,8 @@
MODEL_DICT = {
"a2c": (A2C, "CartPole-v1"),
"ppo": (PPO, "CartPole-v1"),
- "sac": (SAC, "Pendulum-v0"),
- "td3": (TD3, "Pendulum-v0"),
+ "sac": (SAC, "Pendulum-v1"),
+ "td3": (TD3, "Pendulum-v1"),
}
N_STEPS = 100
diff --git a/tests/test_train_eval_mode.py b/tests/test_train_eval_mode.py
index c5eb283b7..1ea2efe67 100644
--- a/tests/test_train_eval_mode.py
+++ b/tests/test_train_eval_mode.py
@@ -172,7 +172,7 @@ def test_dqn_train_with_batch_norm():
def test_td3_train_with_batch_norm():
model = TD3(
"MlpPolicy",
- "Pendulum-v0",
+ "Pendulum-v1",
policy_kwargs=dict(net_arch=[16, 16], features_extractor_class=FlattenBatchNormDropoutExtractor),
learning_starts=0,
tau=0, # do not copy the target
@@ -219,7 +219,7 @@ def test_td3_train_with_batch_norm():
def test_sac_train_with_batch_norm():
model = SAC(
"MlpPolicy",
- "Pendulum-v0",
+ "Pendulum-v1",
policy_kwargs=dict(net_arch=[16, 16], features_extractor_class=FlattenBatchNormDropoutExtractor),
learning_starts=0,
tau=0, # do not copy the target
@@ -257,7 +257,7 @@ def test_sac_train_with_batch_norm():
@pytest.mark.parametrize("model_class", [A2C, PPO])
-@pytest.mark.parametrize("env_id", ["Pendulum-v0", "CartPole-v1"])
+@pytest.mark.parametrize("env_id", ["Pendulum-v1", "CartPole-v1"])
def test_a2c_ppo_train_with_batch_norm(model_class, env_id):
model = model_class(
"MlpPolicy",
@@ -281,7 +281,7 @@ def test_offpolicy_collect_rollout_batch_norm(model_class):
if model_class in [DQN]:
env_id = "CartPole-v1"
else:
- env_id = "Pendulum-v0"
+ env_id = "Pendulum-v1"
clone_helper = CLONE_HELPERS[model_class]
@@ -308,7 +308,7 @@ def test_offpolicy_collect_rollout_batch_norm(model_class):
@pytest.mark.parametrize("model_class", [A2C, PPO])
-@pytest.mark.parametrize("env_id", ["Pendulum-v0", "CartPole-v1"])
+@pytest.mark.parametrize("env_id", ["Pendulum-v1", "CartPole-v1"])
def test_a2c_ppo_collect_rollouts_with_batch_norm(model_class, env_id):
model = model_class(
"MlpPolicy",
@@ -332,7 +332,7 @@ def test_a2c_ppo_collect_rollouts_with_batch_norm(model_class, env_id):
@pytest.mark.parametrize("model_class", MODEL_LIST)
-@pytest.mark.parametrize("env_id", ["Pendulum-v0", "CartPole-v1"])
+@pytest.mark.parametrize("env_id", ["Pendulum-v1", "CartPole-v1"])
def test_predict_with_dropout_batch_norm(model_class, env_id):
if env_id == "CartPole-v1":
if model_class in [SAC, TD3]:
diff --git a/tests/test_utils.py b/tests/test_utils.py
index ea497140e..b07bbe931 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -43,7 +43,6 @@ def test_make_vec_env(env_id, n_envs, vec_env_cls, wrapper_class):
@pytest.mark.parametrize("n_envs", [1, 2])
@pytest.mark.parametrize("wrapper_kwargs", [None, dict(clip_reward=False, screen_size=60)])
def test_make_atari_env(env_id, n_envs, wrapper_kwargs):
- env_id = "BreakoutNoFrameskip-v4"
env = make_atari_env(env_id, n_envs, wrapper_kwargs=wrapper_kwargs, monitor_dir=None, seed=0)
assert env.num_envs == n_envs
@@ -97,7 +96,7 @@ def test_vec_env_monitor_kwargs():
def test_env_auto_monitor_wrap():
- env = gym.make("Pendulum-v0")
+ env = gym.make("Pendulum-v1")
model = A2C("MlpPolicy", env)
assert model.env.env_is_wrapped(Monitor)[0] is True
@@ -105,7 +104,7 @@ def test_env_auto_monitor_wrap():
model = A2C("MlpPolicy", env)
assert model.env.env_is_wrapped(Monitor)[0] is True
- model = A2C("MlpPolicy", "Pendulum-v0")
+ model = A2C("MlpPolicy", "Pendulum-v1")
assert model.env.env_is_wrapped(Monitor)[0] is True
@@ -137,7 +136,7 @@ def test_custom_vec_env(tmp_path):
def test_evaluate_policy():
- model = A2C("MlpPolicy", "Pendulum-v0", seed=0)
+ model = A2C("MlpPolicy", "Pendulum-v1", seed=0)
n_steps_per_episode, n_eval_episodes = 200, 2
model.n_callback_calls = 0
@@ -167,7 +166,7 @@ def dummy_callback(locals_, _globals):
assert len(episode_rewards) == n_eval_episodes
# Test that warning is given about no monitor
- eval_env = gym.make("Pendulum-v0")
+ eval_env = gym.make("Pendulum-v1")
with pytest.warns(UserWarning):
_ = evaluate_policy(model, eval_env, n_eval_episodes)
@@ -356,7 +355,7 @@ def test_zip_strict():
def test_is_wrapped():
"""Test that is_wrapped correctly detects wraps"""
- env = gym.make("Pendulum-v0")
+ env = gym.make("Pendulum-v1")
env = gym.Wrapper(env)
assert not is_wrapped(env, Monitor)
monitor_env = Monitor(env)
@@ -373,11 +372,11 @@ def test_ppo_warnings():
# Only 1 step: advantage normalization will return NaN
with pytest.raises(AssertionError):
- PPO("MlpPolicy", "Pendulum-v0", n_steps=1)
+ PPO("MlpPolicy", "Pendulum-v1", n_steps=1)
# Truncated mini-batch
with pytest.warns(UserWarning):
- PPO("MlpPolicy", "Pendulum-v0", n_steps=6, batch_size=8)
+ PPO("MlpPolicy", "Pendulum-v1", n_steps=6, batch_size=8)
def test_get_system_info():
diff --git a/tests/test_vec_normalize.py b/tests/test_vec_normalize.py
index 0136c2657..c3d1d3065 100644
--- a/tests/test_vec_normalize.py
+++ b/tests/test_vec_normalize.py
@@ -16,7 +16,7 @@
unwrap_vec_normalize,
)
-ENV_ID = "Pendulum-v0"
+ENV_ID = "Pendulum-v1"
class DummyRewardEnv(gym.Env):