Change done field to integers for handling truncation vs termination #2608

Closed · wants to merge 9 commits
11 changes: 8 additions & 3 deletions gym/core.py
@@ -49,6 +49,11 @@ class Env(Generic[ObsType, ActType]):
# Created
_np_random: RandomNumberGenerator | None = None

# Done type enumeration
NOT_DONE = 0  # the episode is not done yet
TERMINATED = DONE = 1  # the episode has terminated; no further step() calls should be made
TRUNCATED = 2  # the episode was truncated by a wrapper or step counting; no further step() calls should be made

@property
def np_random(self) -> RandomNumberGenerator:
"""Initializes the np_random field if not done already."""
@@ -61,7 +66,7 @@ def np_random(self, value: RandomNumberGenerator):
self._np_random = value

@abstractmethod
def step(self, action: ActType) -> Tuple[ObsType, float, bool, dict]:
def step(self, action: ActType) -> Tuple[ObsType, float, int, dict]:
"""Run one timestep of the environment's dynamics. When end of
episode is reached, you are responsible for calling `reset()`
to reset this environment's state.
@@ -74,7 +79,7 @@ def step(self, action: ActType) -> Tuple[ObsType, float, bool, dict]:
Returns:
observation (object): agent's observation of the current environment
reward (float) : amount of reward returned after previous action
done (bool): whether the episode has ended, in which case further step() calls will return undefined results
done (int): NOT_DONE while the episode is ongoing, or TERMINATED/TRUNCATED once it has ended, in which case further step() calls will return undefined results
info (dict): contains auxiliary diagnostic information (helpful for debugging, logging, and sometimes learning)
"""
raise NotImplementedError
@@ -276,7 +281,7 @@ def metadata(self) -> dict:
def metadata(self, value):
self._metadata = value

def step(self, action: ActType) -> Tuple[ObsType, float, bool, dict]:
def step(self, action: ActType) -> Tuple[ObsType, float, int, dict]:
return self.env.step(action)

def reset(self, **kwargs) -> Union[ObsType, tuple[ObsType, dict]]:
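For downstream code, the practical payoff of a three-valued `done` is correct bootstrapping: a value target should stop at a true terminal state but bootstrap past a truncation, because the episode could have continued. Below is a minimal training-loop sketch against the proposed constants; `gamma` and the lambda placeholders stand in for a real agent and are not part of this PR:

import gym

env = gym.make("CartPole-v1")
gamma = 0.99
policy = lambda obs: env.action_space.sample()  # placeholder action selection
value = lambda obs: 0.0                         # placeholder value estimate

obs = env.reset()
for _ in range(1000):
    action = policy(obs)
    next_obs, reward, done, info = env.step(action)
    if done == env.TERMINATED:
        target = reward  # true terminal state: do not bootstrap
    else:
        # NOT_DONE or TRUNCATED: the episode could have continued,
        # so bootstrap from the next observation's value
        target = reward + gamma * value(next_obs)
    # ... feed (obs, action, target) to a learner here ...
    obs = env.reset() if done != env.NOT_DONE else next_obs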
6 changes: 3 additions & 3 deletions gym/envs/box2d/bipedal_walker.py
@@ -527,12 +527,12 @@ def step(self, action):
reward -= 0.00035 * MOTORS_TORQUE * np.clip(np.abs(a), 0, 1)
# normalized to about -50.0 using heuristic, more optimal agent should spend less

done = False
done = self.NOT_DONE
if self.game_over or pos[0] < 0:
reward = -100
done = True
done = self.TERMINATED
if pos[0] > (TERRAIN_LENGTH - TERRAIN_GRASS) * TERRAIN_STEP:
done = True
done = self.TERMINATED
return np.array(state, dtype=np.float32), reward, done, {}

def render(self, mode="human"):
6 changes: 3 additions & 3 deletions gym/envs/box2d/car_racing.py
@@ -421,7 +421,7 @@ def step(self, action):
self.state = self.render("state_pixels")

step_reward = 0
done = False
done = self.NOT_DONE
if action is not None: # First step without action, called from reset()
self.reward -= 0.1
# We actually don't want to count fuel spent, we want the car to be faster.
@@ -430,10 +430,10 @@ def step(self, action):
step_reward = self.reward - self.prev_reward
self.prev_reward = self.reward
if self.tile_visited_count == len(self.track) or self.new_lap:
done = True
done = self.TERMINATED
x, y = self.car.hull.position
if abs(x) > PLAYFIELD or abs(y) > PLAYFIELD:
done = True
done = self.TERMINATED
step_reward = -100

return self.state, step_reward, done, {}
6 changes: 3 additions & 3 deletions gym/envs/box2d/lunar_lander.py
@@ -435,12 +435,12 @@ def step(self, action):
) # less fuel spent is better, about -30 for heuristic landing
reward -= s_power * 0.03

done = False
done = self.NOT_DONE
if self.game_over or abs(state[0]) >= 1.0:
done = True
done = self.TERMINATED
reward = -100
if not self.lander.awake:
done = True
done = self.TERMINATED
reward = +100
return np.array(state, dtype=np.float32), reward, done, {}

7 changes: 4 additions & 3 deletions gym/envs/classic_control/acrobot.py
@@ -208,9 +208,10 @@ def step(self, a):
ns[2] = bound(ns[2], -self.MAX_VEL_1, self.MAX_VEL_1)
ns[3] = bound(ns[3], -self.MAX_VEL_2, self.MAX_VEL_2)
self.state = ns
terminal = self._terminal()
reward = -1.0 if not terminal else 0.0
return (self._get_ob(), reward, terminal, {})
terminated = self._terminal()
done = self.TERMINATED if terminated else self.NOT_DONE
reward = -1.0 if not terminated else 0.0
return (self._get_ob(), reward, done, {})

def _get_ob(self):
s = self.state
5 changes: 3 additions & 2 deletions gym/envs/classic_control/cartpole.py
@@ -145,14 +145,15 @@ def step(self, action):

self.state = (x, x_dot, theta, theta_dot)

done = bool(
terminated = bool(
x < -self.x_threshold
or x > self.x_threshold
or theta < -self.theta_threshold_radians
or theta > self.theta_threshold_radians
)
done = self.TERMINATED if terminated else self.NOT_DONE

if not done:
if not terminated:
reward = 1.0
elif self.steps_beyond_done is None:
# Pole just fell!
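A detail worth noting in this and the other classic-control changes: because NOT_DONE is 0 (falsy), both terminal codes are nonzero (truthy), and TERMINATED aliases the old DONE, legacy truthiness checks keep their behavior. A quick illustration using the values proposed in gym/core.py:

for done in (0, 1, 2):  # NOT_DONE, TERMINATED, TRUNCATED
    print(done, bool(done))
# 0 False, 1 True, 2 True -- so old-style `if done: obs = env.reset()` logic is unaffected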
7 changes: 5 additions & 2 deletions gym/envs/classic_control/continuous_mountain_car.py
@@ -147,10 +147,13 @@ def step(self, action):
velocity = 0

# Convert a possible numpy bool to a Python bool.
done = bool(position >= self.goal_position and velocity >= self.goal_velocity)
terminated = bool(
position >= self.goal_position and velocity >= self.goal_velocity
)
done = self.TERMINATED if terminated else self.NOT_DONE

reward = 0
if done:
if terminated:
reward = 100.0
reward -= math.pow(action[0], 2) * 0.1

5 changes: 4 additions & 1 deletion gym/envs/classic_control/mountain_car.py
@@ -127,7 +127,10 @@ def step(self, action):
if position == self.min_position and velocity < 0:
velocity = 0

done = bool(position >= self.goal_position and velocity >= self.goal_velocity)
terminated = bool(
position >= self.goal_position and velocity >= self.goal_velocity
)
done = self.TERMINATED if terminated else self.NOT_DONE
reward = -1.0

self.state = (position, velocity)
2 changes: 1 addition & 1 deletion gym/envs/classic_control/pendulum.py
@@ -120,7 +120,7 @@ def step(self, u):
newth = th + newthdot * dt

self.state = np.array([newth, newthdot])
return self._get_obs(), -costs, False, {}
return self._get_obs(), -costs, self.NOT_DONE, {}

def reset(
self,
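Pendulum illustrates why TRUNCATED exists at all: its step() now always returns NOT_DONE, so every episode end must come from step counting outside the environment. The TimeLimit wrapper is not shown in this diff, so the following is only a hypothetical sketch of how a step-counting wrapper could report truncation under the new constants:

import gym

class TimeLimitTruncation(gym.Wrapper):
    """Hypothetical wrapper: reports a step-count limit as TRUNCATED."""

    def __init__(self, env, max_episode_steps):
        super().__init__(env)
        self._max_episode_steps = max_episode_steps
        self._elapsed_steps = 0

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        self._elapsed_steps += 1
        # Only truncate if the wrapped env did not already terminate on its own.
        if done == self.NOT_DONE and self._elapsed_steps >= self._max_episode_steps:
            done = self.TRUNCATED
        return obs, reward, done, info

    def reset(self, **kwargs):
        self._elapsed_steps = 0
        return self.env.reset(**kwargs)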
2 changes: 1 addition & 1 deletion gym/envs/mujoco/ant.py
@@ -21,7 +21,7 @@ def step(self, a):
reward = forward_reward - ctrl_cost - contact_cost + survive_reward
state = self.state_vector()
notdone = np.isfinite(state).all() and state[2] >= 0.2 and state[2] <= 1.0
done = not notdone
done = self.NOT_DONE if notdone else self.TERMINATED
ob = self._get_obs()
return (
ob,
2 changes: 1 addition & 1 deletion gym/envs/mujoco/ant_v3.py
@@ -256,7 +256,7 @@ def step(self, action):
costs = ctrl_cost + contact_cost

reward = rewards - costs
done = self.done
done = self.TERMINATED if self.done else self.NOT_DONE
observation = self._get_obs()
info = {
"reward_forward": forward_reward,
2 changes: 1 addition & 1 deletion gym/envs/mujoco/half_cheetah.py
@@ -16,7 +16,7 @@ def step(self, action):
reward_ctrl = -0.1 * np.square(action).sum()
reward_run = (xposafter - xposbefore) / self.dt
reward = reward_ctrl + reward_run
done = False
done = self.NOT_DONE
return ob, reward, done, dict(reward_run=reward_run, reward_ctrl=reward_ctrl)

def _get_obs(self):
2 changes: 1 addition & 1 deletion gym/envs/mujoco/half_cheetah_v3.py
@@ -167,7 +167,7 @@ def step(self, action):

observation = self._get_obs()
reward = forward_reward - ctrl_cost
done = False
done = self.NOT_DONE
info = {
"x_position": x_position_after,
"x_velocity": x_velocity,
3 changes: 2 additions & 1 deletion gym/envs/mujoco/hopper.py
@@ -17,12 +17,13 @@ def step(self, a):
reward += alive_bonus
reward -= 1e-3 * np.square(a).sum()
s = self.state_vector()
done = not (
terminated = not (
np.isfinite(s).all()
and (np.abs(s[2:]) < 100).all()
and (height > 0.7)
and (abs(ang) < 0.2)
)
done = self.TERMINATED if terminated else self.NOT_DONE
ob = self._get_obs()
return ob, reward, done, {}

2 changes: 1 addition & 1 deletion gym/envs/mujoco/hopper_v3.py
@@ -231,7 +231,7 @@ def step(self, action):

observation = self._get_obs()
reward = rewards - costs
done = self.done
done = self.TERMINATED if self.done else self.NOT_DONE
info = {
"x_position": x_position_after,
"x_velocity": x_velocity,
3 changes: 2 additions & 1 deletion gym/envs/mujoco/humanoid.py
@@ -39,7 +39,8 @@ def step(self, a):
quad_impact_cost = min(quad_impact_cost, 10)
reward = lin_vel_cost - quad_ctrl_cost - quad_impact_cost + alive_bonus
qpos = self.sim.data.qpos
done = bool((qpos[2] < 1.0) or (qpos[2] > 2.0))
terminated = bool((qpos[2] < 1.0) or (qpos[2] > 2.0))
done = self.TERMINATED if terminated else self.NOT_DONE
return (
self._get_obs(),
reward,
2 changes: 1 addition & 1 deletion gym/envs/mujoco/humanoid_v3.py
@@ -315,7 +315,7 @@ def step(self, action):

observation = self._get_obs()
reward = rewards - costs
done = self.done
done = self.TERMINATED if self.done else self.NOT_DONE
info = {
"reward_linvel": forward_reward,
"reward_quadctrl": -ctrl_cost,
2 changes: 1 addition & 1 deletion gym/envs/mujoco/humanoidstandup.py
@@ -202,7 +202,7 @@ def step(self, a):
quad_impact_cost = min(quad_impact_cost, 10)
reward = uph_cost - quad_ctrl_cost - quad_impact_cost + 1

done = bool(False)
done = self.NOT_DONE
return (
self._get_obs(),
reward,
3 changes: 2 additions & 1 deletion gym/envs/mujoco/inverted_double_pendulum.py
@@ -122,7 +122,8 @@ def step(self, action):
vel_penalty = 1e-3 * v1 ** 2 + 5e-3 * v2 ** 2
alive_bonus = 10
r = alive_bonus - dist_penalty - vel_penalty
done = bool(y <= 1)
terminated = bool(y <= 1)
done = self.TERMINATED if terminated else self.NOT_DONE
return ob, r, done, {}

def _get_obs(self):
2 changes: 1 addition & 1 deletion gym/envs/mujoco/inverted_pendulum.py
@@ -89,7 +89,7 @@ def step(self, a):
self.do_simulation(a, self.frame_skip)
ob = self._get_obs()
notdone = np.isfinite(ob).all() and (np.abs(ob[1]) <= 0.2)
done = not notdone
done = self.NOT_DONE if notdone else self.TERMINATED
return ob, reward, done, {}

def reset_model(self):
2 changes: 1 addition & 1 deletion gym/envs/mujoco/pusher.py
@@ -143,7 +143,7 @@ def step(self, a):

self.do_simulation(a, self.frame_skip)
ob = self._get_obs()
done = False
done = self.NOT_DONE
return ob, reward, done, dict(reward_dist=reward_dist, reward_ctrl=reward_ctrl)

def viewer_setup(self):
2 changes: 1 addition & 1 deletion gym/envs/mujoco/reacher.py
@@ -127,7 +127,7 @@ def step(self, a):
reward = reward_dist + reward_ctrl
self.do_simulation(a, self.frame_skip)
ob = self._get_obs()
done = False
done = self.NOT_DONE
return ob, reward, done, dict(reward_dist=reward_dist, reward_ctrl=reward_ctrl)

def viewer_setup(self):
2 changes: 1 addition & 1 deletion gym/envs/mujoco/swimmer_v3.py
@@ -161,7 +161,7 @@ def step(self, action):

observation = self._get_obs()
reward = forward_reward - ctrl_cost
done = False
done = self.NOT_DONE
info = {
"reward_fwd": forward_reward,
"reward_ctrl": -ctrl_cost,
3 changes: 2 additions & 1 deletion gym/envs/mujoco/walker2d.py
@@ -16,7 +16,8 @@ def step(self, a):
reward = (posafter - posbefore) / self.dt
reward += alive_bonus
reward -= 1e-3 * np.square(a).sum()
done = not (height > 0.8 and height < 2.0 and ang > -1.0 and ang < 1.0)
terminated = not (height > 0.8 and height < 2.0 and ang > -1.0 and ang < 1.0)
done = self.TERMINATED if terminated else self.NOT_DONE
ob = self._get_obs()
return ob, reward, done, {}

2 changes: 1 addition & 1 deletion gym/envs/mujoco/walker2d_v3.py
@@ -229,7 +229,7 @@ def step(self, action):

observation = self._get_obs()
reward = rewards - costs
done = self.done
done = self.TERMINATED if self.done else self.NOT_DONE
info = {
"x_position": x_position_after,
"x_velocity": x_velocity,
7 changes: 4 additions & 3 deletions gym/envs/toy_text/blackjack.py
@@ -125,13 +125,13 @@ def step(self, action):
if action: # hit: add a card to the player's hand and return
self.player.append(draw_card(self.np_random))
if is_bust(self.player):
done = True
done = self.TERMINATED
reward = -1.0
else:
done = False
done = self.NOT_DONE
reward = 0.0
else: # stick: play out the dealer's hand, and score
done = True
done = self.TERMINATED
while sum_hand(self.dealer) < 17:
self.dealer.append(draw_card(self.np_random))
reward = cmp(score(self.player), score(self.dealer))
@@ -275,4 +275,5 @@ def scale_card_img(card_img):
np.array(pygame.surfarray.pixels3d(self.screen)), axes=(1, 0, 2)
)


# Pixel art from Mariia Khmelnytska (https://www.123rf.com/photo_104453049_stock-vector-pixel-art-playing-cards-standart-deck-vector-set.html)
3 changes: 2 additions & 1 deletion gym/envs/toy_text/cliffwalking.py
@@ -123,7 +123,8 @@ def step(self, a):
p, s, r, d = transitions[i]
self.s = s
self.lastaction = a
return (int(s), r, d, {"prob": p})
done = self.TERMINATED if d else self.NOT_DONE
return (int(s), r, done, {"prob": p})

def reset(
self,
3 changes: 2 additions & 1 deletion gym/envs/toy_text/frozen_lake.py
@@ -216,7 +216,8 @@ def step(self, a):
p, s, r, d = transitions[i]
self.s = s
self.lastaction = a
return (int(s), r, d, {"prob": p})
done = self.TERMINATED if d else self.NOT_DONE
return (int(s), r, done, {"prob": p})

def reset(
self,
3 changes: 2 additions & 1 deletion gym/envs/toy_text/taxi.py
@@ -197,7 +197,8 @@ def step(self, a):
p, s, r, d = transitions[i]
self.s = s
self.lastaction = a
return (int(s), r, d, {"prob": p})
done = self.TERMINATED if d else self.NOT_DONE
return (int(s), r, done, {"prob": p})

def reset(
self,
9 changes: 8 additions & 1 deletion gym/utils/env_checker.py
@@ -214,7 +214,14 @@ def _check_returned_values(
assert isinstance(
reward, (float, int, np.float32)
), "The reward returned by `step()` must be a float"
assert isinstance(done, bool), "The `done` signal must be a boolean"
assert isinstance(done, int) and not isinstance(
done, bool
), "The `done` signal must be an integer"
assert done in (
env.NOT_DONE,
env.TERMINATED,
env.TRUNCATED,
), f"The `done` signal must be in {(env.NOT_DONE, env.TERMINATED, env.TRUNCATED)}"
assert isinstance(
info, dict
), "The `info` returned by `step()` must be a python dictionary"
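With these assertions in place, a plain boolean `done` is rejected outright, which should flush out environments that have not migrated. A sketch of how the stricter check surfaces, assuming `check_env` is the public entry point in this module that calls `_check_returned_values`:

import gym
from gym.utils.env_checker import check_env

env = gym.make("CartPole-v1")
check_env(env)  # passes: step() returns NOT_DONE / TERMINATED / TRUNCATED integers

# An unmigrated env whose step() still returns `done=True` would fail with:
# AssertionError: The `done` signal must be an integer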